rocksdb/file/file_prefetch_buffer.cc

993 lines
36 KiB
C++
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "file/file_prefetch_buffer.h"
#include <algorithm>
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
#include <cassert>
#include "file/random_access_file_reader.h"
#include "monitoring/histogram.h"
#include "monitoring/iostats_context_imp.h"
#include "port/port.h"
#include "test_util/sync_point.h"
#include "util/random.h"
#include "util/rate_limiter_impl.h"
namespace ROCKSDB_NAMESPACE {
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
uint64_t offset,
size_t roundup_len,
bool refit_tail,
uint64_t& aligned_useful_len) {
uint64_t aligned_useful_offset_in_buf = 0;
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
bool copy_data_to_new_buffer = false;
// Check if requested bytes are in the existing buffer_.
// If only a few bytes exist -- reuse them & read only what is really needed.
// This is typically the case of incremental reading of data.
// If no bytes exist in buffer -- full pread.
if (buf->DoesBufferContainData() && buf->IsOffsetInBuffer(offset)) {
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
// Only a few requested bytes are in the buffer. memmove those chunk of
// bytes to the beginning, and memcpy them back into the new buffer if a
// new buffer is created.
aligned_useful_offset_in_buf =
Rounddown(static_cast<size_t>(offset - buf->offset_), alignment);
aligned_useful_len = static_cast<uint64_t>(buf->CurrentSize()) -
aligned_useful_offset_in_buf;
assert(aligned_useful_offset_in_buf % alignment == 0);
assert(aligned_useful_len % alignment == 0);
assert(aligned_useful_offset_in_buf + aligned_useful_len <=
buf->offset_ + buf->CurrentSize());
if (aligned_useful_len > 0) {
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
copy_data_to_new_buffer = true;
} else {
// this reset is not necessary, but just to be safe.
aligned_useful_offset_in_buf = 0;
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
}
}
// Create a new buffer only if current capacity is not sufficient, and memcopy
// bytes from old buffer if needed (i.e., if aligned_useful_len is greater
// than 0).
if (buf->buffer_.Capacity() < roundup_len) {
buf->buffer_.Alignment(alignment);
buf->buffer_.AllocateNewBuffer(
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
static_cast<size_t>(roundup_len), copy_data_to_new_buffer,
aligned_useful_offset_in_buf, static_cast<size_t>(aligned_useful_len));
} else if (aligned_useful_len > 0 && refit_tail) {
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
// New buffer not needed. But memmove bytes from tail to the beginning since
// aligned_useful_len is greater than 0.
buf->buffer_.RefitTail(static_cast<size_t>(aligned_useful_offset_in_buf),
static_cast<size_t>(aligned_useful_len));
} else if (aligned_useful_len > 0) {
// For async prefetching, it doesn't call RefitTail with aligned_useful_len
// > 0. Allocate new buffer if needed because aligned buffer calculate
// remaining buffer as capacity - cursize which might not be the case in
// this as it's not refitting.
// TODO: Use refit_tail for async prefetching too.
buf->buffer_.Alignment(alignment);
buf->buffer_.AllocateNewBuffer(
Fix some errors in async prefetching in FilePrefetchBuffer (#9734) Summary: In ReadOption `async_io` which prefetches the data asynchronously, db_bench and db_stress runs were failing because wrong data was prefetched which resulted in Error: Checksum mismatched. Wrong data was copied because capacity was less than actual size needed. It has been fixed in this PR. Since there are two separate methods for async and sync prefetching, these changes are in async prefetching methods and any changes would not effect normal prefetching. I ran the regressions to make sure normal prefetching is fine. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9734 Test Plan: 1. CircleCI jobs 2. Ran db_bench ``` . /db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 ``` 3. Ran db_stress test ``` export CRASH_TEST_EXT_ARGS=" --async_io=1 --adaptive_readahead=1" make crash_test -j ``` 4. Run regressions for async_io disabled. Old flow without any async changes: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` With async prefetching changes and async_io disabled to make sure in normal prefetching there is no regression. ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 --async_io=0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.1 Date: Wed Mar 23 15:56:37 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 481819.816 micros/op 2 ops/sec; 340.2 MB/s (250 of 250 found) ``` Reviewed By: riversand963 Differential Revision: D35058471 Pulled By: akankshamahajan15 fbshipit-source-id: 9233a1e6d97cea0c7a8111bfb9e8ac3251c341ce
2022-03-26 01:26:22 +00:00
static_cast<size_t>(roundup_len), copy_data_to_new_buffer,
aligned_useful_offset_in_buf, static_cast<size_t>(aligned_useful_len));
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
}
}
Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts,
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
RandomAccessFileReader* reader,
uint64_t read_len, uint64_t aligned_useful_len,
uint64_t start_offset) {
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
Slice result;
char* to_buf = buf->buffer_.BufferStart() + aligned_useful_len;
Status s = reader->Read(opts, start_offset + aligned_useful_len, read_len,
&result, to_buf, /*aligned_buf=*/nullptr);
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
#ifndef NDEBUG
if (result.size() < read_len) {
// Fake an IO error to force db_stress fault injection to ignore
// truncated read errors
IGNORE_STATUS_IF_ERROR(Status::IOError());
}
#endif
if (!s.ok()) {
return s;
}
Fix and defend against FilePrefetchBuffer combined with mmap reads (#12206) Summary: FilePrefetchBuffer makes an unchecked assumption about the behavior of RandomAccessFileReader::Read: that it will write to the provided buffer rather than returning the data in an alternate buffer. FilePrefetchBuffer has been quietly incompatible with mmap reads (e.g. allow_mmap_reads / use_mmap_reads) because in that case an alternate buffer is returned (mmapped memory). This incompatibility currently leads to quiet data corruption, as seen in amplified crash test failure in https://github.com/facebook/rocksdb/issues/12200. In this change, * Check whether RandomAccessFileReader::Read has the expected behavior, and fail if not. (Assertion failure in debug build, return Corruption in release build.) This will detect future regressions synchronously and precisely, rather than relying on debugging downstream data corruption. * Why not recover? My understanding is that FilePrefetchBuffer is not intended for use when RandomAccessFileReader::Read uses an alternate buffer, so quietly recovering could lead to undesirable (inefficient) behavior. * Mention incompatibility with mmap-based readers in the internal API comments for FilePrefetchBuffer * Fix two cases where FilePrefetchBuffer could be used with mmap, both stemming from SstFileDumper, though one fix is in BlockBasedTableReader. There is currently no way to ask a RandomAccessFileReader whether it's using mmap, so we currently have to rely on other options as clues. Keeping separate from https://github.com/facebook/rocksdb/issues/12200 in part because this change is more appropriate for backport than that one. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12206 Test Plan: * Manually verified that the new check aids in debugging. * Unit test added, that fails if either fix is missed. * Ran blackbox_crash_test for hours, with and without https://github.com/facebook/rocksdb/issues/12200 Reviewed By: akankshamahajan15 Differential Revision: D52551701 Pulled By: pdillinger fbshipit-source-id: dea87c5782b7c484a6c6e424585c8832dfc580dc
2024-01-05 02:39:05 +00:00
if (result.data() != to_buf) {
// If the read is coming from some other buffer already in memory (such as
// mmap) then it would be inefficient to create another copy in this
// FilePrefetchBuffer. The caller is expected to exclude this case.
assert(false);
return Status::Corruption("File read didn't populate our buffer");
}
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
RecordTick(stats_, PREFETCH_BYTES, read_len);
}
// Update the buffer size.
buf->buffer_.Size(static_cast<size_t>(aligned_useful_len) + result.size());
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
return s;
}
Status FilePrefetchBuffer::ReadAsync(BufferInfo* buf, const IOOptions& opts,
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
RandomAccessFileReader* reader,
uint64_t read_len, uint64_t start_offset) {
TEST_SYNC_POINT("FilePrefetchBuffer::ReadAsync");
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
// callback for async read request.
auto fp = std::bind(&FilePrefetchBuffer::PrefetchAsyncCallback, this,
std::placeholders::_1, std::placeholders::_2);
FSReadRequest req;
Slice result;
req.len = read_len;
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
req.offset = start_offset;
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
req.result = result;
req.scratch = buf->buffer_.BufferStart();
buf->async_req_len_ = req.len;
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
Status s = reader->ReadAsync(req, opts, fp, buf, &(buf->io_handle_),
&(buf->del_fn_), /*aligned_buf =*/nullptr);
req.status.PermitUncheckedError();
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
if (s.ok()) {
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
RecordTick(stats_, PREFETCH_BYTES, read_len);
buf->async_read_in_progress_ = true;
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
}
return s;
}
Status FilePrefetchBuffer::Prefetch(const IOOptions& opts,
RandomAccessFileReader* reader,
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
uint64_t offset, size_t n) {
if (!enable_ || reader == nullptr) {
return Status::OK();
}
assert(num_buffers_ == 1);
AllocateBufferIfEmpty();
BufferInfo* buf = GetFirstBuffer();
TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start");
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
if (offset + n <= buf->offset_ + buf->CurrentSize()) {
// All requested bytes are already in the buffer. So no need to Read again.
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
return Status::OK();
}
size_t alignment = reader->file()->GetRequiredBufferAlignment();
uint64_t rounddown_offset = offset, roundup_end = 0, aligned_useful_len = 0;
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
size_t read_len = 0;
ReadAheadSizeTuning(buf, /*read_curr_block=*/true,
/*refit_tail=*/true, rounddown_offset, alignment, 0, n,
rounddown_offset, roundup_end, read_len,
aligned_useful_len);
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
Status s;
if (read_len > 0) {
s = Read(buf, opts, reader, read_len, aligned_useful_len, rounddown_offset);
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
}
Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265) Summary: **Context/Summary:** We are adding new stats to measure behavior of prefetched tail size and look up into this buffer The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265 Test Plan: **- Piggyback on existing test** **- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.** ``` ./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true ``` ``` rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600 rocksdb.table.open.prefetch.tail.miss COUNT : 91 rocksdb.table.open.prefetch.tail.hit COUNT : 1034 ``` **- No perf regression observed in db_bench** SETUP command: create same db with ~900 files for pre-change/post-change. ``` ./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none ``` TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing. ``` ./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true ``` async io = 0, direct io read = true | seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs -- | -- | -- | -- pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec async io = 1, direct io read = true | seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs -- | -- | -- | -- pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec Reviewed By: ajkr Differential Revision: D43781467 Pulled By: hx235 fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
2023-03-15 21:02:43 +00:00
if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) {
RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, read_len);
}
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
return s;
}
// Copy data from src to overlap_buf_.
void FilePrefetchBuffer::CopyDataToBuffer(BufferInfo* src, uint64_t& offset,
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
size_t& length) {
if (length == 0) {
return;
}
uint64_t copy_offset = (offset - src->offset_);
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
size_t copy_len = 0;
if (src->IsDataBlockInBuffer(offset, length)) {
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
// All the bytes are in src.
copy_len = length;
} else {
copy_len = src->CurrentSize() - copy_offset;
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
}
BufferInfo* dst = overlap_buf_;
memcpy(dst->buffer_.BufferStart() + dst->CurrentSize(),
src->buffer_.BufferStart() + copy_offset, copy_len);
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
dst->buffer_.Size(dst->CurrentSize() + copy_len);
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
// Update offset and length.
offset += copy_len;
length -= copy_len;
// length > 0 indicates it has consumed all data from the src buffer and it
// still needs to read more other buffer.
if (length > 0) {
FreeFrontBuffer();
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
}
}
// Clear the buffers if it contains outdated data. Outdated data can be because
// previous sequential reads were read from the cache instead of these buffer.
// In that case outdated IOs should be aborted.
void FilePrefetchBuffer::AbortOutdatedIO(uint64_t offset) {
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
std::vector<void*> handles;
std::vector<BufferInfo*> tmp_buf;
for (auto& buf : bufs_) {
if (buf->IsBufferOutdatedWithAsyncProgress(offset)) {
handles.emplace_back(buf->io_handle_);
tmp_buf.emplace_back(buf);
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
if (!handles.empty()) {
StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
Status s = fs_->AbortIO(handles);
assert(s.ok());
}
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
for (auto& buf : tmp_buf) {
if (buf->async_read_in_progress_) {
DestroyAndClearIOHandle(buf);
buf->async_read_in_progress_ = false;
}
buf->ClearBuffer();
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
}
void FilePrefetchBuffer::AbortAllIOs() {
std::vector<void*> handles;
for (auto& buf : bufs_) {
if (buf->async_read_in_progress_ && buf->io_handle_ != nullptr) {
handles.emplace_back(buf->io_handle_);
Fix some errors in async prefetching in FilePrefetchBuffer (#9734) Summary: In ReadOption `async_io` which prefetches the data asynchronously, db_bench and db_stress runs were failing because wrong data was prefetched which resulted in Error: Checksum mismatched. Wrong data was copied because capacity was less than actual size needed. It has been fixed in this PR. Since there are two separate methods for async and sync prefetching, these changes are in async prefetching methods and any changes would not effect normal prefetching. I ran the regressions to make sure normal prefetching is fine. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9734 Test Plan: 1. CircleCI jobs 2. Ran db_bench ``` . /db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 ``` 3. Ran db_stress test ``` export CRASH_TEST_EXT_ARGS=" --async_io=1 --adaptive_readahead=1" make crash_test -j ``` 4. Run regressions for async_io disabled. Old flow without any async changes: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` With async prefetching changes and async_io disabled to make sure in normal prefetching there is no regression. ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 --async_io=0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.1 Date: Wed Mar 23 15:56:37 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 481819.816 micros/op 2 ops/sec; 340.2 MB/s (250 of 250 found) ``` Reviewed By: riversand963 Differential Revision: D35058471 Pulled By: akankshamahajan15 fbshipit-source-id: 9233a1e6d97cea0c7a8111bfb9e8ac3251c341ce
2022-03-26 01:26:22 +00:00
}
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
if (!handles.empty()) {
StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
Status s = fs_->AbortIO(handles);
assert(s.ok());
}
Fix some errors in async prefetching in FilePrefetchBuffer (#9734) Summary: In ReadOption `async_io` which prefetches the data asynchronously, db_bench and db_stress runs were failing because wrong data was prefetched which resulted in Error: Checksum mismatched. Wrong data was copied because capacity was less than actual size needed. It has been fixed in this PR. Since there are two separate methods for async and sync prefetching, these changes are in async prefetching methods and any changes would not effect normal prefetching. I ran the regressions to make sure normal prefetching is fine. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9734 Test Plan: 1. CircleCI jobs 2. Ran db_bench ``` . /db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 ``` 3. Ran db_stress test ``` export CRASH_TEST_EXT_ARGS=" --async_io=1 --adaptive_readahead=1" make crash_test -j ``` 4. Run regressions for async_io disabled. Old flow without any async changes: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` With async prefetching changes and async_io disabled to make sure in normal prefetching there is no regression. ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 --async_io=0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.1 Date: Wed Mar 23 15:56:37 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 481819.816 micros/op 2 ops/sec; 340.2 MB/s (250 of 250 found) ``` Reviewed By: riversand963 Differential Revision: D35058471 Pulled By: akankshamahajan15 fbshipit-source-id: 9233a1e6d97cea0c7a8111bfb9e8ac3251c341ce
2022-03-26 01:26:22 +00:00
for (auto& buf : bufs_) {
if (buf->io_handle_ != nullptr && buf->del_fn_ != nullptr) {
DestroyAndClearIOHandle(buf);
}
buf->async_read_in_progress_ = false;
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
}
// Clear the buffers if it contains outdated data wrt offset. Outdated data can
// be because previous sequential reads were read from the cache instead of
// these buffer or there is IOError while filling the buffers.
//
// offset - the offset requested to be read. This API makes sure that the
// front/first buffer in bufs_ should contain this offset, otherwise, all
// buffers will be freed.
void FilePrefetchBuffer::ClearOutdatedData(uint64_t offset, size_t length) {
while (!IsBufferQueueEmpty()) {
BufferInfo* buf = GetFirstBuffer();
// Offset is greater than this buffer's end offset.
if (buf->IsBufferOutdated(offset)) {
FreeFrontBuffer();
} else {
break;
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) {
return;
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
BufferInfo* buf = GetFirstBuffer();
if (buf->async_read_in_progress_) {
FreeEmptyBuffers();
return;
}
// Below handles the case for Overlapping buffers (NumBuffersAllocated > 1).
bool abort_io = false;
if (buf->DoesBufferContainData() && buf->IsOffsetInBuffer(offset)) {
BufferInfo* next_buf = bufs_[1];
if (/* next buffer doesn't align with first buffer and requested data
overlaps with next buffer */
((buf->offset_ + buf->CurrentSize() != next_buf->offset_) &&
(offset + length > buf->offset_ + buf->CurrentSize()))) {
abort_io = true;
}
} else {
// buffer with offset doesn't contain data or offset doesn't lie in this
// buffer.
buf->ClearBuffer();
abort_io = true;
}
if (abort_io) {
AbortAllIOs();
// Clear all buffers after first.
for (size_t i = 1; i < bufs_.size(); ++i) {
bufs_[i]->ClearBuffer();
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
}
FreeEmptyBuffers();
assert(IsBufferQueueEmpty() || buf->IsOffsetInBuffer(offset));
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
}
void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
BufferInfo* buf = GetFirstBuffer();
if (buf->async_read_in_progress_ && fs_ != nullptr) {
if (buf->io_handle_ != nullptr) {
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
// Wait for prefetch data to complete.
// No mutex is needed as async_read_in_progress behaves as mutex and is
// updated by main thread only.
std::vector<void*> handles;
handles.emplace_back(buf->io_handle_);
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
fs_->Poll(handles, 1).PermitUncheckedError();
}
// Reset and Release io_handle after the Poll API as request has been
// completed.
DestroyAndClearIOHandle(buf);
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
// Always call outdated data after Poll as Buffers might be out of sync w.r.t
// offset and length.
ClearOutdatedData(offset, length);
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
}
// ReadAheadSizeTuning API calls readaheadsize_cb_
// (BlockBasedTableIterator::BlockCacheLookupForReadAheadSize) to lookup in the
// cache and tune the start and end offsets based on cache hits/misses.
//
// Arguments -
// read_curr_block : True if this call was due to miss in the cache and
// FilePrefetchBuffer wants to read that block
// synchronously.
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
// False if current call is to prefetch additional data in
// extra buffers through ReadAsync API.
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
// prev_buf_end_offset : End offset of the previous buffer. It's used in case
// of ReadAsync to make sure it doesn't read anything from
// previous buffer which is already prefetched.
void FilePrefetchBuffer::ReadAheadSizeTuning(
BufferInfo* buf, bool read_curr_block, bool refit_tail,
uint64_t prev_buf_end_offset, size_t alignment, size_t length,
size_t readahead_size, uint64_t& start_offset, uint64_t& end_offset,
size_t& read_len, uint64_t& aligned_useful_len) {
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
uint64_t updated_start_offset = Rounddown(start_offset, alignment);
uint64_t updated_end_offset =
Roundup(start_offset + length + readahead_size, alignment);
uint64_t initial_end_offset = updated_end_offset;
uint64_t initial_start_offset = updated_start_offset;
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
// Callback to tune the start and end offsets.
if (readaheadsize_cb_ != nullptr && readahead_size > 0) {
readaheadsize_cb_(read_curr_block, updated_start_offset,
updated_end_offset);
}
// read_len will be 0 and there is nothing to read/prefetch.
if (updated_start_offset == updated_end_offset) {
start_offset = end_offset = updated_start_offset;
UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset),
(updated_end_offset - updated_start_offset));
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
return;
}
assert(updated_start_offset < updated_end_offset);
if (!read_curr_block) {
// Handle the case when callback added block handles which are already
// prefetched and nothing new needs to be prefetched. In that case end
// offset updated by callback will be less than prev_buf_end_offset which
// means data has been already prefetched.
if (updated_end_offset <= prev_buf_end_offset) {
start_offset = end_offset = prev_buf_end_offset;
UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset),
(end_offset - start_offset));
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
return;
}
}
// Realign if start and end offsets are not aligned after tuning.
start_offset = Rounddown(updated_start_offset, alignment);
end_offset = Roundup(updated_end_offset, alignment);
if (!read_curr_block && start_offset < prev_buf_end_offset) {
// Previous buffer already contains the data till prev_buf_end_offset
// because of alignment. Update the start offset after that to avoid
// prefetching it again.
start_offset = prev_buf_end_offset;
}
uint64_t roundup_len = end_offset - start_offset;
PrepareBufferForRead(buf, alignment, start_offset, roundup_len, refit_tail,
aligned_useful_len);
assert(roundup_len >= aligned_useful_len);
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
// Update the buffer offset.
buf->offset_ = start_offset;
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
// Update the initial end offset of this buffer which will be the starting
// offset of next prefetch.
buf->initial_end_offset_ = initial_end_offset;
read_len = static_cast<size_t>(roundup_len - aligned_useful_len);
UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset),
(end_offset - start_offset));
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
// If data is overlapping between two buffers then during this call:
// - data from first buffer is copied into overlapping buffer,
// - first is removed from bufs_ and freed so that it can be used for async
// prefetching of further data.
Status FilePrefetchBuffer::HandleOverlappingData(
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
size_t length, size_t readahead_size, bool& copy_to_overlap_buffer,
uint64_t& tmp_offset, size_t& tmp_length) {
// No Overlapping of data between 2 buffers.
if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) {
return Status::OK();
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
Status s;
size_t alignment = reader->file()->GetRequiredBufferAlignment();
BufferInfo* buf = GetFirstBuffer();
// Check if the first buffer has the required offset and the async read is
// still in progress. This should only happen if a prefetch was initiated
// by Seek, but the next access is at another offset.
if (buf->async_read_in_progress_ &&
buf->IsOffsetInBufferWithAsyncProgress(offset)) {
PollIfNeeded(offset, length);
}
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) {
return Status::OK();
}
BufferInfo* next_buf = bufs_[1];
// If data is overlapping over two buffers, copy the data from front and
// call ReadAsync on freed buffer.
if (!buf->async_read_in_progress_ && buf->DoesBufferContainData() &&
buf->IsOffsetInBuffer(offset) &&
(/*Data extends over two buffers and second buffer either has data or in
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
process of population=*/
(offset + length > next_buf->offset_) &&
(next_buf->async_read_in_progress_ ||
next_buf->DoesBufferContainData()))) {
// Allocate new buffer to overlap_buf_.
overlap_buf_->ClearBuffer();
overlap_buf_->buffer_.Alignment(alignment);
overlap_buf_->buffer_.AllocateNewBuffer(length);
overlap_buf_->offset_ = offset;
copy_to_overlap_buffer = true;
CopyDataToBuffer(buf, tmp_offset, tmp_length);
UpdateStats(/*found_in_buffer=*/false, overlap_buf_->CurrentSize());
// Call async prefetching on freed buffer since data has been consumed
// only if requested data lies within next buffer.
size_t second_size = next_buf->async_read_in_progress_
? next_buf->async_req_len_
: next_buf->CurrentSize();
uint64_t start_offset = next_buf->initial_end_offset_;
// If requested bytes - tmp_offset + tmp_length are in next buffer, freed
// buffer can go for further prefetching.
// If requested bytes are not in next buffer, next buffer has to go for sync
// call to get remaining requested bytes. In that case it shouldn't go for
// async prefetching as async prefetching calculates offset based on
// previous buffer end offset and previous buffer has to go for sync
// prefetching.
if (tmp_offset + tmp_length <= next_buf->offset_ + second_size) {
AllocateBuffer();
BufferInfo* new_buf = GetLastBuffer();
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
size_t read_len = 0;
uint64_t end_offset = start_offset, aligned_useful_len = 0;
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
ReadAheadSizeTuning(new_buf, /*read_curr_block=*/false,
/*refit_tail=*/false, next_buf->offset_ + second_size,
alignment,
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
/*length=*/0, readahead_size, start_offset,
end_offset, read_len, aligned_useful_len);
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
if (read_len > 0) {
s = ReadAsync(new_buf, opts, reader, read_len, start_offset);
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
if (!s.ok()) {
DestroyAndClearIOHandle(new_buf);
FreeLastBuffer();
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
return s;
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
}
}
return s;
}
// When data is outdated, we clear the first buffer and free it as the
// data has been consumed because of sequential reads.
//
// Scenarios for prefetching asynchronously:
// Case1: If all buffers are in free_bufs_, prefetch n + readahead_size_/2 bytes
// synchronously in first buffer and prefetch readahead_size_/2 async in
// remaining buffers (num_buffers_ -1 ).
// Case2: If first buffer has partial data, prefetch readahead_size_/2 async in
// remaining buffers. In case of partial data, prefetch remaining bytes
// from size n synchronously to fulfill the requested bytes request.
// Case5: (Special case) If data is overlapping in two buffers, copy requested
// data from first, free that buffer to send for async request, wait for
// poll to fill next buffer (if any), and copy remaining data from that
// buffer to overlap buffer.
Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
RandomAccessFileReader* reader,
uint64_t offset, size_t length,
size_t readahead_size,
bool& copy_to_overlap_buffer) {
if (!enable_) {
return Status::OK();
}
TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start");
size_t alignment = reader->file()->GetRequiredBufferAlignment();
Status s;
uint64_t tmp_offset = offset;
size_t tmp_length = length;
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
size_t original_length = length;
// Abort outdated IO.
if (!explicit_prefetch_submitted_) {
AbortOutdatedIO(offset);
FreeEmptyBuffers();
}
ClearOutdatedData(offset, length);
// Handle overlapping data over two buffers.
s = HandleOverlappingData(opts, reader, offset, length, readahead_size,
copy_to_overlap_buffer, tmp_offset, tmp_length);
if (!s.ok()) {
return s;
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
AllocateBufferIfEmpty();
BufferInfo* buf = GetFirstBuffer();
// Call Poll only if data is needed for the second buffer.
// - Return if whole data is in first and second buffer is in progress or
// already full.
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
// - If second buffer is empty, it will go for ReadAsync for second buffer.
if (!buf->async_read_in_progress_ && buf->DoesBufferContainData() &&
buf->IsDataBlockInBuffer(offset, length)) {
// Whole data is in buffer.
if (!IsEligibleForFurtherPrefetching()) {
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
UpdateStats(/*found_in_buffer=*/true, original_length);
return s;
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
} else {
PollIfNeeded(tmp_offset, tmp_length);
}
AllocateBufferIfEmpty();
buf = GetFirstBuffer();
offset = tmp_offset;
length = tmp_length;
// After polling, if all the requested bytes are in first buffer, it will only
// go for async prefetching.
if (buf->DoesBufferContainData()) {
if (copy_to_overlap_buffer) {
// Data is overlapping i.e. some of the data has been copied to overlap
// buffer and remaining will be updated below.
size_t initial_buf_size = overlap_buf_->CurrentSize();
CopyDataToBuffer(buf, offset, length);
UpdateStats(
/*found_in_buffer=*/false,
overlap_buf_->CurrentSize() - initial_buf_size);
// Length == 0: All the requested data has been copied to overlap buffer
// and it has already gone for async prefetching. It can return without
// doing anything further.
// Length > 0: More data needs to be consumed so it will continue async
// and sync prefetching and copy the remaining data to overlap buffer in
// the end.
if (length == 0) {
UpdateStats(/*found_in_buffer=*/true, length);
return s;
}
} else {
if (buf->IsDataBlockInBuffer(offset, length)) {
offset += length;
length = 0;
// Since async request was submitted directly by calling PrefetchAsync
// in last call, we don't need to prefetch further as this call is to
// poll the data submitted in previous call.
if (explicit_prefetch_submitted_) {
return s;
}
if (!IsEligibleForFurtherPrefetching()) {
UpdateStats(/*found_in_buffer=*/true, original_length);
return s;
}
}
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
AllocateBufferIfEmpty();
buf = GetFirstBuffer();
assert(!buf->async_read_in_progress_);
// Go for ReadAsync and Read (if needed).
// offset and size alignment for first buffer with synchronous prefetching
uint64_t start_offset1 = offset, end_offset1 = 0, aligned_useful_len1 = 0;
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
size_t read_len1 = 0;
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
// For length == 0, skip the synchronous prefetching. read_len1 will be 0.
if (length > 0) {
if (buf->IsOffsetInBuffer(offset)) {
UpdateStats(/*found_in_buffer=*/false,
(buf->offset_ + buf->CurrentSize() - offset));
}
ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail*/
true, start_offset1, alignment, length, readahead_size,
start_offset1, end_offset1, read_len1,
aligned_useful_len1);
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
} else {
UpdateStats(/*found_in_buffer=*/true, original_length);
}
// Prefetch in remaining buffer only if readahead_size > 0.
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
if (readahead_size > 0) {
s = PrefetchRemBuffers(opts, reader, end_offset1, alignment,
readahead_size);
if (!s.ok()) {
return s;
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
}
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
if (read_len1 > 0) {
s = Read(buf, opts, reader, read_len1, aligned_useful_len1, start_offset1);
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
if (!s.ok()) {
AbortAllIOs();
FreeAllBuffers();
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
return s;
}
}
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
// Copy remaining requested bytes to overlap_buffer. No need to update stats
// as data is prefetched during this call.
if (copy_to_overlap_buffer && length > 0) {
CopyDataToBuffer(buf, offset, length);
}
return s;
}
bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts,
RandomAccessFileReader* reader,
uint64_t offset, size_t n,
Slice* result, Status* status,
bool for_compaction) {
Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265) Summary: **Context/Summary:** We are adding new stats to measure behavior of prefetched tail size and look up into this buffer The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265 Test Plan: **- Piggyback on existing test** **- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.** ``` ./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true ``` ``` rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600 rocksdb.table.open.prefetch.tail.miss COUNT : 91 rocksdb.table.open.prefetch.tail.hit COUNT : 1034 ``` **- No perf regression observed in db_bench** SETUP command: create same db with ~900 files for pre-change/post-change. ``` ./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none ``` TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing. ``` ./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true ``` async io = 0, direct io read = true | seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs -- | -- | -- | -- pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec async io = 1, direct io read = true | seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs -- | -- | -- | -- pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec Reviewed By: ajkr Differential Revision: D43781467 Pulled By: hx235 fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
2023-03-15 21:02:43 +00:00
bool ret = TryReadFromCacheUntracked(opts, reader, offset, n, result, status,
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
for_compaction);
Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265) Summary: **Context/Summary:** We are adding new stats to measure behavior of prefetched tail size and look up into this buffer The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265 Test Plan: **- Piggyback on existing test** **- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.** ``` ./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true ``` ``` rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600 rocksdb.table.open.prefetch.tail.miss COUNT : 91 rocksdb.table.open.prefetch.tail.hit COUNT : 1034 ``` **- No perf regression observed in db_bench** SETUP command: create same db with ~900 files for pre-change/post-change. ``` ./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none ``` TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing. ``` ./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true ``` async io = 0, direct io read = true | seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs -- | -- | -- | -- pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec async io = 1, direct io read = true | seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs -- | -- | -- | -- pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec Reviewed By: ajkr Differential Revision: D43781467 Pulled By: hx235 fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
2023-03-15 21:02:43 +00:00
if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) {
if (ret) {
RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT);
} else {
RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS);
}
}
return ret;
}
bool FilePrefetchBuffer::TryReadFromCacheUntracked(
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
size_t n, Slice* result, Status* status, bool for_compaction) {
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
if (track_min_offset_ && offset < min_offset_read_) {
min_offset_read_ = static_cast<size_t>(offset);
}
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
Fix stress test failure "Corruption: checksum mismatch" or "Iterator Diverged" with async_io enabled (#10032) Summary: In case of non sequential reads with `async_io`, `FilePRefetchBuffer::TryReadFromCacheAsync` can be called for previous blocks with `offset < bufs_[curr_].offset_` which wasn't handled correctly resulting wrong data being returned from buffer. Since `FilePRefetchBuffer::PrefetchAsync` can be called for any data block, it sets `prev_len_` to 0 indicating `FilePRefetchBuffer::TryReadFromCacheAsync` to go for the prefetching even though offset < bufs_[curr_].offset_ This is because async prefetching is always done in second buffer (to avoid mutex) even though curr_ is empty leading to offset < bufs_[curr_].offset_ in some cases. If prev_len_ is non zero then `TryReadFromCacheAsync` returns false if `offset < bufs_[curr_].offset_ && prev_len != 0` indicating reads are not sequential and previous call wasn't PrefetchAsync. - This PR also simplifies `FilePRefetchBuffer::TryReadFromCacheAsync` as it was getting complicated covering different scenarios based on `async_io` enabled/disabled. If `for_compaction` is set true, it now calls `FilePRefetchBufferTryReadFromCache` following synchronous flow as before. Its decided in BlockFetcher.cc Pull Request resolved: https://github.com/facebook/rocksdb/pull/10032 Test Plan: 1. export CRASH_TEST_EXT_ARGS=" --async_io=1" make crash_test -j completed successfully locally 2. make crash_test -j completed successfully locally 3. Reran CircleCi mini crashtest job 4 - 5 times. 4. Updated prefetch_test for more coverage. Reviewed By: anand1976 Differential Revision: D36579858 Pulled By: akankshamahajan15 fbshipit-source-id: 0c428d62b45e12e082a83acf533a5e37a584bedf
2022-05-23 19:15:26 +00:00
if (!enable_) {
return false;
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
if (explicit_prefetch_submitted_) {
// explicit_prefetch_submitted_ is special case where it expects request
// submitted in PrefetchAsync should match with this request. Otherwise
// buffers will be outdated.
// Random offset called. So abort the IOs.
if (prev_offset_ != offset) {
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
AbortAllIOs();
FreeAllBuffers();
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
explicit_prefetch_submitted_ = false;
return false;
}
}
AllocateBufferIfEmpty();
BufferInfo* buf = GetFirstBuffer();
if (!explicit_prefetch_submitted_ && offset < buf->offset_) {
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
return false;
}
bool prefetched = false;
bool copy_to_overlap_buffer = false;
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
// If the buffer contains only a few of the requested bytes:
// If readahead is enabled: prefetch the remaining bytes + readahead
// bytes
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
// and satisfy the request.
// If readahead is not enabled: return false.
TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache",
&readahead_size_);
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
if (explicit_prefetch_submitted_ ||
(buf->async_read_in_progress_ ||
offset + n > buf->offset_ + buf->CurrentSize())) {
// In case readahead_size is trimmed (=0), we still want to poll the data
// submitted with explicit_prefetch_submitted_=true.
if (readahead_size_ > 0 || explicit_prefetch_submitted_) {
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
Status s;
assert(reader != nullptr);
assert(max_readahead_size_ >= readahead_size_);
Fix stress test failure "Corruption: checksum mismatch" or "Iterator Diverged" with async_io enabled (#10032) Summary: In case of non sequential reads with `async_io`, `FilePRefetchBuffer::TryReadFromCacheAsync` can be called for previous blocks with `offset < bufs_[curr_].offset_` which wasn't handled correctly resulting wrong data being returned from buffer. Since `FilePRefetchBuffer::PrefetchAsync` can be called for any data block, it sets `prev_len_` to 0 indicating `FilePRefetchBuffer::TryReadFromCacheAsync` to go for the prefetching even though offset < bufs_[curr_].offset_ This is because async prefetching is always done in second buffer (to avoid mutex) even though curr_ is empty leading to offset < bufs_[curr_].offset_ in some cases. If prev_len_ is non zero then `TryReadFromCacheAsync` returns false if `offset < bufs_[curr_].offset_ && prev_len != 0` indicating reads are not sequential and previous call wasn't PrefetchAsync. - This PR also simplifies `FilePRefetchBuffer::TryReadFromCacheAsync` as it was getting complicated covering different scenarios based on `async_io` enabled/disabled. If `for_compaction` is set true, it now calls `FilePRefetchBufferTryReadFromCache` following synchronous flow as before. Its decided in BlockFetcher.cc Pull Request resolved: https://github.com/facebook/rocksdb/pull/10032 Test Plan: 1. export CRASH_TEST_EXT_ARGS=" --async_io=1" make crash_test -j completed successfully locally 2. make crash_test -j completed successfully locally 3. Reran CircleCi mini crashtest job 4 - 5 times. 4. Updated prefetch_test for more coverage. Reviewed By: anand1976 Differential Revision: D36579858 Pulled By: akankshamahajan15 fbshipit-source-id: 0c428d62b45e12e082a83acf533a5e37a584bedf
2022-05-23 19:15:26 +00:00
if (for_compaction) {
s = Prefetch(opts, reader, offset, std::max(n, readahead_size_));
} else {
if (implicit_auto_readahead_) {
if (!IsEligibleForPrefetch(offset, n)) {
// Ignore status as Prefetch is not called.
s.PermitUncheckedError();
return false;
}
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
}
// Prefetch n + readahead_size_/2 synchronously as remaining
// readahead_size_/2 will be prefetched asynchronously if num_buffers_
// > 1.
s = PrefetchInternal(
opts, reader, offset, n,
(num_buffers_ > 1 ? readahead_size_ / 2 : readahead_size_),
copy_to_overlap_buffer);
explicit_prefetch_submitted_ = false;
}
if (!s.ok()) {
if (status) {
*status = s;
}
#ifndef NDEBUG
IGNORE_STATUS_IF_ERROR(s);
#endif
return false;
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
prefetched = explicit_prefetch_submitted_ ? false : true;
} else {
return false;
}
} else if (!for_compaction) {
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
UpdateStats(/*found_in_buffer=*/true, n);
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
buf = GetFirstBuffer();
if (copy_to_overlap_buffer) {
buf = overlap_buf_;
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
}
uint64_t offset_in_buffer = offset - buf->offset_;
*result = Slice(buf->buffer_.BufferStart() + offset_in_buffer, n);
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
if (prefetched) {
readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
}
return true;
}
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
void FilePrefetchBuffer::PrefetchAsyncCallback(FSReadRequest& req,
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
void* cb_arg) {
BufferInfo* buf = static_cast<BufferInfo*>(cb_arg);
Fix stress test failure in ReadAsync. (#9824) Summary: Fix stress test failure in ReadAsync by ignoring errors injected during async read by FaultInjectionFS. Failure: ``` WARNING: prefix_size is non-zero but memtablerep != prefix_hash Didn't get expected error from MultiGet. num_keys 14 Expected 1 errors, seen 0 Callstack that injected the fault Injected error type = 32538 Message: error; #0 ./db_stress() [0x6f7dd4] rocksdb::port::SaveStack(int*, int) /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/port/stack_trace.cc:152 https://github.com/facebook/rocksdb/issues/1 ./db_stress() [0x7f2bda] rocksdb::FaultInjectionTestFS::InjectThreadSpecificReadError(rocksdb::FaultInjectionTestFS::ErrorOperation, rocksdb::Slice*, bool, char*, bool, bool*) /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/utilities/fault_injection_fs.cc:891 https://github.com/facebook/rocksdb/issues/2 ./db_stress() [0x7f2e78] rocksdb::TestFSRandomAccessFile::Read(unsigned long, unsigned long, rocksdb::IOOptions const&, rocksdb::Slice*, char*, rocksdb::IODebugContext*) const /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/utilities/fault_injection_fs.cc:367 https://github.com/facebook/rocksdb/issues/3 ./db_stress() [0x6483d7] rocksdb::(anonymous namespace)::CompositeRandomAccessFileWrapper::Read(unsigned long, unsigned long, rocksdb::Slice*, char*) const /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/env/composite_env.cc:61 https://github.com/facebook/rocksdb/issues/4 ./db_stress() [0x654564] rocksdb::(anonymous namespace)::LegacyRandomAccessFileWrapper::Read(unsigned long, unsigned long, rocksdb::IOOptions const&, rocksdb::Slice*, char*, rocksdb::IODebugContext*) const /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/env/env.cc:152 https://github.com/facebook/rocksdb/issues/5 ./db_stress() [0x659b3b] rocksdb::FSRandomAccessFile::ReadAsync(rocksdb::FSReadRequest&, rocksdb::IOOptions const&, std::function<void (rocksdb::FSReadRequest const&, void*)>, void*, void**, std::function<void (void*)>*, rocksdb::IODebugContext*) /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/./include/rocksdb/file_system.h:896 https://github.com/facebook/rocksdb/issues/6 ./db_stress() [0x8b8bab] rocksdb::RandomAccessFileReader::ReadAsync(rocksdb::FSReadRequest&, rocksdb::IOOptions const&, std::function<void (rocksdb::FSReadRequest const&, void*)>, void*, void**, std::function<void (void*)>*, rocksdb::Env::IOPriority) /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/file/random_access_file_reader.cc:459 https://github.com/facebook/rocksdb/issues/7 ./db_stress() [0x8b501f] rocksdb::FilePrefetchBuffer::ReadAsync(rocksdb::IOOptions const&, rocksdb::RandomAccessFileReader*, rocksdb::Env::IOPriority, unsigned long, unsigned long, unsigned long, unsigned int) /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/file/file_prefetch_buffer.cc:124 https://github.com/facebook/rocksdb/issues/8 ./db_stress() [0x8b55fc] rocksdb::FilePrefetchBuffer::PrefetchAsync(rocksdb::IOOptions const&, rocksdb::RandomAccessFileReader*, unsigned long, unsigned long, unsigned long, rocksdb::Env::IOPriority, bool&) /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/file/file_prefetch_buffer.cc:363 https://github.com/facebook/rocksdb/issues/9 ./db_stress() [0x8b61f8] rocksdb::FilePrefetchBuffer::TryReadFromCacheAsync(rocksdb::IOOptions const&, rocksdb::RandomAccessFileReader*, unsigned long, unsigned long, rocksdb::Slice*, rocksdb::Status*, rocksdb::Env::IOPriority, bool) /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/file/file_prefetch_buffer.cc:482 https://github.com/facebook/rocksdb/issues/10 ./db_stress() [0x745e04] rocksdb::BlockFetcher::TryGetFromPrefetchBuffer() /data/sandcastle/boxes/trunk-hg-fbcode-fbsource/fbcode/internal_repo_rocksdb/repo/table/block_fetcher.cc:76 ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/9824 Test Plan: ``` ./db_stress --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --allow_concurrent_memtable_write=0 --async_io=1 --atomic_flush=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=0 -- backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=5.037629726741734 --bottommost_compression_type=lz4hc --cache_index_and_filter_blocks=0 --cache_size=8388608 --checkpoint_one_in=1000000 --checksum_type=kxxHash --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_ttl=100 --compression_max_dict_buffer_bytes=1073741823 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zstd --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --db=/home/akankshamahajan/dev/shm/rocksdb/rocksdb_crashtest_blackbox --db_write_buffer_size=8388608 --delpercent=0 --delrangepercent=0 --destroy_db_initially=0 - detect_filter_construct_corruption=1 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --expected_values_dir=/home/akankshamahajan/dev/shm/rocksdb/rocksdb_crashtest_expected --experimental_mempurge_threshold=8.772789063014715 --fail_if_options_file_error=0 --file_checksum_impl=crc32c --flush_one_in=1000000 --format_version=3 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=15 --index_type=3 --iterpercent=0 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=False --long_running_snapshots=0 --mark_for_compaction_one_file_in=0 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=25000000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=2097152 --memtable_prefix_bloom_size_ratio=0.001 --memtable_whole_key_filtering=1 --memtablerep=skip_list --mmap_read=0 --mock_direct_io=True --nooverwritepercent=1 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=0 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=2 --pause_background_one_in=1000000 --periodic_compaction_seconds=1000 --prefix_size=-1 --prefixpercent=0 --prepopulate_block_cache=0 --progress_reports=0 --read_fault_one_in=32 --readpercent=100 --recycle_log_file_num=1 --reopen=0 --reserve_table_reader_memory=1 --ribbon_starting_level=999 --secondary_cache_fault_one_in=0 --set_options_one_in=0 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --subcompactions=2 --sync=0 --sync_fault_injection=False --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --unpartitioned_pinning=2 --use_block_based_filter=0 --use_clock_cache=0 --use_direct_io_for_flush_and_compaction=1 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=1 --use_multiget=1 --user_timestamp_size=0 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=0 ``` Reviewed By: anand1976 Differential Revision: D35514566 Pulled By: akankshamahajan15 fbshipit-source-id: e2a868fdd7422604774c1419738f9926a21e92a4
2022-04-11 17:56:11 +00:00
#ifndef NDEBUG
if (req.result.size() < req.len) {
// Fake an IO error to force db_stress fault injection to ignore
// truncated read errors
IGNORE_STATUS_IF_ERROR(Status::IOError());
}
IGNORE_STATUS_IF_ERROR(req.status);
#endif
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
if (req.status.ok()) {
if (req.offset + req.result.size() <= buf->offset_ + buf->CurrentSize()) {
// All requested bytes are already in the buffer or no data is read
// because of EOF. So no need to update.
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
return;
}
if (req.offset < buf->offset_) {
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
// Next block to be read has changed (Recent read was not a sequential
// read). So ignore this read.
return;
}
size_t current_size = buf->CurrentSize();
buf->buffer_.Size(current_size + req.result.size());
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
}
}
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
RandomAccessFileReader* reader,
uint64_t offset, size_t n,
Slice* result) {
assert(reader != nullptr);
if (!enable_) {
return Status::NotSupported();
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start");
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
num_file_reads_ = 0;
explicit_prefetch_submitted_ = false;
bool is_eligible_for_prefetching = false;
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
if (readahead_size_ > 0 &&
(!implicit_auto_readahead_ ||
num_file_reads_ >= num_file_reads_for_auto_readahead_)) {
is_eligible_for_prefetching = true;
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
// Cancel any pending async read to make code simpler as buffers can be out
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
// of sync.
AbortAllIOs();
// Free empty buffers after aborting IOs.
FreeEmptyBuffers();
ClearOutdatedData(offset, n);
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
// - Since PrefetchAsync can be called on non sequential reads. So offset can
// be less than first buffers' offset. In that case it clears all
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
// buffers.
// - In case of tuning of readahead_size, on Reseek, we have to clear all
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
// buffers otherwise, we may end up with inconsistent BlockHandles in queue
// and data in buffer.
if (!IsBufferQueueEmpty()) {
BufferInfo* buf = GetFirstBuffer();
if (readaheadsize_cb_ != nullptr || !buf->IsOffsetInBuffer(offset)) {
FreeAllBuffers();
}
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);
bool data_found = false;
// If first buffer has full data.
if (!IsBufferQueueEmpty()) {
BufferInfo* buf = GetFirstBuffer();
if (buf->DoesBufferContainData() && buf->IsDataBlockInBuffer(offset, n)) {
uint64_t offset_in_buffer = offset - buf->offset_;
*result = Slice(buf->buffer_.BufferStart() + offset_in_buffer, n);
data_found = true;
UpdateStats(/*found_in_buffer=*/true, n);
// Update num_file_reads_ as TryReadFromCacheAsync won't be called for
// poll and update num_file_reads_ if data is found.
num_file_reads_++;
// If next buffer contains some data or is not eligible for prefetching,
// return.
if (!is_eligible_for_prefetching || NumBuffersAllocated() > 1) {
return Status::OK();
}
} else {
// Partial data in first buffer. Clear it to return continous data in one
// buffer.
FreeAllBuffers();
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
}
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
std::string msg;
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
Status s;
size_t alignment = reader->file()->GetRequiredBufferAlignment();
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
size_t readahead_size = is_eligible_for_prefetching ? readahead_size_ / 2 : 0;
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
size_t offset_to_read = static_cast<size_t>(offset);
uint64_t start_offset1 = offset, end_offset1 = 0, aligned_useful_len1 = 0;
size_t read_len1 = 0;
AllocateBufferIfEmpty();
BufferInfo* buf = GetFirstBuffer();
// - If first buffer is empty.
// - Call async read for full data + readahead_size on first buffer.
// - Call async read for readahead_size on all remaining buffers if
// eligible.
// - If first buffer contains data,
// - Call async read for readahead_size on all remaining buffers if
// eligible.
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
// Calculate length and offsets for reading.
if (!buf->DoesBufferContainData()) {
uint64_t roundup_len1;
// Prefetch full data + readahead_size in the first buffer.
if (is_eligible_for_prefetching || reader->use_direct_io()) {
ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail=*/false,
/*prev_buf_end_offset=*/start_offset1, alignment, n,
readahead_size, start_offset1, end_offset1, read_len1,
aligned_useful_len1);
} else {
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
// No alignment or extra prefetching.
start_offset1 = offset_to_read;
end_offset1 = offset_to_read + n;
roundup_len1 = end_offset1 - start_offset1;
PrepareBufferForRead(buf, alignment, start_offset1, roundup_len1, false,
aligned_useful_len1);
assert(aligned_useful_len1 == 0);
assert(roundup_len1 >= aligned_useful_len1);
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
read_len1 = static_cast<size_t>(roundup_len1);
buf->offset_ = start_offset1;
}
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
if (read_len1 > 0) {
s = ReadAsync(buf, opts, reader, read_len1, start_offset1);
if (!s.ok()) {
DestroyAndClearIOHandle(buf);
FreeLastBuffer();
return s;
}
explicit_prefetch_submitted_ = true;
prev_len_ = 0;
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
}
}
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
if (is_eligible_for_prefetching) {
s = PrefetchRemBuffers(opts, reader, end_offset1, alignment,
readahead_size);
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
if (!s.ok()) {
return s;
}
readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
}
return (data_found ? Status::OK() : Status::TryAgain());
Seek parallelization (#9994) Summary: The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. It lookups a level for a target key, to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key, and scan forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index). This flow can be parallelized. Design: Seek will be called two times under async_io option. First seek will send asynchronous request to prefetch the data blocks at each level and second seek will follow the normal flow and in FilePrefetchBuffer::TryReadFromCacheAsync it will wait for the Poll() to get the results and add the iterator to min_heap. - Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status indicating asynchronous request has been submitted. - If for some reason asynchronous request returns error in submitting the request, it will fallback to sequential reading of blocks in one pass. - If the data already exists in prefetch_buffer, it will return the data without prefetching further and it will be treated as single pass of seek. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994 Test Plan: - **Run Regressions.** ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` i) Previous release 7.0 run for normal prefetching with async_io disabled: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Set seed to 1652922591315307 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:09:51 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations; 340.8 MB/s (249 of 249 found) ``` iii) db_bench with async_io enabled completed succesfully ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1 Set seed to 1652924062021732 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.3 Date: Wed May 18 18:34:22 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations; 293.6 MB/s (217 of 217 found) ``` - db_stress with async_io disabled completed succesfully ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` I**n Progress**: db_stress with async_io is failing and working on debugging/fixing it. Reviewed By: anand1976 Differential Revision: D36459323 Pulled By: akankshamahajan15 fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 23:09:33 +00:00
}
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
Status FilePrefetchBuffer::PrefetchRemBuffers(const IOOptions& opts,
RandomAccessFileReader* reader,
uint64_t end_offset1,
size_t alignment,
size_t readahead_size) {
Status s;
while (NumBuffersAllocated() < num_buffers_) {
BufferInfo* prev_buf = GetLastBuffer();
uint64_t start_offset2 = prev_buf->initial_end_offset_;
AllocateBuffer();
BufferInfo* new_buf = GetLastBuffer();
uint64_t end_offset2 = start_offset2, aligned_useful_len2 = 0;
size_t read_len2 = 0;
ReadAheadSizeTuning(new_buf, /*read_curr_block=*/false,
/*refit_tail=*/false,
/*prev_buf_end_offset=*/end_offset1, alignment,
/*length=*/0, readahead_size, start_offset2,
end_offset2, read_len2, aligned_useful_len2);
if (read_len2 > 0) {
TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:ExtraPrefetching");
s = ReadAsync(new_buf, opts, reader, read_len2, start_offset2);
if (!s.ok()) {
DestroyAndClearIOHandle(new_buf);
FreeLastBuffer();
return s;
}
}
end_offset1 = end_offset2;
}
return s;
}
} // namespace ROCKSDB_NAMESPACE