// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once

#include <algorithm>
#include <cassert>

#include "port/port.h"

namespace ROCKSDB_NAMESPACE {

// This file contains utilities to handle the alignment of pages and buffers.

// Truncate to a multiple of page_size, which is also a page boundary. This
// helps in figuring out the right alignment.
// Example:
//   TruncateToPageBoundary(4096, 5000)  => 4096
//   TruncateToPageBoundary(4096, 10000) => 8192
inline size_t TruncateToPageBoundary(size_t page_size, size_t s) {
  s -= (s & (page_size - 1));
  assert((s % page_size) == 0);
  return s;
}

// Round up x to a multiple of y.
// Example:
//   Roundup(13, 5)   => 15
//   Roundup(201, 16) => 208
inline size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }

// Round down x to a multiple of y.
// Example:
//   Rounddown(13, 5)   => 10
//   Rounddown(201, 16) => 192
inline size_t Rounddown(size_t x, size_t y) { return (x / y) * y; }
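
// Illustrative only (not part of the API): for a direct I/O read of 300 bytes
// at offset 5000 with a 4096-byte page size, the aligned window these helpers
// produce would be:
//
//   size_t aligned_offset = TruncateToPageBoundary(4096, 5000);   // 4096
//   size_t aligned_end = Roundup(5000 + 300, 4096);               // 8192
//   size_t aligned_len = aligned_end - aligned_offset;            // 4096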

// AlignedBuffer manages a buffer by taking alignment into consideration, and
// aligns the buffer start and end positions. It is mainly used for direct I/O,
// though it can be used for other purposes as well.
// It also supports expanding the managed buffer, and copying whole or part of
// the data from the old buffer into the new expanded buffer. Such a copy
// especially helps in cases where it avoids an IO to re-fetch the data from
// disk.
//
// Example:
//   AlignedBuffer buf;
//   buf.Alignment(alignment);
//   buf.AllocateNewBuffer(user_requested_buf_size);
//   ...
//   buf.AllocateNewBuffer(2 * user_requested_buf_size, /*copy_data*/ true,
//                         copy_offset, copy_len);
class AlignedBuffer {
  size_t alignment_;
  std::unique_ptr<char[]> buf_;
  size_t capacity_;
  size_t cursize_;
  char* bufstart_;

 public:

  AlignedBuffer()
      : alignment_(), capacity_(0), cursize_(0), bufstart_(nullptr) {}

  AlignedBuffer(AlignedBuffer&& o) noexcept { *this = std::move(o); }

  AlignedBuffer& operator=(AlignedBuffer&& o) noexcept {
    alignment_ = std::move(o.alignment_);
    buf_ = std::move(o.buf_);
    capacity_ = std::move(o.capacity_);
    cursize_ = std::move(o.cursize_);
    bufstart_ = std::move(o.bufstart_);
    return *this;
  }

  AlignedBuffer(const AlignedBuffer&) = delete;

  AlignedBuffer& operator=(const AlignedBuffer&) = delete;

  static bool isAligned(const void* ptr, size_t alignment) {
    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
  }

  static bool isAligned(size_t n, size_t alignment) {
    return n % alignment == 0;
  }

  size_t Alignment() const { return alignment_; }

  size_t Capacity() const { return capacity_; }

  size_t CurrentSize() const { return cursize_; }

  const char* BufferStart() const { return bufstart_; }

  char* BufferStart() { return bufstart_; }

  void Clear() { cursize_ = 0; }

  char* Release() {
    cursize_ = 0;
    capacity_ = 0;
    bufstart_ = nullptr;
    return buf_.release();
  }

  void Alignment(size_t alignment) {
    assert(alignment > 0);
    assert((alignment & (alignment - 1)) == 0);
    alignment_ = alignment;
  }

  // Allocates a new buffer and sets the start position to the first aligned
  // byte.
  //
  // requested_capacity: requested new buffer capacity. This capacity will be
  //     rounded up based on alignment.
  // copy_data: Copy data from the old buffer to the new buffer. If copy_offset
  //     and copy_len are not passed in and the new requested capacity is
  //     bigger than the existing buffer's capacity, the data in the existing
  //     buffer is fully copied over to the new buffer.
  // copy_offset: Copy data from this offset in the old buffer.
  // copy_len: Number of bytes to copy.
  //
  // The function does nothing if the new requested_capacity is smaller than
  // the current buffer capacity and copy_data is true, i.e. the old buffer is
  // retained as is.
  void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false,
                         uint64_t copy_offset = 0, size_t copy_len = 0) {
    assert(alignment_ > 0);
    assert((alignment_ & (alignment_ - 1)) == 0);

    copy_len = copy_len > 0 ? copy_len : cursize_;
    if (copy_data && requested_capacity < copy_len) {
      // If we are downsizing to a capacity that is smaller than the current
      // data in the buffer -- Ignore the request.
      return;
    }

    size_t new_capacity = Roundup(requested_capacity, alignment_);
char* new_buf = new char[new_capacity + alignment_];
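    // Round new_buf up to the next multiple of alignment_; the extra
    // alignment_ bytes allocated above guarantee that the aligned start still
    // has at least new_capacity usable bytes behind it.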
    char* new_bufstart = reinterpret_cast<char*>(
        (reinterpret_cast<uintptr_t>(new_buf) + (alignment_ - 1)) &
        ~static_cast<uintptr_t>(alignment_ - 1));

    if (copy_data) {
      assert(bufstart_ + copy_offset + copy_len <= bufstart_ + cursize_);
      memcpy(new_bufstart, bufstart_ + copy_offset, copy_len);
      cursize_ = copy_len;
    } else {
      cursize_ = 0;
    }

    bufstart_ = new_bufstart;
    capacity_ = new_capacity;
    buf_.reset(new_buf);
}
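
  // A hedged usage sketch (illustrative only; sizes are arbitrary): growing
  // the buffer while carrying over the bytes accumulated so far, as readahead
  // code built on this class does.
  //
  //   AlignedBuffer buf;
  //   buf.Alignment(4096);
  //   buf.AllocateNewBuffer(8 * 1024);
  //   // ... fill some data with Append() ...
  //   // Double the capacity and copy the existing CurrentSize() bytes over:
  //   buf.AllocateNewBuffer(16 * 1024, /*copy_data=*/true);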

  // Append to the buffer.
  //
  // src : source to copy the data from.
  // append_size : number of bytes to copy from src.
  // Returns the number of bytes appended.
  //
  // If append_size is more than the remaining buffer size, only the
  // remaining-size worth of bytes are copied.
  size_t Append(const char* src, size_t append_size) {
    size_t buffer_remaining = capacity_ - cursize_;
    size_t to_copy = std::min(append_size, buffer_remaining);

    if (to_copy > 0) {
      memcpy(bufstart_ + cursize_, src, to_copy);
      cursize_ += to_copy;
    }
    return to_copy;
}
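
  // Illustrative only: with Capacity() == 4096 and CurrentSize() == 4000,
  // Append(src, 200) copies just 96 bytes and returns 96; callers are expected
  // to check the return value and handle the remainder themselves.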

  // Read from the buffer.
  //
  // dest : destination buffer to copy the data to.
  // offset : the buffer offset to start reading from.
  // read_size : the number of bytes to copy from the buffer to dest.
  // Returns the number of bytes read/copied to dest.
  size_t Read(char* dest, size_t offset, size_t read_size) const {
    assert(offset < cursize_);

    size_t to_read = 0;
    if (offset < cursize_) {
      to_read = std::min(cursize_ - offset, read_size);
    }
    if (to_read > 0) {
      memcpy(dest, bufstart_ + offset, to_read);
    }
    return to_read;
}
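
  // Illustrative only: with CurrentSize() == 512, Read(dest, 500, 100) copies
  // the last 12 bytes of the buffer into dest and returns 12.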

  // Pad from the current size up to the next alignment boundary with
  // "padding".
  void PadToAlignmentWith(int padding) {
    size_t total_size = Roundup(cursize_, alignment_);
    size_t pad_size = total_size - cursize_;

    if (pad_size > 0) {
      assert((pad_size + cursize_) <= capacity_);
      memset(bufstart_ + cursize_, padding, pad_size);
      cursize_ += pad_size;
    }
  }

  void PadWith(size_t pad_size, int padding) {
    assert((pad_size + cursize_) <= capacity_);
    memset(bufstart_ + cursize_, padding, pad_size);
    cursize_ += pad_size;
  }

  // After a partial flush, move the tail to the beginning of the buffer.
  void RefitTail(size_t tail_offset, size_t tail_size) {
    if (tail_size > 0) {
      memmove(bufstart_, bufstart_ + tail_offset, tail_size);
    }
    cursize_ = tail_size;
}
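
  // Illustrative only: if the buffer holds 9000 bytes and the first 8192
  // bytes (two 4096-byte pages) have been flushed, RefitTail(8192, 808) moves
  // the remaining 808 bytes to the front and sets CurrentSize() to 808.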

  // Returns a place to start appending.
  // WARNING: It is possible to write past the end of the buffer if the buffer
  // is modified without using the write APIs or the encapsulation offered by
  // AlignedBuffer. It is up to the user to guard against such errors.
  char* Destination() { return bufstart_ + cursize_; }

  void Size(size_t cursize) { cursize_ = cursize; }
};
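
// A hedged end-to-end sketch of a direct-I/O write path (illustrative only;
// fd, file_offset, data, data_size and kSectorSize are assumptions, and
// pwrite() stands in for whatever write API the caller uses):
//
//   AlignedBuffer buf;
//   buf.Alignment(kSectorSize);     // e.g. 4096; must be a power of two
//   buf.AllocateNewBuffer(64 * 1024);
//   buf.Append(data, data_size);    // may copy fewer bytes than requested
//   buf.PadToAlignmentWith(0);      // direct I/O requires aligned lengths
//   ssize_t n = pwrite(fd, buf.BufferStart(), buf.CurrentSize(), file_offset);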

}  // namespace ROCKSDB_NAMESPACE