mirror of
https://github.com/facebook/rocksdb.git
synced 2024-12-01 16:15:59 +00:00
a43481b3d0
Summary: When the rate limiter does not have any waiting requests, the first request to arrive may consume all of the available bandwidth, despite potentially having lower priority than requests that arrive later in the same refill interval. Then, those higher priority requests must wait for a refill. So even in scenarios in which we have an overall bandwidth surplus, the highest priority requests can be sporadically delayed up to a whole refill period. Alone, this isn't necessarily problematic as the refill period is configurable via `refill_period_us` and can be tuned down as needed until the max sporadic delay is tolerable. However, tuning down `refill_period_us` had a side effect of reducing burst size. Some users require a certain burst size to issue optimal I/O sizes to the underlying storage system. To satisfy those users, this PR decouples the refill period from the burst size. That way, the max sporadic delay can be limited without impacting I/O sizes issued to the underlying storage system.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12379

Test Plan: The goal is to show we can now limit the max sporadic delay without impacting compaction's I/O size. The benchmark runs compaction with a large I/O size, while user reads simultaneously run at a low rate that does not consume all of the available bandwidth. The max sporadic delay is measured using the P100 of rocksdb.file.read.get.micros. I just used strace to verify the compaction reads follow `rate_limiter_single_burst_bytes`.

Setup:

`./db_bench -benchmarks=fillrandom,flush -write_buffer_size=67108864 -disable_auto_compactions=true -value_size=256 -num=1048576`

Benchmark:

`./db_bench -benchmarks=readrandom -use_existing_db=true -num=1048576 -duration=10 -benchmark_read_rate_limit=4096 -rate_limiter_bytes_per_sec=67108864 -rate_limiter_refill_period_us=$refill_micros -rate_limiter_single_burst_bytes=16777216 -rate_limit_bg_reads=true -rate_limit_user_ops=true -statistics=true -cache_size=0 -stats_level=5 -compaction_readahead_size=16777216 -use_direct_reads=true`

Results:

refill_micros | rocksdb.file.read.get.micros (P100)
-- | --
10000 | 10802
100000 | 100240
1000000 | 922061

For verifying compaction read sizes:

`strace -fye pread64 ./db_bench -benchmarks=compact -use_existing_db=true -rate_limiter_bytes_per_sec=67108864 -rate_limiter_refill_period_us=$refill_micros -rate_limiter_single_burst_bytes=16777216 -rate_limit_bg_reads=true -compaction_readahead_size=16777216 -use_direct_reads=true`

Reviewed By: hx235

Differential Revision: D54165675

Pulled By: ajkr

fbshipit-source-id: c5968486316cbfb7ff8e5b7d75d3589883dd1105
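As an illustration of the decoupling, here is a minimal sketch (not part of this change) mirroring the benchmark settings above: a 64 MiB/s budget, a 10 ms refill period to cap the sporadic delay, and a 16 MiB burst so compaction can keep issuing 16 MiB reads. It assumes the public `NewGenericRateLimiter()` factory and the `RateLimiter::SetSingleBurstBytes()` virtual that `GenericRateLimiter` overrides in the header below; check `include/rocksdb/rate_limiter.h` for the exact signatures.

```cpp
// Illustrative sketch only; signatures assumed from the public RateLimiter
// API, not taken verbatim from this PR.
#include <cassert>
#include <memory>

#include "rocksdb/rate_limiter.h"
#include "rocksdb/status.h"

std::shared_ptr<rocksdb::RateLimiter> MakeCompactionLimiter() {
  // 64 MiB/s budget, refilled every 10 ms, limiting both reads and writes.
  // A short refill period bounds how long a high-priority request can be
  // starved after a lower-priority burst drains the bucket.
  std::shared_ptr<rocksdb::RateLimiter> limiter(rocksdb::NewGenericRateLimiter(
      /*rate_bytes_per_sec=*/64 << 20,
      /*refill_period_us=*/10 * 1000,
      /*fairness=*/10,
      rocksdb::RateLimiter::Mode::kAllIo));

  // Without an explicit burst, GetSingleBurstBytes() falls back to one refill
  // period's worth of bytes (64 MiB/s * 10 ms = 0.64 MiB), which would shrink
  // compaction I/Os. Setting 16 MiB keeps the large I/O size.
  rocksdb::Status s = limiter->SetSingleBurstBytes(16 << 20);
  assert(s.ok());
  (void)s;  // avoid an unused-variable warning when asserts are disabled
  return limiter;
}
```

With these settings, strace on compaction should still show 16 MiB preads, while the P100 of rocksdb.file.read.get.micros stays near the 10 ms refill period, matching the first row of the results table.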
157 lines
5 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <algorithm>
#include <atomic>
#include <chrono>
#include <deque>

#include "port/port.h"
#include "rocksdb/env.h"
#include "rocksdb/rate_limiter.h"
#include "rocksdb/status.h"
#include "rocksdb/system_clock.h"
#include "util/mutexlock.h"
#include "util/random.h"

namespace ROCKSDB_NAMESPACE {

class GenericRateLimiter : public RateLimiter {
 public:
  GenericRateLimiter(int64_t refill_bytes, int64_t refill_period_us,
                     int32_t fairness, RateLimiter::Mode mode,
                     const std::shared_ptr<SystemClock>& clock, bool auto_tuned,
                     int64_t single_burst_bytes);

  virtual ~GenericRateLimiter();

  // This API allows the user to dynamically change the rate limiter's bytes
  // per second.
  void SetBytesPerSecond(int64_t bytes_per_second) override;

  Status SetSingleBurstBytes(int64_t single_burst_bytes) override;

  // Requests a token to write `bytes`. If the request cannot be satisfied
  // immediately, the call blocks. The caller is responsible for ensuring
  // bytes <= GetSingleBurstBytes() and bytes >= 0; negative bytes passed in
  // will be rounded up to 0.
  using RateLimiter::Request;
  void Request(const int64_t bytes, const Env::IOPriority pri,
               Statistics* stats) override;
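  // Usage example (illustrative only, not part of the upstream header): a
  // caller charging a 1 MiB compaction read against the limiter might call
  //   rate_limiter->Request(1 << 20, Env::IO_LOW, /*stats=*/nullptr);
  // after clamping the request size to GetSingleBurstBytes().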
  // Returns the configured burst size. When it was left unset (zero), falls
  // back to one refill period's worth of bytes.
  int64_t GetSingleBurstBytes() const override {
    int64_t raw_single_burst_bytes =
        raw_single_burst_bytes_.load(std::memory_order_relaxed);
    if (raw_single_burst_bytes == 0) {
      return refill_bytes_per_period_.load(std::memory_order_relaxed);
    }
    return raw_single_burst_bytes;
  }

  int64_t GetTotalBytesThrough(
      const Env::IOPriority pri = Env::IO_TOTAL) const override {
    MutexLock g(&request_mutex_);
    if (pri == Env::IO_TOTAL) {
      int64_t total_bytes_through_sum = 0;
      for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
        total_bytes_through_sum += total_bytes_through_[i];
      }
      return total_bytes_through_sum;
    }
    return total_bytes_through_[pri];
  }

  int64_t GetTotalRequests(
      const Env::IOPriority pri = Env::IO_TOTAL) const override {
    MutexLock g(&request_mutex_);
    if (pri == Env::IO_TOTAL) {
      int64_t total_requests_sum = 0;
      for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
        total_requests_sum += total_requests_[i];
      }
      return total_requests_sum;
    }
    return total_requests_[pri];
  }

  Status GetTotalPendingRequests(
      int64_t* total_pending_requests,
      const Env::IOPriority pri = Env::IO_TOTAL) const override {
    assert(total_pending_requests != nullptr);
    MutexLock g(&request_mutex_);
    if (pri == Env::IO_TOTAL) {
      int64_t total_pending_requests_sum = 0;
      for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) {
        total_pending_requests_sum += static_cast<int64_t>(queue_[i].size());
      }
      *total_pending_requests = total_pending_requests_sum;
    } else {
      *total_pending_requests = static_cast<int64_t>(queue_[pri].size());
    }
    return Status::OK();
  }

  int64_t GetBytesPerSecond() const override {
    return rate_bytes_per_sec_.load(std::memory_order_relaxed);
  }

  virtual void TEST_SetClock(std::shared_ptr<SystemClock> clock) {
    MutexLock g(&request_mutex_);
    clock_ = std::move(clock);
    next_refill_us_ = NowMicrosMonotonicLocked();
  }

 private:
  static constexpr int kMicrosecondsPerSecond = 1000000;
  void RefillBytesAndGrantRequestsLocked();
  std::vector<Env::IOPriority> GeneratePriorityIterationOrderLocked();
  int64_t CalculateRefillBytesPerPeriodLocked(int64_t rate_bytes_per_sec);
  Status TuneLocked();
  void SetBytesPerSecondLocked(int64_t bytes_per_second);

  uint64_t NowMicrosMonotonicLocked() {
    // Converts nanoseconds to microseconds (std::milli::den == 1000).
    return clock_->NowNanos() / std::milli::den;
  }

  // This mutex guards all internal state.
  mutable port::Mutex request_mutex_;

  const int64_t refill_period_us_;

  std::atomic<int64_t> rate_bytes_per_sec_;
  std::atomic<int64_t> refill_bytes_per_period_;
  // This value is validated but unsanitized (may be zero).
  std::atomic<int64_t> raw_single_burst_bytes_;
  std::shared_ptr<SystemClock> clock_;

  bool stop_;
  port::CondVar exit_cv_;
  int32_t requests_to_wait_;

  int64_t total_requests_[Env::IO_TOTAL];
  int64_t total_bytes_through_[Env::IO_TOTAL];
  int64_t available_bytes_;
  int64_t next_refill_us_;

  int32_t fairness_;
  Random rnd_;

  struct Req;
  std::deque<Req*> queue_[Env::IO_TOTAL];
  bool wait_until_refill_pending_;

  bool auto_tuned_;
  int64_t num_drains_;
  const int64_t max_bytes_per_sec_;
  std::chrono::microseconds tuned_time_;
};

}  // namespace ROCKSDB_NAMESPACE