mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 20:43:57 +00:00
05c678e135
Summary: ### Context: Background compactions and flush generate large reads and writes, and can be long running, especially for universal compaction. In some cases, this can impact foreground reads and writes by users. From the RocksDB perspective, there can be two kinds of rate limiters, the internal (native) one and the external one. - The internal (native) rate limiter is introduced in [the wiki](https://github.com/facebook/rocksdb/wiki/Rate-Limiter). Currently, only IO_LOW and IO_HIGH are used and they are set statically. - For the external rate limiter, in FSWritableFile functions, IOOptions is open for end users to set and get rate_limiter_priority for their own rate limiter. Currently, RocksDB doesn’t pass the rate_limiter_priority through IOOptions to the file system. ### Solution During the User Read, Flush write, Compaction read/write, the WriteController is used to determine whether DB writes are stalled or slowed down. The rate limiter priority (Env::IOPriority) can be determined accordingly. We decided to always pass the priority in IOOptions. What the file system does with it should be a contract between the user and the file system. We would like to set the rate limiter priority at file level, since the Flush/Compaction job level may be too coarse with multiple files and block IO level is too granular. **This PR is for the Write path.** The **Write:** dynamic priority for different state are listed as follows: | State | Normal | Delayed | Stalled | | ----- | ------ | ------- | ------- | | Flush | IO_HIGH | IO_USER | IO_USER | | Compaction | IO_LOW | IO_USER | IO_USER | Flush and Compaction writes share the same call path through BlockBaseTableWriter, WritableFileWriter, and FSWritableFile. When a new FSWritableFile object is created, its io_priority_ can be set dynamically based on the state of the WriteController. In WritableFileWriter, before the call sites of FSWritableFile functions, WritableFileWriter::DecideRateLimiterPriority() determines the rate_limiter_priority. The options (IOOptions) argument of FSWritableFile functions will be updated with the rate_limiter_priority. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9988 Test Plan: Add unit tests. Reviewed By: anand1976 Differential Revision: D36395159 Pulled By: gitbw95 fbshipit-source-id: a7c82fc29759139a1a07ec46c37dbf7e753474cf
148 lines
4.9 KiB
C++
148 lines
4.9 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <atomic>
|
|
#include <memory>
|
|
#include "rocksdb/rate_limiter.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
class SystemClock;
|
|
class WriteControllerToken;
|
|
|
|
// WriteController is controlling write stalls in our write code-path. Write
|
|
// stalls happen when compaction can't keep up with write rate.
|
|
// All of the methods here (including WriteControllerToken's destructors) need
|
|
// to be called while holding DB mutex
|
|
class WriteController {
|
|
public:
|
|
explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u,
|
|
int64_t low_pri_rate_bytes_per_sec = 1024 * 1024)
|
|
: total_stopped_(0),
|
|
total_delayed_(0),
|
|
total_compaction_pressure_(0),
|
|
credit_in_bytes_(0),
|
|
next_refill_time_(0),
|
|
low_pri_rate_limiter_(
|
|
NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) {
|
|
set_max_delayed_write_rate(_delayed_write_rate);
|
|
}
|
|
~WriteController() = default;
|
|
|
|
// When an actor (column family) requests a stop token, all writes will be
|
|
// stopped until the stop token is released (deleted)
|
|
std::unique_ptr<WriteControllerToken> GetStopToken();
|
|
// When an actor (column family) requests a delay token, total delay for all
|
|
// writes to the DB will be controlled under the delayed write rate. Every
|
|
// write needs to call GetDelay() with number of bytes writing to the DB,
|
|
// which returns number of microseconds to sleep.
|
|
std::unique_ptr<WriteControllerToken> GetDelayToken(
|
|
uint64_t delayed_write_rate);
|
|
// When an actor (column family) requests a moderate token, compaction
|
|
// threads will be increased
|
|
std::unique_ptr<WriteControllerToken> GetCompactionPressureToken();
|
|
|
|
// these three metods are querying the state of the WriteController
|
|
bool IsStopped() const;
|
|
bool NeedsDelay() const { return total_delayed_.load() > 0; }
|
|
bool NeedSpeedupCompaction() const {
|
|
return IsStopped() || NeedsDelay() || total_compaction_pressure_.load() > 0;
|
|
}
|
|
// return how many microseconds the caller needs to sleep after the call
|
|
// num_bytes: how many number of bytes to put into the DB.
|
|
// Prerequisite: DB mutex held.
|
|
uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes);
|
|
void set_delayed_write_rate(uint64_t write_rate) {
|
|
// avoid divide 0
|
|
if (write_rate == 0) {
|
|
write_rate = 1u;
|
|
} else if (write_rate > max_delayed_write_rate()) {
|
|
write_rate = max_delayed_write_rate();
|
|
}
|
|
delayed_write_rate_ = write_rate;
|
|
}
|
|
|
|
void set_max_delayed_write_rate(uint64_t write_rate) {
|
|
// avoid divide 0
|
|
if (write_rate == 0) {
|
|
write_rate = 1u;
|
|
}
|
|
max_delayed_write_rate_ = write_rate;
|
|
// update delayed_write_rate_ as well
|
|
delayed_write_rate_ = write_rate;
|
|
}
|
|
|
|
uint64_t delayed_write_rate() const { return delayed_write_rate_; }
|
|
|
|
uint64_t max_delayed_write_rate() const { return max_delayed_write_rate_; }
|
|
|
|
RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); }
|
|
|
|
private:
|
|
uint64_t NowMicrosMonotonic(SystemClock* clock);
|
|
|
|
friend class WriteControllerToken;
|
|
friend class StopWriteToken;
|
|
friend class DelayWriteToken;
|
|
friend class CompactionPressureToken;
|
|
|
|
std::atomic<int> total_stopped_;
|
|
std::atomic<int> total_delayed_;
|
|
std::atomic<int> total_compaction_pressure_;
|
|
|
|
// Number of bytes allowed to write without delay
|
|
uint64_t credit_in_bytes_;
|
|
// Next time that we can add more credit of bytes
|
|
uint64_t next_refill_time_;
|
|
// Write rate set when initialization or by `DBImpl::SetDBOptions`
|
|
uint64_t max_delayed_write_rate_;
|
|
// Current write rate (bytes / second)
|
|
uint64_t delayed_write_rate_;
|
|
|
|
std::unique_ptr<RateLimiter> low_pri_rate_limiter_;
|
|
};
|
|
|
|
class WriteControllerToken {
|
|
public:
|
|
explicit WriteControllerToken(WriteController* controller)
|
|
: controller_(controller) {}
|
|
virtual ~WriteControllerToken() {}
|
|
|
|
protected:
|
|
WriteController* controller_;
|
|
|
|
private:
|
|
// no copying allowed
|
|
WriteControllerToken(const WriteControllerToken&) = delete;
|
|
void operator=(const WriteControllerToken&) = delete;
|
|
};
|
|
|
|
class StopWriteToken : public WriteControllerToken {
|
|
public:
|
|
explicit StopWriteToken(WriteController* controller)
|
|
: WriteControllerToken(controller) {}
|
|
virtual ~StopWriteToken();
|
|
};
|
|
|
|
class DelayWriteToken : public WriteControllerToken {
|
|
public:
|
|
explicit DelayWriteToken(WriteController* controller)
|
|
: WriteControllerToken(controller) {}
|
|
virtual ~DelayWriteToken();
|
|
};
|
|
|
|
class CompactionPressureToken : public WriteControllerToken {
|
|
public:
|
|
explicit CompactionPressureToken(WriteController* controller)
|
|
: WriteControllerToken(controller) {}
|
|
virtual ~CompactionPressureToken();
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|