rocksdb/file/sst_file_manager_impl.h

223 lines
9.0 KiB
C
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <optional>
#include <string>
#include "db/compaction/compaction.h"
#include "file/delete_scheduler.h"
#include "port/port.h"
#include "rocksdb/sst_file_manager.h"
namespace ROCKSDB_NAMESPACE {
class ErrorHandler;
class FileSystem;
class SystemClock;
class Logger;
// SstFileManager is used to track SST and blob files in the DB and control
// their deletion rate. All SstFileManager public functions are thread-safe.
class SstFileManagerImpl : public SstFileManager {
public:
explicit SstFileManagerImpl(const std::shared_ptr<SystemClock>& clock,
const std::shared_ptr<FileSystem>& fs,
const std::shared_ptr<Logger>& logger,
int64_t rate_bytes_per_sec,
double max_trash_db_ratio,
uint64_t bytes_max_delete_chunk);
// No copy
SstFileManagerImpl(const SstFileManagerImpl& sfm) = delete;
SstFileManagerImpl& operator=(const SstFileManagerImpl& sfm) = delete;
~SstFileManagerImpl();
// DB will call OnAddFile whenever a new sst/blob file is added.
Status OnAddFile(const std::string& file_path);
// Overload where size of the file is provided by the caller rather than
// queried from the filesystem. This is an optimization.
Status OnAddFile(const std::string& file_path, uint64_t file_size);
// DB will call OnDeleteFile whenever a sst/blob file is deleted.
Status OnDeleteFile(const std::string& file_path);
// DB will call OnMoveFile whenever a sst/blob file is move to a new path.
Status OnMoveFile(const std::string& old_path, const std::string& new_path,
uint64_t* file_size = nullptr);
// DB will call OnUntrackFile when closing with an unowned SstFileManager.
Status OnUntrackFile(const std::string& file_path);
// Update the maximum allowed space that should be used by RocksDB, if
// the total size of the SST and blob files exceeds max_allowed_space, writes
// to RocksDB will fail.
//
// Setting max_allowed_space to 0 will disable this feature, maximum allowed
// space will be infinite (Default value).
//
// thread-safe.
void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) override;
void SetCompactionBufferSize(uint64_t compaction_buffer_size) override;
// Return true if the total size of SST and blob files exceeded the maximum
// allowed space usage.
//
// thread-safe.
bool IsMaxAllowedSpaceReached() override;
bool IsMaxAllowedSpaceReachedIncludingCompactions() override;
// Returns true is there is enough (approximate) space for the specified
// compaction. Space is approximate because this function conservatively
// estimates how much space is currently being used by compactions (i.e.
// if a compaction has started, this function bumps the used space by
// the full compaction size).
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
const std::vector<CompactionInputFiles>& inputs,
const Status& bg_error);
// Bookkeeping so total_file_sizes_ goes back to normal after compaction
// finishes
void OnCompactionCompletion(Compaction* c);
uint64_t GetCompactionsReservedSize();
// Return the total size of all tracked files.
uint64_t GetTotalSize() override;
// Return a map containing all tracked files and there corresponding sizes.
std::unordered_map<std::string, uint64_t> GetTrackedFiles() override;
// Return delete rate limit in bytes per second.
int64_t GetDeleteRateBytesPerSecond() override;
// Update the delete rate limit in bytes per second.
void SetDeleteRateBytesPerSecond(int64_t delete_rate) override;
// Return trash/DB size ratio where new files will be deleted immediately
double GetMaxTrashDBRatio() override;
// Update trash/DB size ratio where new files will be deleted immediately
void SetMaxTrashDBRatio(double ratio) override;
// Return the total size of trash files
uint64_t GetTotalTrashSize() override;
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
// Called by each DB instance using this sst file manager to reserve
// disk buffer space for recovery from out of space errors
void ReserveDiskBuffer(uint64_t buffer, const std::string& path);
// Set a flag upon encountering disk full. May enqueue the ErrorHandler
// instance for background polling and recovery
void StartErrorRecovery(ErrorHandler* db, Status bg_error);
// Remove the given Errorhandler instance from the recovery queue. Its
// not guaranteed
bool CancelErrorRecovery(ErrorHandler* db);
// Mark a file as trash and schedule its deletion. If force_bg is set, it
// forces the file to be deleting in the background regardless of DB size,
// except when rate limited delete is disabled.
virtual Status ScheduleFileDeletion(const std::string& file_path,
const std::string& dir_to_sync,
const bool force_bg = false);
// Delete an unaccounted file. The file is deleted immediately if slow
// deletion is disabled. A file with more than 1 hard links will be deleted
// immediately unless force_bg is set. In other cases, files will be scheduled
// for slow deletion, and assigned to the specified bucket if a legitimate one
// is provided. A legitimate bucket is one that is created with the
// `NewTrashBucket` API, and for which `WaitForEmptyTrashBucket` hasn't been
// called yet.
virtual Status ScheduleUnaccountedFileDeletion(
const std::string& file_path, const std::string& dir_to_sync,
const bool force_bg = false,
std::optional<int32_t> bucket = std::nullopt);
// Wait for all files being deleted in the background to finish or for
// destructor to be called.
virtual void WaitForEmptyTrash();
// Creates a new trash bucket. A legitimate bucket is only created and
// returned when slow deletion is enabled.
// For each bucket that is created and used, the user should also call
// `WaitForEmptyTrashBucket` after scheduling file deletions to make sure all
// the trash files are cleared.
std::optional<int32_t> NewTrashBucket();
// Wait for all the files in the specified bucket to be deleted in the
// background or for destructor to be called.
virtual void WaitForEmptyTrashBucket(int32_t bucket);
DeleteScheduler* delete_scheduler() { return &delete_scheduler_; }
// Stop the error recovery background thread. This should be called only
// once in the object's lifetime, and before the destructor
void Close();
void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) override {
stats_ = stats;
delete_scheduler_.SetStatisticsPtr(stats);
}
private:
// REQUIRES: mutex locked
void OnAddFileImpl(const std::string& file_path, uint64_t file_size);
// REQUIRES: mutex locked
void OnDeleteFileImpl(const std::string& file_path);
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
void ClearError();
bool CheckFreeSpace() {
return bg_err_.severity() == Status::Severity::kSoftError;
}
std::shared_ptr<SystemClock> clock_;
Introduce a new storage specific Env API (#5761) Summary: The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc. This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO. The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before. This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection. The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761 Differential Revision: D18868376 Pulled By: anand1976 fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
std::shared_ptr<FileSystem> fs_;
std::shared_ptr<Logger> logger_;
// Mutex to protect tracked_files_, total_files_size_
port::Mutex mu_;
// The summation of the sizes of all files in tracked_files_ map
uint64_t total_files_size_;
// Compactions should only execute if they can leave at least
// this amount of buffer space for logs and flushes
uint64_t compaction_buffer_size_;
// Estimated size of the current ongoing compactions
uint64_t cur_compactions_reserved_size_;
// A map containing all tracked files and there sizes
// file_path => file_size
std::unordered_map<std::string, uint64_t> tracked_files_;
// The maximum allowed space (in bytes) for sst and blob files.
uint64_t max_allowed_space_;
// DeleteScheduler used to throttle file deletion.
DeleteScheduler delete_scheduler_;
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
port::CondVar cv_;
// Flag to force error recovery thread to exit
bool closing_;
// Background error recovery thread
std::unique_ptr<port::Thread> bg_thread_;
// A path in the filesystem corresponding to this SFM. This is used for
// calling Env::GetFreeSpace. Posix requires a path in the filesystem
std::string path_;
// Save the current background error
Status bg_err_;
// Amount of free disk headroom before allowing recovery from hard errors
uint64_t reserved_disk_buffer_;
// For soft errors, amount of free disk space before we can allow
// compactions to run full throttle. If disk space is below this trigger,
// compactions will be gated by free disk space > input size
uint64_t free_space_trigger_;
// List of database error handler instances tracked by this SstFileManager.
Auto recovery from out of space errors (#4164) Summary: This commit implements automatic recovery from a Status::NoSpace() error during background operations such as write callback, flush and compaction. The broad design is as follows - 1. Compaction errors are treated as soft errors and don't put the database in read-only mode. A compaction is delayed until enough free disk space is available to accomodate the compaction outputs, which is estimated based on the input size. This means that users can continue to write, and we rely on the WriteController to delay or stop writes if the compaction debt becomes too high due to persistent low disk space condition 2. Errors during write callback and flush are treated as hard errors, i.e the database is put in read-only mode and goes back to read-write only fater certain recovery actions are taken. 3. Both types of recovery rely on the SstFileManagerImpl to poll for sufficient disk space. We assume that there is a 1-1 mapping between an SFM and the underlying OS storage container. For cases where multiple DBs are hosted on a single storage container, the user is expected to allocate a single SFM instance and use the same one for all the DBs. If no SFM is specified by the user, DBImpl::Open() will allocate one, but this will be one per DB and each DB will recover independently. The recovery implemented by SFM is as follows - a) On the first occurance of an out of space error during compaction, subsequent compactions will be delayed until the disk free space check indicates enough available space. The required space is computed as the sum of input sizes. b) The free space check requirement will be removed once the amount of free space is greater than the size reserved by in progress compactions when the first error occured c) If the out of space error is a hard error, a background thread in SFM will poll for sufficient headroom before triggering the recovery of the database and putting it in write-only mode. The headroom is calculated as the sum of the write_buffer_size of all the DB instances associated with the SFM 4. EventListener callbacks will be called at the start and completion of automatic recovery. Users can disable the auto recov ery in the start callback, and later initiate it manually by calling DB::Resume() Todo: 1. More extensive testing 2. Add disk full condition to db_stress (follow-on PR) Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164 Differential Revision: D9846378 Pulled By: anand1976 fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
std::list<ErrorHandler*> error_handler_list_;
// Pointer to ErrorHandler instance that is currently processing recovery
ErrorHandler* cur_instance_;
std::shared_ptr<Statistics> stats_;
};
} // namespace ROCKSDB_NAMESPACE