2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2015-10-14 08:14:53 +00:00
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#ifdef ROCKSDB_LIB_IO_POSIX
|
2017-04-06 02:02:00 +00:00
|
|
|
#include "env/io_posix.h"
|
2022-10-25 00:54:14 +00:00
|
|
|
|
2015-10-23 16:16:46 +00:00
|
|
|
#include <fcntl.h>
|
2022-10-25 00:54:14 +00:00
|
|
|
|
2016-04-21 17:37:27 +00:00
|
|
|
#include <algorithm>
|
2023-12-04 19:17:32 +00:00
|
|
|
#include <cerrno>
|
2015-10-23 16:16:46 +00:00
|
|
|
#if defined(OS_LINUX)
|
|
|
|
#include <linux/fs.h>
|
2019-09-06 00:33:35 +00:00
|
|
|
#ifndef FALLOC_FL_KEEP_SIZE
|
2019-08-16 20:55:37 +00:00
|
|
|
#include <linux/falloc.h>
|
2015-10-23 16:16:46 +00:00
|
|
|
#endif
|
2019-09-06 00:33:35 +00:00
|
|
|
#endif
|
2015-10-23 16:16:46 +00:00
|
|
|
#include <sys/ioctl.h>
|
|
|
|
#include <sys/mman.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <sys/types.h>
|
2023-12-04 19:17:32 +00:00
|
|
|
|
|
|
|
#include <cstdio>
|
|
|
|
#include <cstdlib>
|
|
|
|
#include <cstring>
|
2015-10-23 16:16:46 +00:00
|
|
|
#ifdef OS_LINUX
|
|
|
|
#include <sys/statfs.h>
|
2017-04-26 08:09:05 +00:00
|
|
|
#include <sys/sysmacros.h>
|
2015-10-23 16:16:46 +00:00
|
|
|
#endif
|
2017-04-06 02:02:00 +00:00
|
|
|
#include "monitoring/iostats_context_imp.h"
|
2015-10-23 16:16:46 +00:00
|
|
|
#include "port/port.h"
|
2021-09-15 19:43:35 +00:00
|
|
|
#include "port/stack_trace.h"
|
2015-10-23 16:16:46 +00:00
|
|
|
#include "rocksdb/slice.h"
|
2019-05-31 00:39:43 +00:00
|
|
|
#include "test_util/sync_point.h"
|
2019-12-08 04:54:27 +00:00
|
|
|
#include "util/autovector.h"
|
2015-10-23 16:16:46 +00:00
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/string_util.h"
|
2015-10-14 08:14:53 +00:00
|
|
|
|
2017-11-10 17:25:26 +00:00
|
|
|
#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
|
|
|
|
#define F_LINUX_SPECIFIC_BASE 1024
|
2019-05-15 21:16:36 +00:00
|
|
|
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
|
2017-11-10 17:25:26 +00:00
|
|
|
#endif
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2015-10-14 08:14:53 +00:00
|
|
|
|
2020-03-12 01:36:43 +00:00
|
|
|
// Builds the message text used for I/O errors: `context` on its own when
// `file_name` is empty, otherwise "<context>: <file_name>".
std::string IOErrorMsg(const std::string& context,
                       const std::string& file_name) {
  std::string message(context);
  if (!file_name.empty()) {
    message.append(": ").append(file_name);
  }
  return message;
}
|
|
|
|
|
|
|
|
// file_name can be left empty if it is not unkown.
|
|
|
|
IOStatus IOError(const std::string& context, const std::string& file_name,
|
|
|
|
int err_number) {
|
|
|
|
switch (err_number) {
|
|
|
|
case ENOSPC: {
|
|
|
|
IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name),
|
2021-03-25 06:06:31 +00:00
|
|
|
errnoStr(err_number).c_str());
|
2020-03-12 01:36:43 +00:00
|
|
|
s.SetRetryable(true);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
case ESTALE:
|
|
|
|
return IOStatus::IOError(IOStatus::kStaleFile);
|
|
|
|
case ENOENT:
|
|
|
|
return IOStatus::PathNotFound(IOErrorMsg(context, file_name),
|
2021-03-25 06:06:31 +00:00
|
|
|
errnoStr(err_number).c_str());
|
2020-03-12 01:36:43 +00:00
|
|
|
default:
|
|
|
|
return IOStatus::IOError(IOErrorMsg(context, file_name),
|
2021-03-25 06:06:31 +00:00
|
|
|
errnoStr(err_number).c_str());
|
2020-03-12 01:36:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-14 08:14:53 +00:00
|
|
|
// Wrapper around posix_fadvise(). When OS_LINUX is defined the call is
// forwarded unchanged and its return value is passed back; otherwise the
// arguments are ignored and 0 is returned.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#if !defined(OS_LINUX)
  // Not built for Linux: mark the parameters as used and do nothing.
  static_cast<void>(fd);
  static_cast<void>(offset);
  static_cast<void>(len);
  static_cast<void>(advice);
  return 0;
#else
  return posix_fadvise(fd, offset, len, advice);
#endif
}
|
|
|
|
|
2022-06-15 20:05:58 +00:00
|
|
|
// Wrapper around posix_madvise(). When OS_LINUX is defined the call is
// forwarded unchanged and its return value is passed back; otherwise the
// arguments are ignored and 0 is returned.
int Madvise(void* addr, size_t len, int advice) {
#if !defined(OS_LINUX)
  // Not built for Linux: mark the parameters as used and do nothing.
  static_cast<void>(addr);
  static_cast<void>(len);
  static_cast<void>(advice);
  return 0;
#else
  return posix_madvise(addr, len, advice);
#endif
}
|
|
|
|
|
2017-02-23 19:17:49 +00:00
|
|
|
namespace {
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyway.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
|
2019-05-15 21:16:36 +00:00
|
|
|
// On MacOS (and probably *BSD), the posix write and pwrite calls do not support
|
|
|
|
// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
|
|
|
|
// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
|
|
|
|
// the writes aligned.
|
|
|
|
|
|
|
|
// Writes all `nbyte` bytes of `buf` to `fd` with write(), issuing at most
// 1GB per call. A call that fails with EINTR is retried; any other failure
// ends the write and returns false. Returns true once everything has been
// written (including the trivial nbyte == 0 case).
bool PosixWrite(int fd, const char* buf, size_t nbyte) {
  const size_t kMaxChunk = 1UL << 30;

  size_t written = 0;
  while (written != nbyte) {
    const size_t chunk = std::min(nbyte - written, kMaxChunk);
    const ssize_t rc = write(fd, buf + written, chunk);
    if (rc >= 0) {
      // Possibly a short write; the next iteration picks up the remainder.
      written += rc;
      continue;
    }
    if (errno != EINTR) {
      return false;
    }
    // Interrupted by a signal: retry the same chunk.
  }
  return true;
}
|
|
|
|
|
|
|
|
// Writes all `nbyte` bytes of `buf` to `fd` starting at file position
// `offset` with pwrite(), issuing at most 1GB per call. A call that fails
// with EINTR is retried; any other failure ends the write and returns false.
// Returns true once everything has been written.
bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
  const size_t kMaxChunk = 1UL << 30;

  size_t written = 0;
  while (written != nbyte) {
    const size_t chunk = std::min(nbyte - written, kMaxChunk);
    const ssize_t rc = pwrite(fd, buf + written, chunk, offset);
    if (rc >= 0) {
      // Advance both the source cursor and the file position by what was
      // actually written.
      written += rc;
      offset += rc;
      continue;
    }
    if (errno != EINTR) {
      return false;
    }
    // Interrupted by a signal: retry the same chunk at the same offset.
  }
  return true;
}
|
|
|
|
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
#ifdef ROCKSDB_RANGESYNC_PRESENT
|
|
|
|
|
|
|
|
#if !defined(ZFS_SUPER_MAGIC)
|
|
|
|
// The magic number for ZFS was not exposed until recently. It should be fixed
|
|
|
|
// forever so we can just copy the magic number here.
|
|
|
|
#define ZFS_SUPER_MAGIC 0x2fc12fc1
|
|
|
|
#endif
|
|
|
|
|
2019-06-13 20:52:43 +00:00
|
|
|
bool IsSyncFileRangeSupported(int fd) {
|
2020-06-19 22:26:05 +00:00
|
|
|
// This function tracks and checks for cases where we know `sync_file_range`
|
|
|
|
// definitely will not work properly despite passing the compile-time check
|
|
|
|
// (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks
|
|
|
|
// fail in unexpected ways, we allow `sync_file_range` to be used. This way
|
|
|
|
// should minimize risk of impacting existing use cases.
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
struct statfs buf;
|
|
|
|
int ret = fstatfs(fd, &buf);
|
|
|
|
assert(ret == 0);
|
2019-06-13 20:52:43 +00:00
|
|
|
if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) {
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
// Testing on ZFS showed the writeback did not happen asynchronously when
|
|
|
|
// `sync_file_range` was called, even though it returned success. Avoid it
|
|
|
|
// and use `fdatasync` instead to preserve the contract of `bytes_per_sync`,
|
|
|
|
// even though this'll incur extra I/O for metadata.
|
|
|
|
return false;
|
|
|
|
}
|
2019-06-13 20:52:43 +00:00
|
|
|
|
|
|
|
ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */);
|
|
|
|
assert(!(ret == -1 && errno != ENOSYS));
|
|
|
|
if (ret == -1 && errno == ENOSYS) {
|
|
|
|
// `sync_file_range` is not implemented on all platforms even if
|
|
|
|
// compile-time checks pass and a supported filesystem is in-use. For
|
|
|
|
// example, using ext4 on WSL (Windows Subsystem for Linux),
|
|
|
|
// `sync_file_range()` returns `ENOSYS`
|
|
|
|
// ("Function not implemented").
|
|
|
|
return false;
|
|
|
|
}
|
2020-06-19 22:26:05 +00:00
|
|
|
// None of the known cases matched, so allow `sync_file_range` use.
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
#undef ZFS_SUPER_MAGIC
|
|
|
|
|
|
|
|
#endif // ROCKSDB_RANGESYNC_PRESENT
|
|
|
|
|
|
|
|
} // anonymous namespace
|
2017-02-23 19:17:49 +00:00
|
|
|
|
2015-10-14 08:14:53 +00:00
|
|
|
/*
|
|
|
|
* PosixSequentialFile
|
|
|
|
*/
|
2017-01-12 00:42:07 +00:00
|
|
|
// Stores the file name, the stdio stream, the file descriptor and the logical
// block size. Direct I/O mode is taken from `options.use_direct_reads`.
// Requesting direct reads and mmap reads together is rejected by an assert.
PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file,
                                         int fd, size_t logical_block_size,
                                         const EnvOptions& options)
    : filename_(fname),
      file_(file),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(logical_block_size) {
  assert(!(options.use_direct_reads && options.use_mmap_reads));
}
|
2015-10-14 08:14:53 +00:00
|
|
|
|
2017-01-12 00:42:07 +00:00
|
|
|
// Releases the underlying handle: the raw descriptor fd_ in direct I/O mode,
// the stdio stream file_ otherwise.
PosixSequentialFile::~PosixSequentialFile() {
  if (use_direct_io()) {
    assert(fd_);
    close(fd_);
    return;
  }
  assert(file_);
  fclose(file_);
}
|
2015-10-14 08:14:53 +00:00
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it is retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
|
|
|
|
Slice* result, char* scratch,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2017-01-13 20:01:08 +00:00
|
|
|
assert(result != nullptr && !use_direct_io());
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus s;
|
2015-10-14 08:14:53 +00:00
|
|
|
size_t r = 0;
|
|
|
|
do {
|
2020-03-30 04:54:47 +00:00
|
|
|
clearerr(file_);
|
2015-10-14 08:14:53 +00:00
|
|
|
r = fread_unlocked(scratch, 1, n, file_);
|
|
|
|
} while (r == 0 && ferror(file_) && errno == EINTR);
|
|
|
|
*result = Slice(scratch, r);
|
|
|
|
if (r < n) {
|
|
|
|
if (feof(file_)) {
|
|
|
|
// We leave status as ok if we hit the end of the file
|
|
|
|
// We also clear the error so that the reads can continue
|
|
|
|
// if a new data is written to the file
|
|
|
|
clearerr(file_);
|
|
|
|
} else {
|
|
|
|
// A partial read with an error: return a non-ok status
|
2017-06-26 19:42:21 +00:00
|
|
|
s = IOError("While reading file sequentially", filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
}
|
2017-01-12 00:42:07 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n,
|
|
|
|
const IOOptions& /*opts*/,
|
|
|
|
Slice* result, char* scratch,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2018-06-21 21:45:49 +00:00
|
|
|
assert(use_direct_io());
|
|
|
|
assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
|
|
|
|
assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
|
|
|
|
assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus s;
|
2017-01-12 00:42:07 +00:00
|
|
|
ssize_t r = -1;
|
|
|
|
size_t left = n;
|
|
|
|
char* ptr = scratch;
|
|
|
|
while (left > 0) {
|
|
|
|
r = pread(fd_, ptr, left, static_cast<off_t>(offset));
|
|
|
|
if (r <= 0) {
|
|
|
|
if (r == -1 && errno == EINTR) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ptr += r;
|
|
|
|
offset += r;
|
|
|
|
left -= r;
|
2020-05-19 00:23:22 +00:00
|
|
|
if (!IsSectorAligned(r, GetRequiredBufferAlignment())) {
|
2017-01-12 00:42:07 +00:00
|
|
|
// Bytes reads don't fill sectors. Should only happen at the end
|
|
|
|
// of the file.
|
|
|
|
break;
|
|
|
|
}
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2017-01-12 00:42:07 +00:00
|
|
|
if (r < 0) {
|
|
|
|
// An error: return a non-ok status
|
2022-05-06 20:03:58 +00:00
|
|
|
s = IOError("While pread " + std::to_string(n) + " bytes from offset " +
|
|
|
|
std::to_string(offset),
|
|
|
|
filename_, errno);
|
2017-01-12 00:42:07 +00:00
|
|
|
}
|
|
|
|
*result = Slice(scratch, (r < 0) ? 0 : n - left);
|
2015-10-14 08:14:53 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixSequentialFile::Skip(uint64_t n) {
|
2015-10-14 08:14:53 +00:00
|
|
|
if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
|
2022-05-06 20:03:58 +00:00
|
|
|
return IOError("While fseek to skip " + std::to_string(n) + " bytes",
|
|
|
|
filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
|
2015-10-14 08:14:53 +00:00
|
|
|
#ifndef OS_LINUX
|
2018-04-13 00:55:14 +00:00
|
|
|
(void)offset;
|
|
|
|
(void)length;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
#else
|
2017-01-13 20:01:08 +00:00
|
|
|
if (!use_direct_io()) {
|
2017-01-12 00:42:07 +00:00
|
|
|
// free OS pages
|
|
|
|
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
|
|
|
if (ret != 0) {
|
2022-05-06 20:03:58 +00:00
|
|
|
return IOError("While fadvise NotNeeded offset " +
|
|
|
|
std::to_string(offset) + " len " +
|
|
|
|
std::to_string(length),
|
2017-06-26 19:42:21 +00:00
|
|
|
filename_, errno);
|
2017-01-12 00:42:07 +00:00
|
|
|
}
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2017-01-12 00:42:07 +00:00
|
|
|
#endif
|
2016-04-21 17:37:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PosixRandomAccessFile
|
|
|
|
*/
|
2015-10-14 08:14:53 +00:00
|
|
|
#if defined(OS_LINUX)
|
2016-04-21 17:37:27 +00:00
|
|
|
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
|
2015-10-14 08:14:53 +00:00
|
|
|
if (max_size < kMaxVarint64Length * 3) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct stat buf;
|
|
|
|
int result = fstat(fd, &buf);
|
|
|
|
if (result == -1) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
long version = 0;
|
|
|
|
result = ioctl(fd, FS_IOC_GETVERSION, &version);
|
2015-12-16 02:20:10 +00:00
|
|
|
TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
|
2015-10-14 08:14:53 +00:00
|
|
|
if (result == -1) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
uint64_t uversion = (uint64_t)version;
|
|
|
|
|
|
|
|
char* rid = id;
|
|
|
|
rid = EncodeVarint64(rid, buf.st_dev);
|
|
|
|
rid = EncodeVarint64(rid, buf.st_ino);
|
|
|
|
rid = EncodeVarint64(rid, uversion);
|
|
|
|
assert(rid >= id);
|
|
|
|
return static_cast<size_t>(rid - id);
|
|
|
|
}
|
2016-05-18 23:21:29 +00:00
|
|
|
#endif
|
|
|
|
|
2017-04-22 03:41:37 +00:00
|
|
|
#if defined(OS_MACOSX) || defined(OS_AIX)
|
2016-04-21 17:37:27 +00:00
|
|
|
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
|
2016-05-18 23:21:29 +00:00
|
|
|
if (max_size < kMaxVarint64Length * 3) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct stat buf;
|
|
|
|
int result = fstat(fd, &buf);
|
|
|
|
if (result == -1) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
char* rid = id;
|
|
|
|
rid = EncodeVarint64(rid, buf.st_dev);
|
|
|
|
rid = EncodeVarint64(rid, buf.st_ino);
|
|
|
|
rid = EncodeVarint64(rid, buf.st_gen);
|
|
|
|
assert(rid >= id);
|
|
|
|
return static_cast<size_t>(rid - id);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
#endif
|
2020-03-12 01:36:43 +00:00
|
|
|
|
|
|
|
#ifdef OS_LINUX
|
|
|
|
// Returns a copy of path with one trailing '/' removed.
//
// At most a single slash is stripped, and a path of length 0 or 1 (for
// example "/") is returned unchanged.
std::string RemoveTrailingSlash(const std::string& path) {
  const bool has_strippable_slash = path.size() > 1 && path.back() == '/';
  if (!has_strippable_slash) {
    return path;
  }
  return path.substr(0, path.size() - 1);
}
|
|
|
|
|
|
|
|
// Takes one reference on the cache entry of every directory in `directories`
// (after stripping one trailing slash from each), creating entries as needed.
//
// For directories that have no entry yet, the logical block size is obtained
// through get_logical_block_size_of_directory_ and stored in the new entry.
// If any of those lookups fails, its Status is returned immediately and the
// cache is left untouched (no reference is taken for any directory).
Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize(
    const std::vector<std::string>& directories) {
  std::vector<std::string> normalized;
  normalized.reserve(directories.size());
  for (const auto& raw_dir : directories) {
    normalized.emplace_back(RemoveTrailingSlash(raw_dir));
  }

  // Step 1 (under ReadLock): collect the directories that are not cached yet.
  std::map<std::string, size_t> uncached_sizes;
  {
    ReadLock lock(&cache_mutex_);
    for (const auto& key : normalized) {
      if (cache_.find(key) != cache_.end()) {
        continue;
      }
      uncached_sizes.emplace(key, 0);
    }
  }

  // Step 2 (no lock held): query the block size of each uncached directory.
  Status s;
  for (auto& pending : uncached_sizes) {
    s = get_logical_block_size_of_directory_(pending.first, &pending.second);
    if (!s.ok()) {
      return s;
    }
  }

  // Step 3 (under WriteLock): bump the reference count of every directory and
  // record the sizes gathered in step 2.
  // NOTE(review): the lock is dropped between step 1 and step 3. If an entry
  // seen in step 1 is erased concurrently before step 3, cache_[key] would
  // create a fresh entry whose size is not assigned here -- confirm whether
  // callers can race like this.
  WriteLock lock(&cache_mutex_);
  for (const auto& key : normalized) {
    auto& slot = cache_[key];
    slot.ref++;
    const auto probed = uncached_sizes.find(key);
    if (probed != uncached_sizes.end()) {
      slot.size = probed->second;
    }
  }
  return s;
}
|
|
|
|
|
|
|
|
void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize(
|
|
|
|
const std::vector<std::string>& directories) {
|
|
|
|
std::vector<std::string> dirs;
|
|
|
|
dirs.reserve(directories.size());
|
|
|
|
for (auto& dir : directories) {
|
|
|
|
dirs.emplace_back(RemoveTrailingSlash(dir));
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteLock lock(&cache_mutex_);
|
|
|
|
for (const auto& dir : dirs) {
|
|
|
|
auto it = cache_.find(dir);
|
|
|
|
if (it != cache_.end() && !(--(it->second.ref))) {
|
|
|
|
cache_.erase(it);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,
|
|
|
|
int fd) {
|
2023-12-04 19:17:32 +00:00
|
|
|
std::string dir = fname.substr(0, fname.find_last_of('/'));
|
2020-03-12 01:36:43 +00:00
|
|
|
if (dir.empty()) {
|
|
|
|
dir = "/";
|
|
|
|
}
|
|
|
|
{
|
|
|
|
ReadLock lock(&cache_mutex_);
|
|
|
|
auto it = cache_.find(dir);
|
|
|
|
if (it != cache_.end()) {
|
|
|
|
return it->second.size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return get_logical_block_size_of_fd_(fd);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// Opens `directory` read-only, stores the result of GetLogicalBlockSizeOfFd()
// for it in *size, and closes the descriptor again.
//
// Returns Status::OK() on success, or Status::IOError when the directory
// cannot be opened (in which case *size is not written).
Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
                                                   size_t* size) {
  const int dir_fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
  if (dir_fd != -1) {
    *size = PosixHelper::GetLogicalBlockSizeOfFd(dir_fd);
    close(dir_fd);
    return Status::OK();
  }
  return Status::IOError("Cannot open directory " + directory);
}
|
|
|
|
|
|
|
|
size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
|
|
|
|
#ifdef OS_LINUX
|
|
|
|
struct stat buf;
|
|
|
|
int result = fstat(fd, &buf);
|
|
|
|
if (result == -1) {
|
|
|
|
return kDefaultPageSize;
|
|
|
|
}
|
|
|
|
if (major(buf.st_dev) == 0) {
|
|
|
|
// Unnamed devices (e.g. non-device mounts), reserved as null device number.
|
|
|
|
// These don't have an entry in /sys/dev/block/. Return a sensible default.
|
|
|
|
return kDefaultPageSize;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reading queue/logical_block_size does not require special permissions.
|
|
|
|
const int kBufferSize = 100;
|
|
|
|
char path[kBufferSize];
|
|
|
|
char real_path[PATH_MAX + 1];
|
|
|
|
snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
|
|
|
|
minor(buf.st_dev));
|
|
|
|
if (realpath(path, real_path) == nullptr) {
|
|
|
|
return kDefaultPageSize;
|
|
|
|
}
|
|
|
|
std::string device_dir(real_path);
|
|
|
|
if (!device_dir.empty() && device_dir.back() == '/') {
|
|
|
|
device_dir.pop_back();
|
|
|
|
}
|
|
|
|
// NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
|
|
|
|
// and nvme0n1 have it.
|
|
|
|
// $ ls -al '/sys/dev/block/8:3'
|
|
|
|
// lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
|
|
|
|
// ../../block/sda/sda3
|
|
|
|
// $ ls -al '/sys/dev/block/259:4'
|
|
|
|
// lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
|
|
|
|
// ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
|
|
|
|
size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
|
|
|
|
if (parent_end == std::string::npos) {
|
|
|
|
return kDefaultPageSize;
|
|
|
|
}
|
|
|
|
size_t parent_begin = device_dir.rfind('/', parent_end - 1);
|
|
|
|
if (parent_begin == std::string::npos) {
|
|
|
|
return kDefaultPageSize;
|
|
|
|
}
|
|
|
|
std::string parent =
|
|
|
|
device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
|
|
|
|
std::string child = device_dir.substr(parent_end + 1, std::string::npos);
|
|
|
|
if (parent != "block" &&
|
|
|
|
(child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
|
|
|
|
device_dir = device_dir.substr(0, parent_end);
|
|
|
|
}
|
|
|
|
std::string fname = device_dir + "/queue/logical_block_size";
|
|
|
|
FILE* fp;
|
|
|
|
size_t size = 0;
|
|
|
|
fp = fopen(fname.c_str(), "r");
|
|
|
|
if (fp != nullptr) {
|
|
|
|
char* line = nullptr;
|
|
|
|
size_t len = 0;
|
|
|
|
if (getline(&line, &len, fp) != -1) {
|
|
|
|
sscanf(line, "%zu", &size);
|
|
|
|
}
|
|
|
|
free(line);
|
|
|
|
fclose(fp);
|
|
|
|
}
|
|
|
|
if (size != 0 && (size & (size - 1)) == 0) {
|
|
|
|
return size;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
(void)fd;
|
|
|
|
return kDefaultPageSize;
|
|
|
|
}
|
|
|
|
|
2015-10-14 08:14:53 +00:00
|
|
|
/*
|
|
|
|
* PosixRandomAccessFile
|
|
|
|
*
|
|
|
|
* pread() based random-access
|
|
|
|
*/
|
2019-12-08 04:54:27 +00:00
|
|
|
// Builds a pread()-based random-access file over the already-open descriptor
// `fd`.
//
//   fname               stored in filename_ (used in error messages).
//   fd                  stored in fd_; it is closed by the destructor.
//   logical_block_size  stored in logical_sector_size_.
//   options             use_direct_reads selects direct I/O; use_mmap_reads
//                       must be false (asserted below).
//   thread_local_io_urings  only present when ROCKSDB_IOURING_PRESENT is
//                       defined; stored in thread_local_io_urings_.
PosixRandomAccessFile::PosixRandomAccessFile(
    const std::string& fname, int fd, size_t logical_block_size,
    const EnvOptions& options
#if defined(ROCKSDB_IOURING_PRESENT)
    ,
    ThreadLocalPtr* thread_local_io_urings
#endif
    )
    : filename_(fname),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(logical_block_size)
#if defined(ROCKSDB_IOURING_PRESENT)
      ,
      thread_local_io_urings_(thread_local_io_urings)
#endif
{
  // Direct reads and mmap reads are mutually exclusive ...
  assert(!options.use_direct_reads || !options.use_mmap_reads);
  // ... and this class does not accept mmap reads at all.
  assert(!options.use_mmap_reads);
}
|
|
|
|
|
|
|
|
// Closes the file descriptor held in fd_. The result of close() is ignored.
PosixRandomAccessFile::~PosixRandomAccessFile() {
  close(fd_);
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
|
|
|
|
const IOOptions& /*opts*/, Slice* result,
|
|
|
|
char* scratch,
|
|
|
|
IODebugContext* /*dbg*/) const {
|
2017-05-04 08:21:31 +00:00
|
|
|
if (use_direct_io()) {
|
|
|
|
assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
|
|
|
|
assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
|
|
|
|
assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
|
|
|
|
}
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus s;
|
2015-10-14 08:14:53 +00:00
|
|
|
ssize_t r = -1;
|
|
|
|
size_t left = n;
|
|
|
|
char* ptr = scratch;
|
|
|
|
while (left > 0) {
|
|
|
|
r = pread(fd_, ptr, left, static_cast<off_t>(offset));
|
|
|
|
if (r <= 0) {
|
2017-01-12 00:42:07 +00:00
|
|
|
if (r == -1 && errno == EINTR) {
|
2015-10-14 08:14:53 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ptr += r;
|
|
|
|
offset += r;
|
|
|
|
left -= r;
|
2017-01-13 20:01:08 +00:00
|
|
|
if (use_direct_io() &&
|
2017-01-12 00:42:07 +00:00
|
|
|
r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
|
|
|
|
// Bytes reads don't fill sectors. Should only happen at the end
|
|
|
|
// of the file.
|
|
|
|
break;
|
|
|
|
}
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
if (r < 0) {
|
|
|
|
// An error: return a non-ok status
|
2022-05-06 20:03:58 +00:00
|
|
|
s = IOError("While pread offset " + std::to_string(offset) + " len " +
|
|
|
|
std::to_string(n),
|
|
|
|
filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2017-01-12 00:42:07 +00:00
|
|
|
*result = Slice(scratch, (r < 0) ? 0 : n - left);
|
2015-10-14 08:14:53 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2022-10-25 00:54:14 +00:00
|
|
|
IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
const IOOptions& options,
|
|
|
|
IODebugContext* dbg) {
|
2020-04-23 22:17:00 +00:00
|
|
|
if (use_direct_io()) {
|
|
|
|
for (size_t i = 0; i < num_reqs; i++) {
|
2020-04-24 22:11:42 +00:00
|
|
|
assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment()));
|
|
|
|
assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment()));
|
|
|
|
assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment()));
|
2020-04-23 22:17:00 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-08 04:54:27 +00:00
|
|
|
#if defined(ROCKSDB_IOURING_PRESENT)
|
|
|
|
struct io_uring* iu = nullptr;
|
|
|
|
if (thread_local_io_urings_) {
|
|
|
|
iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
|
|
|
|
if (iu == nullptr) {
|
|
|
|
iu = CreateIOUring();
|
|
|
|
if (iu != nullptr) {
|
|
|
|
thread_local_io_urings_->Reset(iu);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Init failed, platform doesn't support io_uring. Fall back to
|
|
|
|
// serialized reads
|
|
|
|
if (iu == nullptr) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
|
2019-12-08 04:54:27 +00:00
|
|
|
}
|
|
|
|
|
2021-05-18 23:08:21 +00:00
|
|
|
IOStatus ios = IOStatus::OK();
|
|
|
|
|
2019-12-08 04:54:27 +00:00
|
|
|
struct WrappedReadRequest {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
FSReadRequest* req;
|
2019-12-08 04:54:27 +00:00
|
|
|
struct iovec iov;
|
2020-02-22 00:56:04 +00:00
|
|
|
size_t finished_len;
|
|
|
|
explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
|
2019-12-08 04:54:27 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
autovector<WrappedReadRequest, 32> req_wraps;
|
2020-02-22 00:56:04 +00:00
|
|
|
autovector<WrappedReadRequest*, 4> incomplete_rq_list;
|
2021-09-15 19:43:35 +00:00
|
|
|
std::unordered_set<WrappedReadRequest*> wrap_cache;
|
2019-12-08 04:54:27 +00:00
|
|
|
|
|
|
|
for (size_t i = 0; i < num_reqs; i++) {
|
|
|
|
req_wraps.emplace_back(&reqs[i]);
|
|
|
|
}
|
|
|
|
|
2020-02-22 00:56:04 +00:00
|
|
|
size_t reqs_off = 0;
|
|
|
|
while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
|
|
|
|
size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();
|
2019-12-08 04:54:27 +00:00
|
|
|
|
|
|
|
// If requests exceed depth, split it into batches
|
2023-12-04 19:17:32 +00:00
|
|
|
if (this_reqs > kIoUringDepth) {
|
|
|
|
this_reqs = kIoUringDepth;
|
|
|
|
}
|
2019-12-08 04:54:27 +00:00
|
|
|
|
2020-02-22 00:56:04 +00:00
|
|
|
assert(incomplete_rq_list.size() <= this_reqs);
|
2019-12-08 04:54:27 +00:00
|
|
|
for (size_t i = 0; i < this_reqs; i++) {
|
2020-02-22 00:56:04 +00:00
|
|
|
WrappedReadRequest* rep_to_submit;
|
|
|
|
if (i < incomplete_rq_list.size()) {
|
|
|
|
rep_to_submit = incomplete_rq_list[i];
|
|
|
|
} else {
|
|
|
|
rep_to_submit = &req_wraps[reqs_off++];
|
|
|
|
}
|
|
|
|
assert(rep_to_submit->req->len > rep_to_submit->finished_len);
|
|
|
|
rep_to_submit->iov.iov_base =
|
|
|
|
rep_to_submit->req->scratch + rep_to_submit->finished_len;
|
|
|
|
rep_to_submit->iov.iov_len =
|
|
|
|
rep_to_submit->req->len - rep_to_submit->finished_len;
|
2019-12-08 04:54:27 +00:00
|
|
|
|
2020-02-22 00:56:04 +00:00
|
|
|
struct io_uring_sqe* sqe;
|
2019-12-08 04:54:27 +00:00
|
|
|
sqe = io_uring_get_sqe(iu);
|
2020-02-22 00:56:04 +00:00
|
|
|
io_uring_prep_readv(
|
|
|
|
sqe, fd_, &rep_to_submit->iov, 1,
|
|
|
|
rep_to_submit->req->offset + rep_to_submit->finished_len);
|
|
|
|
io_uring_sqe_set_data(sqe, rep_to_submit);
|
2021-09-15 19:43:35 +00:00
|
|
|
wrap_cache.emplace(rep_to_submit);
|
2019-12-08 04:54:27 +00:00
|
|
|
}
|
2020-02-22 00:56:04 +00:00
|
|
|
incomplete_rq_list.clear();
|
2019-12-08 04:54:27 +00:00
|
|
|
|
2020-02-22 00:56:04 +00:00
|
|
|
ssize_t ret =
|
|
|
|
io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
|
2021-05-18 23:08:21 +00:00
|
|
|
TEST_SYNC_POINT_CALLBACK(
|
|
|
|
"PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
|
|
|
|
&ret);
|
|
|
|
TEST_SYNC_POINT_CALLBACK(
|
|
|
|
"PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
|
|
|
|
iu);
|
|
|
|
|
2019-12-08 04:54:27 +00:00
|
|
|
if (static_cast<size_t>(ret) != this_reqs) {
|
|
|
|
fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
|
2021-05-18 23:08:21 +00:00
|
|
|
// If error happens and we submitted fewer than expected, it is an
|
|
|
|
// exception case and we don't retry here. We should still consume
|
|
|
|
// what is submitted in the ring.
|
|
|
|
for (ssize_t i = 0; i < ret; i++) {
|
|
|
|
struct io_uring_cqe* cqe = nullptr;
|
|
|
|
io_uring_wait_cqe(iu, &cqe);
|
|
|
|
if (cqe != nullptr) {
|
|
|
|
io_uring_cqe_seen(iu, cqe);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return IOStatus::IOError("io_uring_submit_and_wait() requested " +
|
2022-05-06 20:03:58 +00:00
|
|
|
std::to_string(this_reqs) + " but returned " +
|
|
|
|
std::to_string(ret));
|
2019-12-08 04:54:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < this_reqs; i++) {
|
2021-05-18 23:08:21 +00:00
|
|
|
struct io_uring_cqe* cqe = nullptr;
|
2019-12-08 04:54:27 +00:00
|
|
|
WrappedReadRequest* req_wrap;
|
|
|
|
|
|
|
|
// We could use the peek variant here, but this seems safer in terms
|
|
|
|
// of our initial wait not reaping all completions
|
|
|
|
ret = io_uring_wait_cqe(iu, &cqe);
|
2021-05-18 23:08:21 +00:00
|
|
|
TEST_SYNC_POINT_CALLBACK(
|
|
|
|
"PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret);
|
|
|
|
if (ret) {
|
2022-05-06 20:03:58 +00:00
|
|
|
ios = IOStatus::IOError("io_uring_wait_cqe() returns " +
|
|
|
|
std::to_string(ret));
|
2021-05-18 23:08:21 +00:00
|
|
|
|
|
|
|
if (cqe != nullptr) {
|
|
|
|
io_uring_cqe_seen(iu, cqe);
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
2020-02-22 00:56:04 +00:00
|
|
|
|
2019-12-08 04:54:27 +00:00
|
|
|
req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
|
2021-09-15 19:43:35 +00:00
|
|
|
// Reset cqe data to catch any stray reuse of it
|
|
|
|
static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
|
|
|
|
// Check that we got a valid unique cqe data
|
|
|
|
auto wrap_check = wrap_cache.find(req_wrap);
|
|
|
|
if (wrap_check == wrap_cache.end()) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"PosixRandomAccessFile::MultiRead: "
|
|
|
|
"Bad cqe data from IO uring - %p\n",
|
|
|
|
req_wrap);
|
|
|
|
port::PrintStack();
|
|
|
|
ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
|
2022-05-06 20:03:58 +00:00
|
|
|
std::to_string((uint64_t)req_wrap));
|
2021-09-15 19:43:35 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
wrap_cache.erase(wrap_check);
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
FSReadRequest* req = req_wrap->req;
|
2022-04-04 22:35:43 +00:00
|
|
|
size_t bytes_read = 0;
|
2022-07-06 18:42:59 +00:00
|
|
|
bool read_again = false;
|
2022-03-11 02:28:31 +00:00
|
|
|
UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len,
|
2022-07-06 18:42:59 +00:00
|
|
|
false /*async_read*/, use_direct_io(),
|
|
|
|
GetRequiredBufferAlignment(), req_wrap->finished_len, req,
|
|
|
|
bytes_read, read_again);
|
2022-03-11 02:28:31 +00:00
|
|
|
int32_t res = cqe->res;
|
2022-04-04 22:35:43 +00:00
|
|
|
if (res >= 0) {
|
2022-07-18 22:37:29 +00:00
|
|
|
if (bytes_read == 0) {
|
|
|
|
if (read_again) {
|
|
|
|
Slice tmp_slice;
|
|
|
|
req->status =
|
|
|
|
Read(req->offset + req_wrap->finished_len,
|
|
|
|
req->len - req_wrap->finished_len, options, &tmp_slice,
|
|
|
|
req->scratch + req_wrap->finished_len, dbg);
|
|
|
|
req->result =
|
|
|
|
Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
|
|
|
|
}
|
|
|
|
// else It means EOF so no need to do anything.
|
2022-04-04 22:35:43 +00:00
|
|
|
} else if (bytes_read < req_wrap->iov.iov_len) {
|
|
|
|
incomplete_rq_list.push_back(req_wrap);
|
2020-02-22 00:56:04 +00:00
|
|
|
}
|
2019-12-08 04:54:27 +00:00
|
|
|
}
|
|
|
|
io_uring_cqe_seen(iu, cqe);
|
|
|
|
}
|
2021-09-15 19:43:35 +00:00
|
|
|
wrap_cache.clear();
|
2019-12-08 04:54:27 +00:00
|
|
|
}
|
2021-05-18 23:08:21 +00:00
|
|
|
return ios;
|
2019-12-08 04:54:27 +00:00
|
|
|
#else
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
|
2019-12-08 04:54:27 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
// Asks the OS to start reading `n` bytes at `offset` ahead of time.
//
// Nothing is requested when the file is opened for direct I/O. On Linux the
// request is issued with readahead(); on macOS with fcntl(F_RDADVISE). On any
// other platform no call is made and the default-constructed status is
// returned. A -1 result from the platform call is reported as an IOError that
// carries the offset, length, file name and errno.
IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n,
                                         const IOOptions& /*opts*/,
                                         IODebugContext* /*dbg*/) {
  IOStatus status;
  if (use_direct_io()) {
    return status;
  }

  ssize_t rc = 0;
#ifdef OS_LINUX
  rc = readahead(fd_, offset, n);
#endif
#ifdef OS_MACOSX
  radvisory advice;
  advice.ra_offset = static_cast<off_t>(offset);
  advice.ra_count = static_cast<int>(n);
  rc = fcntl(fd_, F_RDADVISE, &advice);
#endif
  if (rc == -1) {
    status = IOError("While prefetching offset " + std::to_string(offset) +
                         " len " + std::to_string(n),
                     filename_, errno);
  }
  return status;
}
|
|
|
|
|
2017-04-22 03:41:37 +00:00
|
|
|
#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
// Forwards this file's descriptor, the caller's buffer `id` and its capacity
// `max_size` to PosixHelper::GetUniqueIdFromFile and returns that helper's
// result unchanged.
size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
  const size_t id_size = PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
  return id_size;
}
#endif
|
|
|
|
|
|
|
|
// Passes an access-pattern hint for the whole file (offset 0, length 0) to
// Fadvise.
//
// No hint is issued when the file uses direct I/O. An unrecognized pattern
// trips an assert in debug builds and is otherwise ignored without calling
// Fadvise. The result of Fadvise is not inspected.
void PosixRandomAccessFile::Hint(AccessPattern pattern) {
  if (use_direct_io()) {
    return;
  }

  // Translate the pattern into the matching POSIX_FADV_* value first so the
  // Fadvise call appears only once.
  int advice = 0;
  switch (pattern) {
    case kNormal:
      advice = POSIX_FADV_NORMAL;
      break;
    case kRandom:
      advice = POSIX_FADV_RANDOM;
      break;
    case kSequential:
      advice = POSIX_FADV_SEQUENTIAL;
      break;
    case kWillNeed:
      advice = POSIX_FADV_WILLNEED;
      break;
    case kWontNeed:
      advice = POSIX_FADV_DONTNEED;
      break;
    default:
      assert(false);
      return;
  }
  Fadvise(fd_, 0, 0, advice);
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
|
2017-01-13 20:01:08 +00:00
|
|
|
if (use_direct_io()) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2017-01-12 00:42:07 +00:00
|
|
|
}
|
2015-10-14 08:14:53 +00:00
|
|
|
#ifndef OS_LINUX
|
2018-04-13 00:55:14 +00:00
|
|
|
(void)offset;
|
|
|
|
(void)length;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
#else
|
|
|
|
// free OS pages
|
|
|
|
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
|
|
|
if (ret == 0) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2022-05-06 20:03:58 +00:00
|
|
|
return IOError("While fadvise NotNeeded offset " + std::to_string(offset) +
|
|
|
|
" len " + std::to_string(length),
|
2017-06-26 19:42:21 +00:00
|
|
|
filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2022-03-11 02:28:31 +00:00
|
|
|
// Queues one asynchronous read of `req` on an io_uring and returns without
// waiting for it to complete.
//
// On success, *io_handle receives a newly allocated Posix_IOHandle describing
// the in-flight request and *del_fn receives a deleter that frees it.
//
// Returns:
//   - NotSupported when io_uring is not compiled in, or no ring could be
//     obtained for this call.
//   - IOError when no submission queue entry is available (in that case
//     *io_handle and *del_fn are left untouched), or when io_uring_submit()
//     fails (in that case *io_handle and *del_fn have already been set).
//   - OK once the request has been submitted.
//
// With direct I/O, the offset, length and scratch buffer of `req` are
// asserted to be aligned to GetRequiredBufferAlignment().
IOStatus PosixRandomAccessFile::ReadAsync(
    FSReadRequest& req, const IOOptions& /*opts*/,
    std::function<void(FSReadRequest&, void*)> cb, void* cb_arg,
    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) {
  if (use_direct_io()) {
    assert(IsSectorAligned(req.offset, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(req.len, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(req.scratch, GetRequiredBufferAlignment()));
  }

#if defined(ROCKSDB_IOURING_PRESENT)
  // Step 1: look up the io_uring cached in thread_local_io_urings_, creating
  // and caching one on first use.
  struct io_uring* iu = nullptr;
  if (thread_local_io_urings_) {
    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
    if (iu == nullptr) {
      iu = CreateIOUring();
      if (iu != nullptr) {
        thread_local_io_urings_->Reset(iu);
      }
    }
  }

  // Init failed, platform doesn't support io_uring.
  if (iu == nullptr) {
    return IOStatus::NotSupported("ReadAsync");
  }

  // Step 2: reserve a submission queue entry. io_uring_get_sqe() returns
  // nullptr when the submission queue is full; bail out before allocating the
  // handle so nothing has to be unwound and the null entry is never
  // dereferenced by io_uring_prep_readv().
  struct io_uring_sqe* sqe = io_uring_get_sqe(iu);
  if (sqe == nullptr) {
    return IOStatus::IOError(
        "io_uring_get_sqe() returned nullptr, submission queue is full");
  }

  // Deleter handed back to the caller for releasing the io_handle.
  IOHandleDeleter deletefn = [](void* args) -> void {
    delete (static_cast<Posix_IOHandle*>(args));
  };

  // Initialize Posix_IOHandle.
  Posix_IOHandle* posix_handle =
      new Posix_IOHandle(iu, cb, cb_arg, req.offset, req.len, req.scratch,
                         use_direct_io(), GetRequiredBufferAlignment());
  posix_handle->iov.iov_base = req.scratch;
  posix_handle->iov.iov_len = req.len;

  *io_handle = static_cast<void*>(posix_handle);
  *del_fn = deletefn;

  // Step 3: describe the read in the reserved entry.
  io_uring_prep_readv(sqe, fd_, /*sqe->addr=*/&posix_handle->iov,
                      /*sqe->len=*/1, /*sqe->offset=*/posix_handle->offset);

  // Sets sqe->user_data to posix_handle.
  io_uring_sqe_set_data(sqe, posix_handle);

  // Step 4: io_uring_submit
  ssize_t ret = io_uring_submit(iu);
  if (ret < 0) {
    fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
    return IOStatus::IOError("io_uring_submit() requested but returned " +
                             std::to_string(ret));
  }
  return IOStatus::OK();
#else
  (void)req;
  (void)cb;
  (void)cb_arg;
  (void)io_handle;
  (void)del_fn;
  return IOStatus::NotSupported("ReadAsync");
#endif
}
|
|
|
|
|
2015-10-14 08:14:53 +00:00
|
|
|
/*
|
|
|
|
* PosixMmapReadableFile
|
|
|
|
*
|
|
|
|
* mmap() based random-access
|
|
|
|
*/
|
|
|
|
// base[0,length-1] contains the mmapped contents of the file.
|
|
|
|
// Records the descriptor, file name and mapping [base, base + length) that
// the rest of the class serves reads from. The destructor later unmaps the
// region and closes `fd`.
//
// `options` is only inspected by the debug assertions: this file type is
// expected to be used with mmap reads enabled and direct reads disabled.
PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
                                             const std::string& fname,
                                             void* base, size_t length,
                                             const EnvOptions& options)
    : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
  // The asserts below compile away under NDEBUG, which would leave `options`
  // unreferenced; the cast is a no-op otherwise.
  (void)options;
  fd_ += 0;  // suppress the warning for used variables
  assert(options.use_mmap_reads);
  assert(!options.use_direct_reads);
}
|
|
|
|
|
|
|
|
// Tears down the mapping and closes the descriptor given to the constructor.
// A destructor has no way to return a status, so a failed munmap() is only
// reported as a diagnostic message.
PosixMmapReadableFile::~PosixMmapReadableFile() {
  int ret = munmap(mmapped_region_, length_);
  if (ret != 0) {
    // Diagnostics go to stderr so they cannot be mixed into the regular
    // output of a program embedding this library.
    fprintf(stderr, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
            mmapped_region_, length_);
  }
  close(fd_);
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n,
|
|
|
|
const IOOptions& /*opts*/, Slice* result,
|
|
|
|
char* /*scratch*/,
|
|
|
|
IODebugContext* /*dbg*/) const {
|
|
|
|
IOStatus s;
|
2015-10-14 08:14:53 +00:00
|
|
|
if (offset > length_) {
|
|
|
|
*result = Slice();
|
2022-05-06 20:03:58 +00:00
|
|
|
return IOError("While mmap read offset " + std::to_string(offset) +
|
|
|
|
" larger than file length " + std::to_string(length_),
|
2017-06-26 19:42:21 +00:00
|
|
|
filename_, EINVAL);
|
2015-10-14 08:14:53 +00:00
|
|
|
} else if (offset + n > length_) {
|
|
|
|
n = static_cast<size_t>(length_ - offset);
|
|
|
|
}
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
*result = Slice(static_cast<char*>(mmapped_region_) + offset, n);
|
2015-10-14 08:14:53 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2022-06-10 23:34:01 +00:00
|
|
|
// Translates the requested access pattern into the matching POSIX_MADV_*
// value and applies it to the whole mapped region via Madvise(). The result
// of Madvise() is not checked. An unrecognized pattern trips an assert in
// debug builds and is otherwise ignored.
void PosixMmapReadableFile::Hint(AccessPattern pattern) {
  int advice = POSIX_MADV_NORMAL;
  switch (pattern) {
    case kNormal:
      advice = POSIX_MADV_NORMAL;
      break;
    case kRandom:
      advice = POSIX_MADV_RANDOM;
      break;
    case kSequential:
      advice = POSIX_MADV_SEQUENTIAL;
      break;
    case kWillNeed:
      advice = POSIX_MADV_WILLNEED;
      break;
    case kWontNeed:
      advice = POSIX_MADV_DONTNEED;
      break;
    default:
      assert(false);
      return;  // no advice is issued for an unknown pattern
  }
  Madvise(mmapped_region_, length_, advice);
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
|
2015-10-14 08:14:53 +00:00
|
|
|
#ifndef OS_LINUX
|
2018-04-13 00:55:14 +00:00
|
|
|
(void)offset;
|
|
|
|
(void)length;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
#else
|
|
|
|
// free OS pages
|
|
|
|
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
|
|
|
if (ret == 0) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2022-05-06 20:03:58 +00:00
|
|
|
return IOError("While fadvise not needed. Offset " + std::to_string(offset) +
|
|
|
|
" len" + std::to_string(length),
|
2017-06-26 19:42:21 +00:00
|
|
|
filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PosixMmapFile
|
|
|
|
*
|
|
|
|
* We preallocate up to an extra megabyte and use memcpy to append new
|
|
|
|
* data to the file. This is safe since we either properly close the
|
|
|
|
* file before reading from it, or for log files, the reading code
|
|
|
|
* knows enough to skip zero suffixes.
|
|
|
|
*/
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixMmapFile::UnmapCurrentRegion() {
|
2021-05-05 22:49:29 +00:00
|
|
|
TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
|
2015-10-14 08:14:53 +00:00
|
|
|
if (base_ != nullptr) {
|
|
|
|
int munmap_status = munmap(base_, limit_ - base_);
|
|
|
|
if (munmap_status != 0) {
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While munmap", filename_, munmap_status);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
file_offset_ += limit_ - base_;
|
|
|
|
base_ = nullptr;
|
|
|
|
limit_ = nullptr;
|
|
|
|
last_sync_ = nullptr;
|
|
|
|
dst_ = nullptr;
|
|
|
|
|
|
|
|
// Increase the amount we map the next time, but capped at 1MB
|
|
|
|
if (map_size_ < (1 << 20)) {
|
|
|
|
map_size_ *= 2;
|
|
|
|
}
|
|
|
|
}
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixMmapFile::MapNewRegion() {
|
2015-10-14 08:14:53 +00:00
|
|
|
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
|
|
|
assert(base_ == nullptr);
|
2021-05-05 22:49:29 +00:00
|
|
|
TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
|
2015-10-14 08:14:53 +00:00
|
|
|
// we can't fallocate with FALLOC_FL_KEEP_SIZE here
|
|
|
|
if (allow_fallocate_) {
|
|
|
|
IOSTATS_TIMER_GUARD(allocate_nanos);
|
|
|
|
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
|
|
|
|
if (alloc_status != 0) {
|
|
|
|
// fallback to posix_fallocate
|
|
|
|
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
|
|
|
|
}
|
|
|
|
if (alloc_status != 0) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::IOError("Error allocating space to file : " + filename_ +
|
2021-03-25 06:06:31 +00:00
|
|
|
"Error : " + errnoStr(alloc_status).c_str());
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-05-05 22:49:29 +00:00
|
|
|
TEST_KILL_RANDOM("PosixMmapFile::Append:1");
|
2015-10-14 08:14:53 +00:00
|
|
|
void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
|
|
|
|
file_offset_);
|
|
|
|
if (ptr == MAP_FAILED) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::IOError("MMap failed on " + filename_);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2021-05-05 22:49:29 +00:00
|
|
|
TEST_KILL_RANDOM("PosixMmapFile::Append:2");
|
2015-10-14 08:14:53 +00:00
|
|
|
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
base_ = static_cast<char*>(ptr);
|
2015-10-14 08:14:53 +00:00
|
|
|
limit_ = base_ + map_size_;
|
|
|
|
dst_ = base_;
|
|
|
|
last_sync_ = base_;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
#else
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::NotSupported("This platform doesn't support fallocate()");
|
2015-10-14 08:14:53 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixMmapFile::Msync() {
|
2015-10-14 08:14:53 +00:00
|
|
|
if (dst_ == last_sync_) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
// Find the beginnings of the pages that contain the first and last
|
|
|
|
// bytes to be synced.
|
|
|
|
size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
|
|
|
|
size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
|
|
|
|
last_sync_ = dst_;
|
2021-05-05 22:49:29 +00:00
|
|
|
TEST_KILL_RANDOM("PosixMmapFile::Msync:0");
|
2015-10-14 08:14:53 +00:00
|
|
|
if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While msync", filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Builds an mmap-backed writable file around an existing descriptor.
//
//   fname     - file name, stored in filename_ (used in error messages).
//   fd        - file descriptor, stored in fd_.
//   page_size - page size; asserted to have at most one bit set.
//   options   - must have use_mmap_writes set and use_direct_writes clear
//               (asserted); the fallocate settings are copied only when
//               ROCKSDB_FALLOCATE_PRESENT is defined.
//
// No region is mapped here: all mapping pointers start out null and the
// initial map_size_ is 65536 rounded up via Roundup() to page_size.
PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                             const EnvOptions& options)
    : filename_(fname),
      fd_(fd),
      page_size_(page_size),
      map_size_(Roundup(65536, page_size)),
      base_(nullptr),
      limit_(nullptr),
      dst_(nullptr),
      last_sync_(nullptr),
      file_offset_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#else
  // Keeps `options` referenced when asserts are compiled out as well.
  (void)options;
#endif
  // page_size & (page_size - 1) is zero only when at most one bit is set.
  assert((page_size & (page_size - 1)) == 0);
  assert(options.use_mmap_writes);
  assert(!options.use_direct_writes);
}
|
|
|
|
|
|
|
|
// Closes the file if it is still open (fd_ >= 0). A destructor cannot report
// a failure, so the status returned by Close() is explicitly marked as
// intentionally unchecked.
PosixMmapFile::~PosixMmapFile() {
  if (fd_ < 0) {
    return;  // Already closed.
  }
  PosixMmapFile::Close(IOOptions(), nullptr).PermitUncheckedError();
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2015-10-14 08:14:53 +00:00
|
|
|
const char* src = data.data();
|
|
|
|
size_t left = data.size();
|
|
|
|
while (left > 0) {
|
|
|
|
assert(base_ <= dst_);
|
|
|
|
assert(dst_ <= limit_);
|
|
|
|
size_t avail = limit_ - dst_;
|
|
|
|
if (avail == 0) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus s = UnmapCurrentRegion();
|
2015-10-14 08:14:53 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
s = MapNewRegion();
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2021-05-05 22:49:29 +00:00
|
|
|
TEST_KILL_RANDOM("PosixMmapFile::Append:0");
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
size_t n = (left <= avail) ? left : avail;
|
2017-06-06 19:50:56 +00:00
|
|
|
assert(dst_);
|
2015-10-14 08:14:53 +00:00
|
|
|
memcpy(dst_, src, n);
|
|
|
|
dst_ += n;
|
|
|
|
src += n;
|
|
|
|
left -= n;
|
|
|
|
}
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
// Unmaps the current region, trims the unused tail of the file, closes the
// descriptor and resets fd_/base_/limit_. The descriptor is closed even when
// an earlier step failed; the first error encountered is the one returned.
IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  // Bytes of the mapped region that were never written; must be computed
  // before the region is unmapped.
  const size_t slack = limit_ - dst_;

  IOStatus result = UnmapCurrentRegion();
  if (!result.ok()) {
    result = IOError("While closing mmapped file", filename_, errno);
  } else if (slack > 0) {
    // Trim the extra space at the end of the file
    if (ftruncate(fd_, file_offset_ - slack) < 0) {
      result = IOError("While ftruncating mmaped file", filename_, errno);
    }
  }

  // A close() failure is only reported if nothing failed before it.
  const int close_rc = close(fd_);
  if (close_rc < 0 && result.ok()) {
    result = IOError("While closing mmapped file", filename_, errno);
  }

  fd_ = -1;
  base_ = nullptr;
  limit_ = nullptr;
  return result;
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
// No-op: performs no work and always reports success. Both arguments are
// ignored.
IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}
|
2015-10-14 08:14:53 +00:00
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
// Syncs the file descriptor and then the mapped region.
//
// With HAVE_FULLFSYNC defined the descriptor is synced via
// fcntl(F_FULLFSYNC); otherwise fdatasync() is used. If that call fails an
// IOError carrying errno is returned and Msync() is not attempted. On
// success the result of Msync() is returned.
IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/,
                             IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
  if (::fcntl(fd_, F_FULLFSYNC) < 0) {
    // The message names the fcntl command that was actually issued.
    return IOError("while fcntl(F_FULLFSYNC) mmapped file", filename_, errno);
  }
#else   // HAVE_FULLFSYNC
  if (fdatasync(fd_) < 0) {
    return IOError("While fdatasync mmapped file", filename_, errno);
  }
#endif  // HAVE_FULLFSYNC

  return Msync();
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Flush data as well as metadata to stable storage.
|
|
|
|
*/
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2022-01-19 04:21:37 +00:00
|
|
|
#ifdef HAVE_FULLFSYNC
|
|
|
|
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
|
|
|
|
return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno);
|
|
|
|
}
|
|
|
|
#else // HAVE_FULLFSYNC
|
2015-10-14 08:14:53 +00:00
|
|
|
if (fsync(fd_) < 0) {
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While fsync mmaped file", filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2022-01-19 04:21:37 +00:00
|
|
|
#endif // HAVE_FULLFSYNC
|
2015-10-14 08:14:53 +00:00
|
|
|
|
|
|
|
return Msync();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Get the size of valid data in the file. This will not match the
|
|
|
|
* size that is returned from the filesystem because we use mmap
|
|
|
|
* to extend file by map_size every time.
|
|
|
|
*/
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2015-10-14 08:14:53 +00:00
|
|
|
size_t used = dst_ - base_;
|
|
|
|
return file_offset_ + used;
|
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
|
2015-10-14 08:14:53 +00:00
|
|
|
#ifndef OS_LINUX
|
2018-04-13 00:55:14 +00:00
|
|
|
(void)offset;
|
|
|
|
(void)length;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
#else
|
|
|
|
// free OS pages
|
|
|
|
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
|
|
|
if (ret == 0) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While fadvise NotNeeded mmapped file", filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2015-10-27 21:27:48 +00:00
|
|
|
#ifdef ROCKSDB_FALLOCATE_PRESENT
// Asks the filesystem to reserve `len` bytes for this file starting at
// `offset`, via fallocate().
//
// When allow_fallocate_ is false no system call is made and OK is returned.
// FALLOC_FL_KEEP_SIZE is passed only when fallocate_with_keep_size_ is set.
// The IOOptions and IODebugContext arguments are unused.
//
// Returns IOStatus::OK() on success, otherwise an IOError carrying errno and
// a message that records the requested offset and length.
IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
                                 const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
  // Both values are narrowed to off_t for fallocate(), so they must fit.
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixMmapFile::Allocate:0");

  if (!allow_fallocate_) {
    return IOStatus::OK();
  }

  const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
  const int rc = fallocate(fd_, mode, static_cast<off_t>(offset),
                           static_cast<off_t>(len));
  if (rc != 0) {
    return IOError("While fallocate offset " + std::to_string(offset) +
                       " len " + std::to_string(len),
                   filename_, errno);
  }
  return IOStatus::OK();
}
#endif
|
2015-10-14 08:14:53 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* PosixWritableFile
|
|
|
|
*
|
|
|
|
* Use posix write to write data to a file.
|
|
|
|
*/
|
|
|
|
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
|
2020-03-12 01:36:43 +00:00
|
|
|
size_t logical_block_size,
|
2015-10-14 08:14:53 +00:00
|
|
|
const EnvOptions& options)
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
: FSWritableFile(options),
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
filename_(fname),
|
2017-01-13 20:01:08 +00:00
|
|
|
use_direct_io_(options.use_direct_writes),
|
2016-12-22 20:51:29 +00:00
|
|
|
fd_(fd),
|
2017-02-23 19:17:49 +00:00
|
|
|
filesize_(0),
|
2020-03-12 01:36:43 +00:00
|
|
|
logical_sector_size_(logical_block_size) {
|
2015-10-14 08:14:53 +00:00
|
|
|
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
|
|
|
allow_fallocate_ = options.allow_fallocate;
|
|
|
|
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
|
|
|
|
#endif
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
#ifdef ROCKSDB_RANGESYNC_PRESENT
|
|
|
|
sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
|
|
|
|
#endif // ROCKSDB_RANGESYNC_PRESENT
|
2015-10-14 08:14:53 +00:00
|
|
|
assert(!options.use_mmap_writes);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Closes the file if the descriptor is still non-negative. The status
// returned by Close() is explicitly marked as intentionally unchecked,
// since a destructor has no way to report it.
PosixWritableFile::~PosixWritableFile() {
  if (fd_ < 0) {
    return;
  }
  IOStatus close_status = PosixWritableFile::Close(IOOptions(), nullptr);
  close_status.PermitUncheckedError();
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2017-05-04 08:21:31 +00:00
|
|
|
if (use_direct_io()) {
|
|
|
|
assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
|
|
|
|
assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
|
|
|
|
}
|
2015-10-14 08:14:53 +00:00
|
|
|
const char* src = data.data();
|
2019-05-15 21:16:36 +00:00
|
|
|
size_t nbytes = data.size();
|
|
|
|
|
|
|
|
if (!PosixWrite(fd_, src, nbytes)) {
|
|
|
|
return IOError("While appending to file", filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2019-05-15 21:16:36 +00:00
|
|
|
|
|
|
|
filesize_ += nbytes;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
|
|
|
|
const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2017-05-04 08:21:31 +00:00
|
|
|
if (use_direct_io()) {
|
|
|
|
assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
|
|
|
|
assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
|
|
|
|
assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
|
|
|
|
}
|
2019-07-30 21:09:02 +00:00
|
|
|
assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
|
2016-11-19 01:06:37 +00:00
|
|
|
const char* src = data.data();
|
2019-05-15 21:16:36 +00:00
|
|
|
size_t nbytes = data.size();
|
|
|
|
if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
|
2022-05-06 20:03:58 +00:00
|
|
|
return IOError("While pwrite to file at offset " + std::to_string(offset),
|
2019-05-15 21:16:36 +00:00
|
|
|
filename_, errno);
|
2016-11-19 01:06:37 +00:00
|
|
|
}
|
2019-05-15 21:16:36 +00:00
|
|
|
filesize_ = offset + nbytes;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2016-11-19 01:06:37 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
// Sets the length of the file to `size` bytes using ftruncate() and, on
// success, records `size` as the tracked file size. The IOOptions and
// IODebugContext arguments are unused.
//
// Returns a default-constructed IOStatus on success, or an IOError carrying
// errno and the requested size if ftruncate() fails.
IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
                                     IODebugContext* /*dbg*/) {
  if (ftruncate(fd_, size) < 0) {
    return IOError("While ftruncate file to size " + std::to_string(size),
                   filename_, errno);
  }
  filesize_ = size;
  return IOStatus();
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
// Closes the file descriptor, first making a best-effort attempt to give
// back space that was preallocated past the logical end of the file.
//
// Steps:
//   1. Query the preallocation state via GetPreallocationStatus(). The
//      "PosixWritableFile::Close" sync point is handed a pointer to
//      last_allocated_block.
//   2. If any block was preallocated, ftruncate() the file to filesize_.
//      Where fallocate() with FALLOC_FL_PUNCH_HOLE is available, and the
//      on-disk block count still disagrees with the file size, punch a hole
//      over the tail. Failures in this step are deliberately ignored.
//   3. close() the descriptor. Only a failure here is surfaced, as an
//      IOError built from errno. fd_ is set to -1 regardless of the outcome.
//
// The IOOptions / IODebugContext arguments are unused.
IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  IOStatus s;

  size_t block_size;
  size_t last_allocated_block;
  GetPreallocationStatus(&block_size, &last_allocated_block);
  TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block);
  if (last_allocated_block > 0) {
    // trim the extra space preallocated at the end of the file
    // NOTE(ljin): we probably don't want to surface failure as an IOError,
    // but it will be nice to log these errors.
    int dummy __attribute__((__unused__));
    dummy = ftruncate(fd_, filesize_);
#if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE)
    // in some file systems, ftruncate only trims trailing space if the
    // new file size is smaller than the current size. Calling fallocate
    // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
    // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
    // filesystems:
    //   XFS (since Linux 2.6.38)
    //   ext4 (since Linux 3.0)
    //   Btrfs (since Linux 3.7)
    //   tmpfs (since Linux 3.5)
    // We ignore error since failure of this operation does not affect
    // correctness.
    struct stat file_stats;
    int result = fstat(fd_, &file_stats);
    // After ftruncate, we check whether ftruncate has the correct behavior.
    // If not, we should hack it with FALLOC_FL_PUNCH_HOLE.
    //
    // st_blksize comes from the filesystem. Both divisions below divide by a
    // value derived from it (st_blksize and st_blksize / 512), so a reported
    // block size below 512 (including 0) would be an integer division by
    // zero. In that case skip the best-effort hole punch instead of crashing.
    if (result == 0 && file_stats.st_blksize >= 512 &&
        (file_stats.st_size + file_stats.st_blksize - 1) /
                file_stats.st_blksize !=
            file_stats.st_blocks / (file_stats.st_blksize / 512)) {
      IOSTATS_TIMER_GUARD(allocate_nanos);
      if (allow_fallocate_) {
        fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
                  block_size * last_allocated_block - filesize_);
      }
    }
#endif
  }

  if (close(fd_) < 0) {
    s = IOError("While closing file after writing", filename_, errno);
  }
  // Mark the descriptor as no longer owned, whether or not close() succeeded.
  fd_ = -1;
  return s;
}
|
|
|
|
|
|
|
|
// write out the cached data to the OS cache
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
// Flush hook for this writable file. This implementation performs no work
// and unconditionally reports success; both arguments are unused.
IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}
|
2015-10-14 08:14:53 +00:00
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2022-01-19 04:21:37 +00:00
|
|
|
#ifdef HAVE_FULLFSYNC
|
|
|
|
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
|
|
|
|
return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
|
|
|
|
}
|
|
|
|
#else // HAVE_FULLFSYNC
|
2015-10-14 08:14:53 +00:00
|
|
|
if (fdatasync(fd_) < 0) {
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While fdatasync", filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2022-01-19 04:21:37 +00:00
|
|
|
#endif // HAVE_FULLFSYNC
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2022-01-19 04:21:37 +00:00
|
|
|
#ifdef HAVE_FULLFSYNC
|
|
|
|
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
|
|
|
|
return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
|
|
|
|
}
|
|
|
|
#else // HAVE_FULLFSYNC
|
2015-10-14 08:14:53 +00:00
|
|
|
if (fsync(fd_) < 0) {
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While fsync", filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2022-01-19 04:21:37 +00:00
|
|
|
#endif // HAVE_FULLFSYNC
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Always returns true for this file type.
// NOTE(review): presumably tells callers that Sync()/Fsync() may be invoked
// concurrently with other operations on this file -- confirm against the
// interface declaration, which is not visible here.
bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
|
|
|
return filesize_;
|
|
|
|
}
|
2015-10-14 08:14:53 +00:00
|
|
|
|
2017-11-10 17:25:26 +00:00
|
|
|
// Passes a write-lifetime hint for this file to the kernel.
//
// On Linux (outside Valgrind runs) the hint is applied with
// fcntl(F_SET_RW_HINT) and is remembered in write_hint_ only when that call
// succeeds. A hint equal to the remembered one is skipped without a system
// call. A failed fcntl() is ignored and write_hint_ is left unchanged.
// On other platforms, and under ROCKSDB_VALGRIND_RUN, this is a no-op.
void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
#ifdef OS_LINUX
// Suppress Valgrind "Unimplemented functionality" error.
#ifndef ROCKSDB_VALGRIND_RUN
  if (hint != write_hint_) {
    // F_SET_RW_HINT takes a pointer to a 64-bit value.
    uint64_t hint_value = hint;
    if (fcntl(fd_, F_SET_RW_HINT, &hint_value) == 0) {
      write_hint_ = hint;
    }
  }
#else
  (void)hint;
#endif  // ROCKSDB_VALGRIND_RUN
#else
  (void)hint;
#endif  // OS_LINUX
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it's retryable or not, scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
|
2017-01-13 20:01:08 +00:00
|
|
|
if (use_direct_io()) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2016-12-22 20:51:29 +00:00
|
|
|
}
|
2015-10-14 08:14:53 +00:00
|
|
|
#ifndef OS_LINUX
|
2018-04-13 00:55:14 +00:00
|
|
|
(void)offset;
|
|
|
|
(void)length;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
#else
|
|
|
|
// free OS pages
|
|
|
|
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
|
|
|
|
if (ret == 0) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While fadvise NotNeeded", filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2015-10-27 21:27:48 +00:00
|
|
|
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it is retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len,
|
|
|
|
const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2019-07-30 21:09:02 +00:00
|
|
|
assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
|
|
|
|
assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
|
2021-05-05 22:49:29 +00:00
|
|
|
TEST_KILL_RANDOM("PosixWritableFile::Allocate:0");
|
2015-10-14 08:14:53 +00:00
|
|
|
IOSTATS_TIMER_GUARD(allocate_nanos);
|
|
|
|
int alloc_status = 0;
|
|
|
|
if (allow_fallocate_) {
|
2019-05-15 21:16:36 +00:00
|
|
|
alloc_status =
|
|
|
|
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
|
|
|
|
static_cast<off_t>(offset), static_cast<off_t>(len));
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
if (alloc_status == 0) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
} else {
|
2022-05-06 20:03:58 +00:00
|
|
|
return IOError("While fallocate offset " + std::to_string(offset) +
|
|
|
|
" len " + std::to_string(len),
|
|
|
|
filename_, errno);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
}
|
2017-02-01 18:19:47 +00:00
|
|
|
#endif
|
2015-10-14 08:14:53 +00:00
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it is retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
|
|
|
|
const IOOptions& opts,
|
|
|
|
IODebugContext* dbg) {
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
#ifdef ROCKSDB_RANGESYNC_PRESENT
|
2019-07-30 21:09:02 +00:00
|
|
|
assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
|
|
|
|
assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
if (sync_file_range_supported_) {
|
|
|
|
int ret;
|
|
|
|
if (strict_bytes_per_sync_) {
|
|
|
|
// Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
|
|
|
|
// that spans all bytes written so far tells `sync_file_range` to wait for
|
|
|
|
// any outstanding writeback requests to finish before issuing a new one.
|
|
|
|
ret =
|
|
|
|
sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes),
|
|
|
|
SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
|
|
|
|
} else {
|
|
|
|
ret = sync_file_range(fd_, static_cast<off_t>(offset),
|
|
|
|
static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE);
|
|
|
|
}
|
|
|
|
if (ret != 0) {
|
2022-05-06 20:03:58 +00:00
|
|
|
return IOError("While sync_file_range returned " + std::to_string(ret),
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
filename_, errno);
|
|
|
|
}
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
#endif // ROCKSDB_RANGESYNC_PRESENT
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return FSWritableFile::RangeSync(offset, nbytes, opts, dbg);
|
2015-10-14 08:14:53 +00:00
|
|
|
}
|
|
|
|
|
2017-04-22 03:41:37 +00:00
|
|
|
#ifdef OS_LINUX
|
2015-10-14 08:14:53 +00:00
|
|
|
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
|
2016-04-21 17:37:27 +00:00
|
|
|
return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
|
2015-10-27 21:27:48 +00:00
|
|
|
}
|
2017-02-02 18:32:40 +00:00
|
|
|
#endif
|
2015-10-27 19:15:55 +00:00
|
|
|
|
2016-09-13 19:08:22 +00:00
|
|
|
/*
|
|
|
|
* PosixRandomRWFile
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Wraps the descriptor `fd` for random read/write access. `fname` is stored
// in `filename_` (the other methods use it in error messages). The EnvOptions
// argument is accepted but not used.
PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
                                     const EnvOptions& /*options*/)
    : filename_(fname),
      fd_(fd) {
}
|
|
|
|
|
|
|
|
// Closes the file on destruction if `fd_` is still non-negative.
// NOTE(review): presumably Close() resets `fd_` to a negative value so an
// explicitly closed file is not closed twice -- confirm against Close().
PosixRandomRWFile::~PosixRandomRWFile() {
  if (fd_ < 0) {
    return;
  }
  // A destructor has no way to report failure, so the status returned by
  // Close() is explicitly marked as intentionally unchecked.
  Close(IOOptions(), nullptr).PermitUncheckedError();
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it is retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data,
|
|
|
|
const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2016-09-13 19:08:22 +00:00
|
|
|
const char* src = data.data();
|
2019-05-15 21:16:36 +00:00
|
|
|
size_t nbytes = data.size();
|
|
|
|
if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
|
2022-05-06 20:03:58 +00:00
|
|
|
return IOError("While write random read/write file at offset " +
|
|
|
|
std::to_string(offset),
|
|
|
|
filename_, errno);
|
2016-09-13 19:08:22 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2016-09-13 19:08:22 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether it is retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n,
|
|
|
|
const IOOptions& /*opts*/, Slice* result,
|
|
|
|
char* scratch, IODebugContext* /*dbg*/) const {
|
2016-09-13 19:08:22 +00:00
|
|
|
size_t left = n;
|
|
|
|
char* ptr = scratch;
|
|
|
|
while (left > 0) {
|
|
|
|
ssize_t done = pread(fd_, ptr, left, offset);
|
|
|
|
if (done < 0) {
|
|
|
|
// error while reading from file
|
|
|
|
if (errno == EINTR) {
|
|
|
|
// read was interrupted, try again.
|
|
|
|
continue;
|
|
|
|
}
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While reading random read/write file offset " +
|
2022-05-06 20:03:58 +00:00
|
|
|
std::to_string(offset) + " len " + std::to_string(n),
|
2017-06-26 19:42:21 +00:00
|
|
|
filename_, errno);
|
2016-09-13 19:08:22 +00:00
|
|
|
} else if (done == 0) {
|
|
|
|
// Nothing more to read
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Read `done` bytes
|
|
|
|
ptr += done;
|
|
|
|
offset += done;
|
|
|
|
left -= done;
|
|
|
|
}
|
|
|
|
|
|
|
|
*result = Slice(scratch, n - left);
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2016-09-13 19:08:22 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
// Flush is a no-op for this file type: it performs no system call and
// unconditionally reports success. Both arguments are ignored.
IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}
|
2016-09-13 19:08:22 +00:00
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2022-01-19 04:21:37 +00:00
|
|
|
#ifdef HAVE_FULLFSYNC
|
|
|
|
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
|
|
|
|
return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno);
|
|
|
|
}
|
|
|
|
#else // HAVE_FULLFSYNC
|
2016-09-13 19:08:22 +00:00
|
|
|
if (fdatasync(fd_) < 0) {
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While fdatasync random read/write file", filename_, errno);
|
2016-09-13 19:08:22 +00:00
|
|
|
}
|
2022-01-19 04:21:37 +00:00
|
|
|
#endif // HAVE_FULLFSYNC
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2016-09-13 19:08:22 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2022-01-19 04:21:37 +00:00
|
|
|
#ifdef HAVE_FULLFSYNC
|
|
|
|
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
|
|
|
|
return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno);
|
|
|
|
}
|
|
|
|
#else // HAVE_FULLFSYNC
|
2016-09-13 19:08:22 +00:00
|
|
|
if (fsync(fd_) < 0) {
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While fsync random read/write file", filename_, errno);
|
2016-09-13 19:08:22 +00:00
|
|
|
}
|
2022-01-19 04:21:37 +00:00
|
|
|
#endif // HAVE_FULLFSYNC
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2016-09-13 19:08:22 +00:00
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/,
|
|
|
|
IODebugContext* /*dbg*/) {
|
2016-09-13 19:08:22 +00:00
|
|
|
if (close(fd_) < 0) {
|
2017-06-26 19:42:21 +00:00
|
|
|
return IOError("While close random read/write file", filename_, errno);
|
2016-09-13 19:08:22 +00:00
|
|
|
}
|
|
|
|
fd_ = -1;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
return IOStatus::OK();
|
2016-09-13 19:08:22 +00:00
|
|
|
}
|
|
|
|
|
2018-04-30 19:23:45 +00:00
|
|
|
// Unmaps the `length_` bytes starting at `base_`.
PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
  // TODO: munmap() can fail, but its result is ignored here; a destructor
  // has no good way to report the error.
  static_cast<void>(munmap(this->base_, this->length_));
}
|
|
|
|
|
2016-04-21 17:37:27 +00:00
|
|
|
/*
|
|
|
|
* PosixDirectory
|
|
|
|
*/
|
2021-11-03 19:20:19 +00:00
|
|
|
#if !defined(BTRFS_SUPER_MAGIC)
|
|
|
|
// The magic number for BTRFS is fixed; if it's not defined, define it here
|
|
|
|
#define BTRFS_SUPER_MAGIC 0x9123683E
|
|
|
|
#endif
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
// Wraps an already-open directory descriptor `fd` for `directory_name`.
// On Linux the constructor also queries the file system type with fstatfs()
// and records whether the directory lives on btrfs; on other platforms, or
// if fstatfs() fails, `is_btrfs_` stays false.
PosixDirectory::PosixDirectory(int fd, const std::string& directory_name)
    : fd_(fd), directory_name_(directory_name) {
  is_btrfs_ = false;
#ifdef OS_LINUX
  struct statfs fs_info;
  if (fstatfs(fd, &fs_info) == 0) {
    // f_type's exact integer type varies, so cast the magic number to it
    // before comparing.
    is_btrfs_ = (fs_info.f_type ==
                 static_cast<decltype(fs_info.f_type)>(BTRFS_SUPER_MAGIC));
  }
#endif
}
|
2016-04-21 17:37:27 +00:00
|
|
|
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
// Closes the directory descriptor if it is still open (fd_ >= 0). Any error
// from Close() is intentionally dropped, since a destructor cannot report it.
PosixDirectory::~PosixDirectory() {
  if (fd_ < 0) {
    // Already closed; nothing to release.
    return;
  }
  IOStatus close_status = PosixDirectory::Close(IOOptions(), nullptr);
  close_status.PermitUncheckedError();
}
|
2015-10-27 19:15:55 +00:00
|
|
|
|
2021-11-03 19:20:19 +00:00
|
|
|
// Syncs the directory by forwarding to FsyncWithDirOptions() with
// default-constructed DirFsyncOptions.
IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) {
  const DirFsyncOptions default_options = DirFsyncOptions();
  return FsyncWithDirOptions(opts, dbg, default_options);
}
|
|
|
|
|
2022-06-03 04:52:35 +00:00
|
|
|
// Users who want the file entries synced in the Directory object must call a
|
|
|
|
// Fsync or FsyncWithDirOptions function before Close
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
// Closes the directory file descriptor. This does not sync the directory.
// Returns an IOError carrying errno if close() fails, OK otherwise.
//
// `fd_` is reset to -1 whether or not close() succeeds. The destructor calls
// Close() again whenever fd_ >= 0, so leaving the old number in place after
// a failed close() would lead to a second close() on it. On Linux (and most
// other systems) the descriptor number is released even when close() reports
// an error, so that second close() could hit an unrelated file opened in the
// meantime by another thread.
IOStatus PosixDirectory::Close(const IOOptions& /*opts*/,
                               IODebugContext* /*dbg*/) {
  const int rc = close(fd_);
  // Plain assignment; does not disturb errno set by close().
  fd_ = -1;
  if (rc < 0) {
    return IOError("While closing directory ", directory_name_, errno);
  }
  return IOStatus::OK();
}
|
|
|
|
|
2021-11-03 19:20:19 +00:00
|
|
|
IOStatus PosixDirectory::FsyncWithDirOptions(
|
|
|
|
const IOOptions& /*opts*/, IODebugContext* /*dbg*/,
|
|
|
|
const DirFsyncOptions& dir_fsync_options) {
|
Fix serious FSDirectory use-after-Close bug (missing fsync) (#10460)
Summary:
TL;DR: due to a recent change, if you drop a column family,
often that DB will no longer fsync after writing new SST files
to remaining or new column families, which could lead to data
loss on power loss.
More bug detail:
The intent of https://github.com/facebook/rocksdb/issues/10049 was to Close FSDirectory objects at
DB::Close time rather than waiting for DB object destruction.
Unfortunately, it also closes shared FSDirectory objects on
DropColumnFamily (& destroy remaining handles), which can lead
to use-after-Close on FSDirectory shared with remaining column
families. Those "uses" are only Fsyncs (or redundant Closes). In
the default Posix filesystem, an Fsync on a closed FSDirectory is a
quiet no-op. Consequently (under most configurations), if you drop
a column family, that DB will no longer fsync after writing new SST
files to column families sharing the same directory (true under most
configurations).
More fix detail:
Basically, this removes unnecessary Close ops on destroying
ColumnFamilyData. We let `shared_ptr` take care of calling the
destructor at the right time. If the intent was to require Close be
called before destroying FSDirectory, that was not made clear by the
author of FileSystem and was not at all enforced by https://github.com/facebook/rocksdb/issues/10049, which
could have added `assert(fd_ == -1)` to `~PosixDirectory()` but did
not. To keep this fix simple, we relax the unit test for https://github.com/facebook/rocksdb/issues/10049 to allow
timely destruction of FSDirectory to suffice as Close (in
CountedFileSystem). Added a TODO to revisit that.
Also in this PR:
* Added a TODO to share FSDirectory instances between DB and its column
families. (Already shared among column families.)
* Made DB::Close attempt to close all its open FSDirectory objects even
if there is a failure in closing one. Also code clean-up around this
logic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10460
Test Plan:
add an assert to check for use-after-Close. With that
existing tests can detect the misuse. With fix, tests pass (except noted
relaxing of unit test for https://github.com/facebook/rocksdb/issues/10049)
Reviewed By: ajkr
Differential Revision: D38357922
Pulled By: pdillinger
fbshipit-source-id: d42079cadbedf0a969f03389bf586b3b4e1f9137
2022-08-02 17:54:32 +00:00
|
|
|
assert(fd_ >= 0); // Check use after close
|
2021-11-03 19:20:19 +00:00
|
|
|
IOStatus s = IOStatus::OK();
|
2017-04-22 03:41:37 +00:00
|
|
|
#ifndef OS_AIX
|
2021-11-03 19:20:19 +00:00
|
|
|
if (is_btrfs_) {
|
|
|
|
// skip dir fsync for new file creation, which is not needed for btrfs
|
|
|
|
if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
// skip dir fsync for renaming file, only need to sync new file
|
|
|
|
if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) {
|
|
|
|
std::string new_name = dir_fsync_options.renamed_new_name;
|
|
|
|
assert(!new_name.empty());
|
|
|
|
int fd;
|
|
|
|
do {
|
|
|
|
IOSTATS_TIMER_GUARD(open_nanos);
|
|
|
|
fd = open(new_name.c_str(), O_RDONLY);
|
|
|
|
} while (fd < 0 && errno == EINTR);
|
|
|
|
if (fd < 0) {
|
|
|
|
s = IOError("While open renaming file", new_name, errno);
|
|
|
|
} else if (fsync(fd) < 0) {
|
|
|
|
s = IOError("While fsync renaming file", new_name, errno);
|
|
|
|
}
|
|
|
|
if (close(fd) < 0) {
|
|
|
|
s = IOError("While closing file after fsync", new_name, errno);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
// fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted
|
|
|
|
}
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
|
|
|
|
// skip fsync/fcntl when fd_ == -1 since this file descriptor has been closed
|
|
|
|
// in either the de-construction or the close function, data must have been
|
|
|
|
// fsync-ed before de-construction and close is called
|
2022-01-19 04:21:37 +00:00
|
|
|
#ifdef HAVE_FULLFSYNC
|
|
|
|
// btrfs is a Linux file system, while currently F_FULLFSYNC is available on
|
|
|
|
// Mac OS.
|
|
|
|
assert(!is_btrfs_);
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
if (fd_ != -1 && ::fcntl(fd_, F_FULLFSYNC) < 0) {
|
2022-01-19 04:21:37 +00:00
|
|
|
return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno);
|
|
|
|
}
|
|
|
|
#else // HAVE_FULLFSYNC
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
if (fd_ != -1 && fsync(fd_) == -1) {
|
2021-11-03 19:20:19 +00:00
|
|
|
s = IOError("While fsync", "a directory", errno);
|
2015-10-27 19:15:55 +00:00
|
|
|
}
|
2022-01-19 04:21:37 +00:00
|
|
|
#endif // HAVE_FULLFSYNC
|
|
|
|
#endif // OS_AIX
|
2021-11-03 19:20:19 +00:00
|
|
|
return s;
|
2015-10-27 19:15:55 +00:00
|
|
|
}
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2015-10-14 08:14:53 +00:00
|
|
|
#endif
|