mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-30 04:41:49 +00:00
54cb9c77d9
Summary: The following are risks associated with pointer-to-pointer reinterpret_cast: * Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do. * Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally. I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement: * Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have `struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic. * Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance. With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain. A couple of related interventions included here: * Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle. * Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse). Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work. I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308 Test Plan: existing tests, CI Reviewed By: ltamasi Differential Revision: D53204947 Pulled By: pdillinger fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
1102 lines
35 KiB
C++
1102 lines
35 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#if defined(OS_WIN)
|
|
|
|
#include "port/win/io_win.h"
|
|
|
|
#include "env_win.h"
|
|
#include "monitoring/iostats_context_imp.h"
|
|
#include "test_util/sync_point.h"
|
|
#include "util/aligned_buffer.h"
|
|
#include "util/coding.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
namespace port {
|
|
|
|
/*
|
|
* DirectIOHelper
|
|
*/
|
|
namespace {
|
|
|
|
const size_t kSectorSize = 512;
|
|
|
|
inline bool IsPowerOfTwo(const size_t alignment) {
|
|
return ((alignment) & (alignment - 1)) == 0;
|
|
}
|
|
|
|
inline bool IsAligned(size_t alignment, const void* ptr) {
|
|
return ((uintptr_t(ptr)) & (alignment - 1)) == 0;
|
|
}
|
|
} // namespace
|
|
|
|
std::string GetWindowsErrSz(DWORD err) {
|
|
std::string Err;
|
|
LPSTR lpMsgBuf = nullptr;
|
|
FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
|
|
FORMAT_MESSAGE_IGNORE_INSERTS,
|
|
NULL, err,
|
|
0, // Default language
|
|
reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
|
|
|
|
if (lpMsgBuf) {
|
|
Err = lpMsgBuf;
|
|
LocalFree(lpMsgBuf);
|
|
}
|
|
return Err;
|
|
}
|
|
|
|
// We preserve the original name of this interface to denote the original idea
|
|
// behind it.
|
|
// All reads happen by a specified offset and pwrite interface does not change
|
|
// the position of the file pointer. Judging from the man page and errno it does
|
|
// execute
|
|
// lseek atomically to return the position of the file back where it was.
|
|
// WriteFile() does not
|
|
// have this capability. Therefore, for both pread and pwrite the pointer is
|
|
// advanced to the next position
|
|
// which is fine for writes because they are (should be) sequential.
|
|
// Because all the reads/writes happen by the specified offset, the caller in
|
|
// theory should not
|
|
// rely on the current file offset.
|
|
IOStatus pwrite(const WinFileData* file_data, const Slice& data,
|
|
uint64_t offset, size_t& bytes_written) {
|
|
IOStatus s;
|
|
bytes_written = 0;
|
|
|
|
size_t num_bytes = data.size();
|
|
if (num_bytes > std::numeric_limits<DWORD>::max()) {
|
|
// May happen in 64-bit builds where size_t is 64-bits but
|
|
// long is still 32-bit, but that's the API here at the moment
|
|
return IOStatus::InvalidArgument(
|
|
"num_bytes is too large for a single write: " + file_data->GetName());
|
|
}
|
|
|
|
OVERLAPPED overlapped = {0};
|
|
ULARGE_INTEGER offsetUnion;
|
|
offsetUnion.QuadPart = offset;
|
|
|
|
overlapped.Offset = offsetUnion.LowPart;
|
|
overlapped.OffsetHigh = offsetUnion.HighPart;
|
|
|
|
DWORD bytesWritten = 0;
|
|
|
|
if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(),
|
|
static_cast<DWORD>(num_bytes), &bytesWritten,
|
|
&overlapped)) {
|
|
auto lastError = GetLastError();
|
|
s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
|
|
lastError);
|
|
} else {
|
|
bytes_written = bytesWritten;
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
// See comments for pwrite above
|
|
IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
|
|
uint64_t offset, size_t& bytes_read) {
|
|
IOStatus s;
|
|
bytes_read = 0;
|
|
|
|
if (num_bytes > std::numeric_limits<DWORD>::max()) {
|
|
return IOStatus::InvalidArgument(
|
|
"num_bytes is too large for a single read: " + file_data->GetName());
|
|
}
|
|
|
|
OVERLAPPED overlapped = {0};
|
|
ULARGE_INTEGER offsetUnion;
|
|
offsetUnion.QuadPart = offset;
|
|
|
|
overlapped.Offset = offsetUnion.LowPart;
|
|
overlapped.OffsetHigh = offsetUnion.HighPart;
|
|
|
|
DWORD bytesRead = 0;
|
|
|
|
if (FALSE == ReadFile(file_data->GetFileHandle(), src,
|
|
static_cast<DWORD>(num_bytes), &bytesRead,
|
|
&overlapped)) {
|
|
auto lastError = GetLastError();
|
|
// EOF is OK with zero bytes read
|
|
if (lastError != ERROR_HANDLE_EOF) {
|
|
s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
|
|
lastError);
|
|
}
|
|
} else {
|
|
bytes_read = bytesRead;
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
// SetFileInformationByHandle() is capable of fast pre-allocates.
|
|
// However, this does not change the file end position unless the file is
|
|
// truncated and the pre-allocated space is not considered filled with zeros.
|
|
IOStatus fallocate(const std::string& filename, HANDLE hFile,
|
|
uint64_t to_size) {
|
|
IOStatus status;
|
|
|
|
FILE_ALLOCATION_INFO alloc_info;
|
|
alloc_info.AllocationSize.QuadPart = to_size;
|
|
|
|
if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
|
|
sizeof(FILE_ALLOCATION_INFO))) {
|
|
auto lastError = GetLastError();
|
|
status = IOErrorFromWindowsError(
|
|
"Failed to pre-allocate space: " + filename, lastError);
|
|
}
|
|
|
|
return status;
|
|
}
|
|
|
|
IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) {
|
|
IOStatus status;
|
|
|
|
FILE_END_OF_FILE_INFO end_of_file;
|
|
end_of_file.EndOfFile.QuadPart = toSize;
|
|
|
|
if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
|
|
sizeof(FILE_END_OF_FILE_INFO))) {
|
|
auto lastError = GetLastError();
|
|
status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
|
|
lastError);
|
|
}
|
|
|
|
return status;
|
|
}
|
|
|
|
size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
|
|
size_t /*max_size*/) {
|
|
// Returning 0 is safe as it causes the table reader to generate a unique ID.
|
|
// This is suboptimal for performance as it prevents multiple table readers
|
|
// for the same file from sharing cached blocks. For example, if users have
|
|
// a low value for `max_open_files`, there can be many table readers opened
|
|
// for the same file.
|
|
//
|
|
// TODO: this is a temporarily solution as it is safe but not optimal for
|
|
// performance. For more details see discussion in
|
|
// https://github.com/facebook/rocksdb/pull/5844.
|
|
return 0;
|
|
}
|
|
|
|
WinFileData::WinFileData(const std::string& filename, HANDLE hFile,
|
|
bool direct_io)
|
|
: filename_(filename),
|
|
hFile_(hFile),
|
|
use_direct_io_(direct_io),
|
|
sector_size_(WinFileSystem::GetSectorSize(filename)) {}
|
|
|
|
bool WinFileData::IsSectorAligned(const size_t off) const {
|
|
return (off & (sector_size_ - 1)) == 0;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// WinMmapReadableFile
|
|
|
|
WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
|
|
HANDLE hFile, HANDLE hMap,
|
|
const void* mapped_region,
|
|
size_t length)
|
|
: WinFileData(fileName, hFile, false /* use_direct_io */),
|
|
hMap_(hMap),
|
|
mapped_region_(mapped_region),
|
|
length_(length) {}
|
|
|
|
WinMmapReadableFile::~WinMmapReadableFile() {
|
|
BOOL ret __attribute__((__unused__));
|
|
ret = ::UnmapViewOfFile(mapped_region_);
|
|
assert(ret);
|
|
|
|
ret = ::CloseHandle(hMap_);
|
|
assert(ret);
|
|
}
|
|
|
|
IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n,
|
|
const IOOptions& /*options*/, Slice* result,
|
|
char* scratch,
|
|
IODebugContext* /*dbg*/) const {
|
|
IOStatus s;
|
|
|
|
if (offset > length_) {
|
|
*result = Slice();
|
|
return IOError(filename_, EINVAL);
|
|
} else if (offset + n > length_) {
|
|
n = length_ - static_cast<size_t>(offset);
|
|
}
|
|
*result = Slice(static_cast<const char*>(mapped_region_) + offset, n);
|
|
return s;
|
|
}
|
|
|
|
IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
|
|
return GetUniqueIdFromFile(hFile_, id, max_size);
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
/// WinMmapFile
|
|
|
|
// Can only truncate or reserve to a sector size aligned if
|
|
// used on files that are opened with Unbuffered I/O
|
|
IOStatus WinMmapFile::TruncateFile(uint64_t toSize) {
|
|
return ftruncate(filename_, hFile_, toSize);
|
|
}
|
|
|
|
IOStatus WinMmapFile::UnmapCurrentRegion() {
|
|
IOStatus status;
|
|
|
|
if (mapped_begin_ != nullptr) {
|
|
if (!::UnmapViewOfFile(mapped_begin_)) {
|
|
status = IOErrorFromWindowsError(
|
|
"Failed to unmap file view: " + filename_, GetLastError());
|
|
}
|
|
|
|
// Move on to the next portion of the file
|
|
file_offset_ += view_size_;
|
|
|
|
// UnmapView automatically sends data to disk but not the metadata
|
|
// which is good and provides some equivalent of fdatasync() on Linux
|
|
// therefore, we donot need separate flag for metadata
|
|
mapped_begin_ = nullptr;
|
|
mapped_end_ = nullptr;
|
|
dst_ = nullptr;
|
|
|
|
last_sync_ = nullptr;
|
|
pending_sync_ = false;
|
|
}
|
|
|
|
return status;
|
|
}
|
|
|
|
IOStatus WinMmapFile::MapNewRegion(const IOOptions& options,
|
|
IODebugContext* dbg) {
|
|
IOStatus status;
|
|
|
|
assert(mapped_begin_ == nullptr);
|
|
|
|
size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
|
|
|
|
if (minDiskSize > reserved_size_) {
|
|
status = Allocate(file_offset_, view_size_, options, dbg);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
}
|
|
|
|
// Need to remap
|
|
if (hMap_ == NULL || reserved_size_ > mapping_size_) {
|
|
if (hMap_ != NULL) {
|
|
// Unmap the previous one
|
|
BOOL ret __attribute__((__unused__));
|
|
ret = ::CloseHandle(hMap_);
|
|
assert(ret);
|
|
hMap_ = NULL;
|
|
}
|
|
|
|
ULARGE_INTEGER mappingSize;
|
|
mappingSize.QuadPart = reserved_size_;
|
|
|
|
hMap_ = CreateFileMappingA(
|
|
hFile_,
|
|
NULL, // Security attributes
|
|
PAGE_READWRITE, // There is not a write only mode for mapping
|
|
mappingSize.HighPart, // Enable mapping the whole file but the actual
|
|
// amount mapped is determined by MapViewOfFile
|
|
mappingSize.LowPart,
|
|
NULL); // Mapping name
|
|
|
|
if (NULL == hMap_) {
|
|
return IOErrorFromWindowsError(
|
|
"WindowsMmapFile failed to create file mapping for: " + filename_,
|
|
GetLastError());
|
|
}
|
|
|
|
mapping_size_ = reserved_size_;
|
|
}
|
|
|
|
ULARGE_INTEGER offset;
|
|
offset.QuadPart = file_offset_;
|
|
|
|
// View must begin at the granularity aligned offset
|
|
mapped_begin_ =
|
|
static_cast<char*>(MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart,
|
|
offset.LowPart, view_size_, NULL));
|
|
|
|
if (!mapped_begin_) {
|
|
status = IOErrorFromWindowsError(
|
|
"WindowsMmapFile failed to map file view: " + filename_,
|
|
GetLastError());
|
|
} else {
|
|
mapped_end_ = mapped_begin_ + view_size_;
|
|
dst_ = mapped_begin_;
|
|
last_sync_ = mapped_begin_;
|
|
pending_sync_ = false;
|
|
}
|
|
return status;
|
|
}
|
|
|
|
IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
|
|
return fallocate(filename_, hFile_, spaceToReserve);
|
|
}
|
|
|
|
WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
|
|
size_t page_size, size_t allocation_granularity,
|
|
const FileOptions& options)
|
|
: WinFileData(fname, hFile, false),
|
|
FSWritableFile(options),
|
|
hMap_(NULL),
|
|
page_size_(page_size),
|
|
allocation_granularity_(allocation_granularity),
|
|
reserved_size_(0),
|
|
mapping_size_(0),
|
|
view_size_(0),
|
|
mapped_begin_(nullptr),
|
|
mapped_end_(nullptr),
|
|
dst_(nullptr),
|
|
last_sync_(nullptr),
|
|
file_offset_(0),
|
|
pending_sync_(false) {
|
|
// Allocation granularity must be obtained from GetSystemInfo() and must be
|
|
// a power of two.
|
|
assert(allocation_granularity > 0);
|
|
assert((allocation_granularity & (allocation_granularity - 1)) == 0);
|
|
|
|
assert(page_size > 0);
|
|
assert((page_size & (page_size - 1)) == 0);
|
|
|
|
// Only for memory mapped writes
|
|
assert(options.use_mmap_writes);
|
|
|
|
// View size must be both the multiple of allocation_granularity AND the
|
|
// page size and the granularity is usually a multiple of a page size.
|
|
const size_t viewSize =
|
|
32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
|
|
view_size_ = Roundup(viewSize, allocation_granularity_);
|
|
}
|
|
|
|
WinMmapFile::~WinMmapFile() {
|
|
if (hFile_) {
|
|
this->Close(IOOptions(), nullptr);
|
|
}
|
|
}
|
|
|
|
IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options,
|
|
IODebugContext* dbg) {
|
|
const char* src = data.data();
|
|
size_t left = data.size();
|
|
|
|
while (left > 0) {
|
|
assert(mapped_begin_ <= dst_);
|
|
size_t avail = mapped_end_ - dst_;
|
|
|
|
if (avail == 0) {
|
|
IOStatus s = UnmapCurrentRegion();
|
|
if (s.ok()) {
|
|
s = MapNewRegion(options, dbg);
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
} else {
|
|
size_t n = std::min(left, avail);
|
|
memcpy(dst_, src, n);
|
|
dst_ += n;
|
|
src += n;
|
|
left -= n;
|
|
pending_sync_ = true;
|
|
}
|
|
}
|
|
|
|
// Now make sure that the last partial page is padded with zeros if needed
|
|
size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
|
|
if (bytesToPad > 0) {
|
|
memset(dst_, 0, bytesToPad);
|
|
}
|
|
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
// Means Close() will properly take care of truncate
|
|
// and it does not need any additional information
|
|
IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) {
|
|
IOStatus s;
|
|
|
|
assert(NULL != hFile_);
|
|
|
|
// We truncate to the precise size so no
|
|
// uninitialized data at the end. SetEndOfFile
|
|
// which we use does not write zeros and it is good.
|
|
uint64_t targetSize = GetFileSize(options, dbg);
|
|
|
|
if (mapped_begin_ != nullptr) {
|
|
// Sync before unmapping to make sure everything
|
|
// is on disk and there is not a lazy writing
|
|
// so we are deterministic with the tests
|
|
Sync(options, dbg);
|
|
s = UnmapCurrentRegion();
|
|
}
|
|
|
|
if (NULL != hMap_) {
|
|
BOOL ret = ::CloseHandle(hMap_);
|
|
if (!ret && s.ok()) {
|
|
auto lastError = GetLastError();
|
|
s = IOErrorFromWindowsError(
|
|
"Failed to Close mapping for file: " + filename_, lastError);
|
|
}
|
|
|
|
hMap_ = NULL;
|
|
}
|
|
|
|
if (hFile_ != NULL) {
|
|
TruncateFile(targetSize);
|
|
|
|
BOOL ret = ::CloseHandle(hFile_);
|
|
hFile_ = NULL;
|
|
|
|
if (!ret && s.ok()) {
|
|
auto lastError = GetLastError();
|
|
s = IOErrorFromWindowsError(
|
|
"Failed to close file map handle: " + filename_, lastError);
|
|
}
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
IOStatus WinMmapFile::Flush(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
// Flush only data
|
|
IOStatus WinMmapFile::Sync(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
IOStatus s;
|
|
|
|
// Some writes occurred since last sync
|
|
if (dst_ > last_sync_) {
|
|
assert(mapped_begin_);
|
|
assert(dst_);
|
|
assert(dst_ > mapped_begin_);
|
|
assert(dst_ < mapped_end_);
|
|
|
|
size_t page_begin =
|
|
TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
|
|
size_t page_end =
|
|
TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
|
|
|
|
// Flush only the amount of that is a multiple of pages
|
|
if (!::FlushViewOfFile(mapped_begin_ + page_begin,
|
|
(page_end - page_begin) + page_size_)) {
|
|
s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
|
|
GetLastError());
|
|
} else {
|
|
last_sync_ = dst_;
|
|
}
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
/**
|
|
* Flush data as well as metadata to stable storage.
|
|
*/
|
|
IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
|
|
IOStatus s = Sync(options, dbg);
|
|
|
|
// Flush metadata
|
|
if (s.ok() && pending_sync_) {
|
|
if (!::FlushFileBuffers(hFile_)) {
|
|
s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
|
|
GetLastError());
|
|
}
|
|
pending_sync_ = false;
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
/**
|
|
* Get the size of valid data in the file. This will not match the
|
|
* size that is returned from the filesystem because we use mmap
|
|
* to extend file by map_size every time.
|
|
*/
|
|
uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
size_t used = dst_ - mapped_begin_;
|
|
return file_offset_ + used;
|
|
}
|
|
|
|
IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) {
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len,
|
|
const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
IOStatus status;
|
|
TEST_KILL_RANDOM("WinMmapFile::Allocate");
|
|
|
|
// Make sure that we reserve an aligned amount of space
|
|
// since the reservation block size is driven outside so we want
|
|
// to check if we are ok with reservation here
|
|
size_t spaceToReserve =
|
|
Roundup(static_cast<size_t>(offset + len), view_size_);
|
|
// Nothing to do
|
|
if (spaceToReserve <= reserved_size_) {
|
|
return status;
|
|
}
|
|
|
|
IOSTATS_TIMER_GUARD(allocate_nanos);
|
|
status = PreallocateInternal(spaceToReserve);
|
|
if (status.ok()) {
|
|
reserved_size_ = spaceToReserve;
|
|
}
|
|
return status;
|
|
}
|
|
|
|
size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
|
|
return GetUniqueIdFromFile(hFile_, id, max_size);
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////////////
|
|
// WinSequentialFile
|
|
|
|
WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
|
|
const FileOptions& options)
|
|
: WinFileData(fname, f, options.use_direct_reads) {}
|
|
|
|
WinSequentialFile::~WinSequentialFile() {
|
|
assert(hFile_ != INVALID_HANDLE_VALUE);
|
|
}
|
|
|
|
IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
|
|
Slice* result, char* scratch,
|
|
IODebugContext* /*dbg*/) {
|
|
IOStatus s;
|
|
size_t r = 0;
|
|
|
|
assert(result != nullptr);
|
|
if (WinFileData::use_direct_io()) {
|
|
return IOStatus::NotSupported("Read() does not support direct_io");
|
|
}
|
|
|
|
// Windows ReadFile API accepts a DWORD.
|
|
// While it is possible to read in a loop if n is too big
|
|
// it is an unlikely case.
|
|
if (n > std::numeric_limits<DWORD>::max()) {
|
|
return IOStatus::InvalidArgument("n is too big for a single ReadFile: " +
|
|
filename_);
|
|
}
|
|
|
|
DWORD bytesToRead =
|
|
static_cast<DWORD>(n); // cast is safe due to the check above
|
|
DWORD bytesRead = 0;
|
|
BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
|
|
if (ret != FALSE) {
|
|
r = bytesRead;
|
|
} else {
|
|
auto lastError = GetLastError();
|
|
if (lastError != ERROR_HANDLE_EOF) {
|
|
s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError);
|
|
}
|
|
}
|
|
|
|
*result = Slice(scratch, r);
|
|
return s;
|
|
}
|
|
|
|
IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
|
|
uint64_t offset,
|
|
size_t& bytes_read) const {
|
|
return pread(this, src, numBytes, offset, bytes_read);
|
|
}
|
|
|
|
IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n,
|
|
const IOOptions& /*opts*/,
|
|
Slice* result, char* scratch,
|
|
IODebugContext* /*dbg*/) {
|
|
if (!WinFileData::use_direct_io()) {
|
|
return IOStatus::NotSupported("This function is only used for direct_io");
|
|
}
|
|
|
|
assert(IsSectorAligned(static_cast<size_t>(offset)));
|
|
assert(IsSectorAligned(static_cast<size_t>(n)));
|
|
|
|
size_t bytes_read = 0; // out param
|
|
IOStatus s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset,
|
|
bytes_read);
|
|
*result = Slice(scratch, bytes_read);
|
|
return s;
|
|
}
|
|
|
|
IOStatus WinSequentialFile::Skip(uint64_t n) {
|
|
// Can't handle more than signed max as SetFilePointerEx accepts a signed
|
|
// 64-bit integer. As such it is a highly unlikley case to have n so large.
|
|
if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
|
|
return IOStatus::InvalidArgument(
|
|
"n is too large for a single SetFilePointerEx() call" + filename_);
|
|
}
|
|
|
|
LARGE_INTEGER li;
|
|
li.QuadPart = static_cast<LONGLONG>(n); // cast is safe due to the check
|
|
// above
|
|
BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
|
|
if (ret == FALSE) {
|
|
auto lastError = GetLastError();
|
|
return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
|
|
lastError);
|
|
}
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/// WinRandomAccessBase
|
|
|
|
inline IOStatus WinRandomAccessImpl::PositionedReadInternal(
|
|
char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const {
|
|
return pread(file_base_, src, numBytes, offset, bytes_read);
|
|
}
|
|
|
|
inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
|
|
size_t alignment,
|
|
const FileOptions& options)
|
|
: file_base_(file_base),
|
|
alignment_(std::max(alignment, file_base->GetSectorSize())) {
|
|
assert(!options.use_mmap_reads);
|
|
}
|
|
|
|
inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n,
|
|
Slice* result,
|
|
char* scratch) const {
|
|
// Check buffer alignment
|
|
if (file_base_->use_direct_io()) {
|
|
assert(file_base_->IsSectorAligned(static_cast<size_t>(offset)));
|
|
assert(IsAligned(alignment_, scratch));
|
|
}
|
|
|
|
if (n == 0) {
|
|
*result = Slice(scratch, 0);
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
size_t bytes_read = 0;
|
|
IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read);
|
|
*result = Slice(scratch, bytes_read);
|
|
return s;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/// WinRandomAccessFile
|
|
|
|
WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
|
|
size_t alignment,
|
|
const FileOptions& options)
|
|
: WinFileData(fname, hFile, options.use_direct_reads),
|
|
WinRandomAccessImpl(this, alignment, options) {}
|
|
|
|
WinRandomAccessFile::~WinRandomAccessFile() {}
|
|
|
|
IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n,
|
|
const IOOptions& /*options*/, Slice* result,
|
|
char* scratch,
|
|
IODebugContext* /*dbg*/) const {
|
|
return ReadImpl(offset, n, result, scratch);
|
|
}
|
|
|
|
IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
|
|
return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
|
|
}
|
|
|
|
size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
|
|
return GetAlignment();
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////
|
|
// WinWritableImpl
|
|
//
|
|
|
|
inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
|
|
return fallocate(file_data_->GetName(), file_data_->GetFileHandle(),
|
|
spaceToReserve);
|
|
}
|
|
|
|
inline WinWritableImpl::WinWritableImpl(WinFileData* file_data,
|
|
size_t alignment)
|
|
: file_data_(file_data),
|
|
alignment_(std::max(alignment, file_data->GetSectorSize())),
|
|
next_write_offset_(0),
|
|
reservedsize_(0) {
|
|
// Query current position in case ReopenWritableFile is called
|
|
// This position is only important for buffered writes
|
|
// for unbuffered writes we explicitely specify the position.
|
|
LARGE_INTEGER zero_move;
|
|
zero_move.QuadPart = 0; // Do not move
|
|
LARGE_INTEGER pos;
|
|
pos.QuadPart = 0;
|
|
BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
|
|
FILE_CURRENT);
|
|
// Querying no supped to fail
|
|
if (ret != 0) {
|
|
next_write_offset_ = pos.QuadPart;
|
|
} else {
|
|
assert(false);
|
|
}
|
|
}
|
|
|
|
inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) {
|
|
IOStatus s;
|
|
|
|
if (data.size() > std::numeric_limits<DWORD>::max()) {
|
|
return IOStatus::InvalidArgument("data is too long for a single write" +
|
|
file_data_->GetName());
|
|
}
|
|
|
|
size_t bytes_written = 0; // out param
|
|
|
|
if (file_data_->use_direct_io()) {
|
|
// With no offset specified we are appending
|
|
// to the end of the file
|
|
assert(file_data_->IsSectorAligned(next_write_offset_));
|
|
assert(file_data_->IsSectorAligned(data.size()));
|
|
assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
|
|
s = pwrite(file_data_, data, next_write_offset_, bytes_written);
|
|
} else {
|
|
DWORD bytesWritten = 0;
|
|
if (!WriteFile(file_data_->GetFileHandle(), data.data(),
|
|
static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
|
|
auto lastError = GetLastError();
|
|
s = IOErrorFromWindowsError(
|
|
"Failed to WriteFile: " + file_data_->GetName(), lastError);
|
|
} else {
|
|
bytes_written = bytesWritten;
|
|
}
|
|
}
|
|
|
|
if (s.ok()) {
|
|
if (bytes_written == data.size()) {
|
|
// This matters for direct_io cases where
|
|
// we rely on the fact that next_write_offset_
|
|
// is sector aligned
|
|
next_write_offset_ += bytes_written;
|
|
} else {
|
|
s = IOStatus::IOError("Failed to write all bytes: " +
|
|
file_data_->GetName());
|
|
}
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data,
|
|
uint64_t offset) {
|
|
if (file_data_->use_direct_io()) {
|
|
assert(file_data_->IsSectorAligned(static_cast<size_t>(offset)));
|
|
assert(file_data_->IsSectorAligned(data.size()));
|
|
assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
|
|
}
|
|
|
|
size_t bytes_written = 0;
|
|
IOStatus s = pwrite(file_data_, data, offset, bytes_written);
|
|
|
|
if (s.ok()) {
|
|
if (bytes_written == data.size()) {
|
|
// For sequential write this would be simple
|
|
// size extension by data.size()
|
|
uint64_t write_end = offset + bytes_written;
|
|
if (write_end >= next_write_offset_) {
|
|
next_write_offset_ = write_end;
|
|
}
|
|
} else {
|
|
s = IOStatus::IOError("Failed to write all of the requested data: " +
|
|
file_data_->GetName());
|
|
}
|
|
}
|
|
return s;
|
|
}
|
|
|
|
inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) {
|
|
// It is tempting to check for the size for sector alignment
|
|
// but truncation may come at the end and there is not a requirement
|
|
// for this to be sector aligned so long as we do not attempt to write
|
|
// after that. The interface docs state that the behavior is undefined
|
|
// in that case.
|
|
IOStatus s =
|
|
ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size);
|
|
|
|
if (s.ok()) {
|
|
next_write_offset_ = size;
|
|
}
|
|
return s;
|
|
}
|
|
|
|
inline IOStatus WinWritableImpl::CloseImpl() {
|
|
IOStatus s;
|
|
|
|
auto hFile = file_data_->GetFileHandle();
|
|
assert(INVALID_HANDLE_VALUE != hFile);
|
|
|
|
if (!::FlushFileBuffers(hFile)) {
|
|
auto lastError = GetLastError();
|
|
s = IOErrorFromWindowsError(
|
|
"FlushFileBuffers failed at Close() for: " + file_data_->GetName(),
|
|
lastError);
|
|
}
|
|
|
|
if (!file_data_->CloseFile() && s.ok()) {
|
|
auto lastError = GetLastError();
|
|
s = IOErrorFromWindowsError(
|
|
"CloseHandle failed for: " + file_data_->GetName(), lastError);
|
|
}
|
|
return s;
|
|
}
|
|
|
|
inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
IOStatus s;
|
|
if (!::FlushFileBuffers(file_data_->GetFileHandle())) {
|
|
auto lastError = GetLastError();
|
|
s = IOErrorFromWindowsError(
|
|
"FlushFileBuffers failed at Sync() for: " + file_data_->GetName(),
|
|
lastError);
|
|
}
|
|
return s;
|
|
}
|
|
|
|
inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
|
|
IOStatus status;
|
|
TEST_KILL_RANDOM("WinWritableFile::Allocate");
|
|
|
|
// Make sure that we reserve an aligned amount of space
|
|
// since the reservation block size is driven outside so we want
|
|
// to check if we are ok with reservation here
|
|
size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len),
|
|
static_cast<size_t>(alignment_));
|
|
// Nothing to do
|
|
if (spaceToReserve <= reservedsize_) {
|
|
return status;
|
|
}
|
|
|
|
IOSTATS_TIMER_GUARD(allocate_nanos);
|
|
status = PreallocateInternal(spaceToReserve);
|
|
if (status.ok()) {
|
|
reservedsize_ = spaceToReserve;
|
|
}
|
|
return status;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/// WinWritableFile
|
|
|
|
WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
|
|
size_t alignment, size_t /* capacity */,
|
|
const FileOptions& options)
|
|
: WinFileData(fname, hFile, options.use_direct_writes),
|
|
WinWritableImpl(this, alignment),
|
|
FSWritableFile(options) {
|
|
assert(!options.use_mmap_writes);
|
|
}
|
|
|
|
WinWritableFile::~WinWritableFile() {}
|
|
|
|
// Indicates if the class makes use of direct I/O
|
|
bool WinWritableFile::use_direct_io() const {
|
|
return WinFileData::use_direct_io();
|
|
}
|
|
|
|
size_t WinWritableFile::GetRequiredBufferAlignment() const {
|
|
return static_cast<size_t>(GetAlignment());
|
|
}
|
|
|
|
IOStatus WinWritableFile::Append(const Slice& data,
|
|
const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return AppendImpl(data);
|
|
}
|
|
|
|
IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
|
|
const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return PositionedAppendImpl(data, offset);
|
|
}
|
|
|
|
// Need to implement this so the file is truncated correctly
|
|
// when buffered and unbuffered mode
|
|
IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return TruncateImpl(size);
|
|
}
|
|
|
|
IOStatus WinWritableFile::Close(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return CloseImpl();
|
|
}
|
|
|
|
// write out the cached data to the OS cache
|
|
// This is now taken care of the WritableFileWriter
|
|
IOStatus WinWritableFile::Flush(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) {
|
|
return SyncImpl(options, dbg);
|
|
}
|
|
|
|
IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
|
|
return SyncImpl(options, dbg);
|
|
}
|
|
|
|
bool WinWritableFile::IsSyncThreadSafe() const { return true; }
|
|
|
|
uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return GetFileNextWriteOffset();
|
|
}
|
|
|
|
IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len,
|
|
const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return AllocateImpl(offset, len);
|
|
}
|
|
|
|
size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
|
|
return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////
|
|
/// WinRandomRWFile
|
|
|
|
WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
|
|
size_t alignment, const FileOptions& options)
|
|
: WinFileData(fname, hFile,
|
|
options.use_direct_reads && options.use_direct_writes),
|
|
WinRandomAccessImpl(this, alignment, options),
|
|
WinWritableImpl(this, alignment) {}
|
|
|
|
bool WinRandomRWFile::use_direct_io() const {
|
|
return WinFileData::use_direct_io();
|
|
}
|
|
|
|
size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
|
|
assert(WinRandomAccessImpl::GetAlignment() ==
|
|
WinWritableImpl::GetAlignment());
|
|
return static_cast<size_t>(WinRandomAccessImpl::GetAlignment());
|
|
}
|
|
|
|
IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data,
|
|
const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return PositionedAppendImpl(data, offset);
|
|
}
|
|
|
|
IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n,
|
|
const IOOptions& /*options*/, Slice* result,
|
|
char* scratch, IODebugContext* /*dbg*/) const {
|
|
return ReadImpl(offset, n, result, scratch);
|
|
}
|
|
|
|
IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) {
|
|
return SyncImpl(options, dbg);
|
|
}
|
|
|
|
IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return CloseImpl();
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
/// WinMemoryMappedBufer
|
|
WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
|
|
BOOL ret
|
|
#if defined(_MSC_VER)
|
|
= FALSE;
|
|
#else
|
|
__attribute__((__unused__));
|
|
#endif
|
|
if (base_ != nullptr) {
|
|
ret = ::UnmapViewOfFile(base_);
|
|
assert(ret);
|
|
base_ = nullptr;
|
|
}
|
|
if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
|
|
ret = ::CloseHandle(map_handle_);
|
|
assert(ret);
|
|
map_handle_ = NULL;
|
|
}
|
|
if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
|
|
ret = ::CloseHandle(file_handle_);
|
|
assert(ret);
|
|
file_handle_ = NULL;
|
|
}
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
/// WinDirectory
|
|
|
|
IOStatus WinDirectory::Fsync(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
return IOStatus::OK();
|
|
}
|
|
|
|
IOStatus WinDirectory::Close(const IOOptions& /*options*/,
|
|
IODebugContext* /*dbg*/) {
|
|
IOStatus s = IOStatus::OK();
|
|
BOOL ret __attribute__((__unused__));
|
|
if (handle_ != INVALID_HANDLE_VALUE) {
|
|
ret = ::CloseHandle(handle_);
|
|
if (!ret) {
|
|
auto lastError = GetLastError();
|
|
s = IOErrorFromWindowsError("Directory closes failed for : " + GetName(),
|
|
lastError);
|
|
}
|
|
handle_ = NULL;
|
|
}
|
|
return s;
|
|
}
|
|
|
|
size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
|
|
return GetUniqueIdFromFile(handle_, id, max_size);
|
|
}
|
|
//////////////////////////////////////////////////////////////////////////
|
|
/// WinFileLock
|
|
|
|
WinFileLock::~WinFileLock() {
|
|
BOOL ret __attribute__((__unused__));
|
|
ret = ::CloseHandle(hFile_);
|
|
assert(ret);
|
|
}
|
|
|
|
} // namespace port
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
#endif
|