rocksdb/port/win/io_win.cc

1104 lines
30 KiB
C++
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "port/win/io_win.h"
#include "monitoring/iostats_context_imp.h"
#include "util/aligned_buffer.h"
#include "util/coding.h"
#include "util/sync_point.h"
namespace rocksdb {
namespace port {
/*
* DirectIOHelper
*/
namespace {
const size_t kSectorSize = 512;
inline
bool IsPowerOfTwo(const size_t alignment) {
return ((alignment) & (alignment - 1)) == 0;
}
inline
bool IsSectorAligned(const size_t off) {
return (off & (kSectorSize - 1)) == 0;
}
inline
bool IsAligned(size_t alignment, const void* ptr) {
return ((uintptr_t(ptr)) & (alignment - 1)) == 0;
}
}
std::string GetWindowsErrSz(DWORD err) {
LPSTR lpMsgBuf;
FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err,
0, // Default language
reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
std::string Err = lpMsgBuf;
LocalFree(lpMsgBuf);
return Err;
}
// We preserve the original name of this interface to denote the original idea
// behind it.
// All reads happen by a specified offset and pwrite interface does not change
// the position of the file pointer. Judging from the man page and errno it does
// execute
// lseek atomically to return the position of the file back where it was.
// WriteFile() does not
// have this capability. Therefore, for both pread and pwrite the pointer is
// advanced to the next position
// which is fine for writes because they are (should be) sequential.
// Because all the reads/writes happen by the specified offset, the caller in
// theory should not
// rely on the current file offset.
Status pwrite(const WinFileData* file_data, const Slice& data,
uint64_t offset, size_t& bytes_written) {
Status s;
bytes_written = 0;
size_t num_bytes = data.size();
if (num_bytes > std::numeric_limits<DWORD>::max()) {
// May happen in 64-bit builds where size_t is 64-bits but
// long is still 32-bit, but that's the API here at the moment
return Status::InvalidArgument("num_bytes is too large for a single write: " +
file_data->GetName());
}
OVERLAPPED overlapped = { 0 };
ULARGE_INTEGER offsetUnion;
offsetUnion.QuadPart = offset;
overlapped.Offset = offsetUnion.LowPart;
overlapped.OffsetHigh = offsetUnion.HighPart;
DWORD bytesWritten = 0;
if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast<DWORD>(num_bytes),
&bytesWritten, &overlapped)) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
lastError);
} else {
bytes_written = bytesWritten;
}
return s;
}
// See comments for pwrite above
Status pread(const WinFileData* file_data, char* src, size_t num_bytes,
uint64_t offset, size_t& bytes_read) {
Status s;
bytes_read = 0;
if (num_bytes > std::numeric_limits<DWORD>::max()) {
return Status::InvalidArgument("num_bytes is too large for a single read: " +
file_data->GetName());
}
OVERLAPPED overlapped = { 0 };
ULARGE_INTEGER offsetUnion;
offsetUnion.QuadPart = offset;
overlapped.Offset = offsetUnion.LowPart;
overlapped.OffsetHigh = offsetUnion.HighPart;
DWORD bytesRead = 0;
if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast<DWORD>(num_bytes),
&bytesRead, &overlapped)) {
auto lastError = GetLastError();
// EOF is OK with zero bytes read
if (lastError != ERROR_HANDLE_EOF) {
s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
lastError);
}
} else {
bytes_read = bytesRead;
}
return s;
}
// SetFileInformationByHandle() is capable of fast pre-allocates.
// However, this does not change the file end position unless the file is
// truncated and the pre-allocated space is not considered filled with zeros.
Status fallocate(const std::string& filename, HANDLE hFile,
uint64_t to_size) {
Status status;
FILE_ALLOCATION_INFO alloc_info;
alloc_info.AllocationSize.QuadPart = to_size;
if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
sizeof(FILE_ALLOCATION_INFO))) {
auto lastError = GetLastError();
status = IOErrorFromWindowsError(
"Failed to pre-allocate space: " + filename, lastError);
}
return status;
}
Status ftruncate(const std::string& filename, HANDLE hFile,
uint64_t toSize) {
Status status;
FILE_END_OF_FILE_INFO end_of_file;
end_of_file.EndOfFile.QuadPart = toSize;
if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
sizeof(FILE_END_OF_FILE_INFO))) {
auto lastError = GetLastError();
status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
lastError);
}
return status;
}
size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) {
if (max_size < kMaxVarint64Length * 3) {
return 0;
}
#if (_WIN32_WINNT == _WIN32_WINNT_VISTA)
// MINGGW as defined by CMake file.
// yuslepukhin: I hate the guts of the above macros.
// This impl does not guarantee uniqueness everywhere
// is reasonably good
BY_HANDLE_FILE_INFORMATION FileInfo;
BOOL result = GetFileInformationByHandle(hFile, &FileInfo);
TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
if (!result) {
return 0;
}
char* rid = id;
rid = EncodeVarint64(rid, uint64_t(FileInfo.dwVolumeSerialNumber));
rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexHigh));
rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexLow));
assert(rid >= id);
return static_cast<size_t>(rid - id);
#else
FILE_ID_INFO FileInfo;
BOOL result = GetFileInformationByHandleEx(hFile, FileIdInfo, &FileInfo,
sizeof(FileInfo));
TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
if (!result) {
return 0;
}
static_assert(sizeof(uint64_t) == sizeof(FileInfo.VolumeSerialNumber),
"Wrong sizeof expectations");
// FileId.Identifier is an array of 16 BYTEs, we encode them as two uint64_t
static_assert(sizeof(uint64_t) * 2 == sizeof(FileInfo.FileId.Identifier),
"Wrong sizeof expectations");
char* rid = id;
rid = EncodeVarint64(rid, uint64_t(FileInfo.VolumeSerialNumber));
uint64_t* file_id = reinterpret_cast<uint64_t*>(&FileInfo.FileId.Identifier[0]);
rid = EncodeVarint64(rid, *file_id);
++file_id;
rid = EncodeVarint64(rid, *file_id);
assert(rid >= id);
return static_cast<size_t>(rid - id);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// WinMmapReadableFile
WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
HANDLE hFile, HANDLE hMap,
const void* mapped_region,
size_t length)
: WinFileData(fileName, hFile, false /* use_direct_io */),
hMap_(hMap),
mapped_region_(mapped_region),
length_(length) {}
WinMmapReadableFile::~WinMmapReadableFile() {
BOOL ret __attribute__((__unused__));
ret = ::UnmapViewOfFile(mapped_region_);
assert(ret);
ret = ::CloseHandle(hMap_);
assert(ret);
}
Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
Status s;
if (offset > length_) {
*result = Slice();
return IOError(filename_, EINVAL);
} else if (offset + n > length_) {
n = length_ - offset;
}
*result =
Slice(reinterpret_cast<const char*>(mapped_region_)+offset, n);
return s;
}
Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
return Status::OK();
}
size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
return GetUniqueIdFromFile(hFile_, id, max_size);
}
///////////////////////////////////////////////////////////////////////////////
/// WinMmapFile
// Can only truncate or reserve to a sector size aligned if
// used on files that are opened with Unbuffered I/O
Status WinMmapFile::TruncateFile(uint64_t toSize) {
return ftruncate(filename_, hFile_, toSize);
}
Status WinMmapFile::UnmapCurrentRegion() {
Status status;
if (mapped_begin_ != nullptr) {
if (!::UnmapViewOfFile(mapped_begin_)) {
status = IOErrorFromWindowsError(
"Failed to unmap file view: " + filename_, GetLastError());
}
// Move on to the next portion of the file
file_offset_ += view_size_;
// UnmapView automatically sends data to disk but not the metadata
// which is good and provides some equivalent of fdatasync() on Linux
// therefore, we donot need separate flag for metadata
mapped_begin_ = nullptr;
mapped_end_ = nullptr;
dst_ = nullptr;
last_sync_ = nullptr;
pending_sync_ = false;
}
return status;
}
Status WinMmapFile::MapNewRegion() {
Status status;
assert(mapped_begin_ == nullptr);
size_t minDiskSize = file_offset_ + view_size_;
if (minDiskSize > reserved_size_) {
status = Allocate(file_offset_, view_size_);
if (!status.ok()) {
return status;
}
}
// Need to remap
if (hMap_ == NULL || reserved_size_ > mapping_size_) {
if (hMap_ != NULL) {
// Unmap the previous one
BOOL ret __attribute__((__unused__));
ret = ::CloseHandle(hMap_);
assert(ret);
hMap_ = NULL;
}
ULARGE_INTEGER mappingSize;
mappingSize.QuadPart = reserved_size_;
hMap_ = CreateFileMappingA(
hFile_,
NULL, // Security attributes
PAGE_READWRITE, // There is not a write only mode for mapping
mappingSize.HighPart, // Enable mapping the whole file but the actual
// amount mapped is determined by MapViewOfFile
mappingSize.LowPart,
NULL); // Mapping name
if (NULL == hMap_) {
return IOErrorFromWindowsError(
"WindowsMmapFile failed to create file mapping for: " + filename_,
GetLastError());
}
mapping_size_ = reserved_size_;
}
ULARGE_INTEGER offset;
offset.QuadPart = file_offset_;
// View must begin at the granularity aligned offset
mapped_begin_ = reinterpret_cast<char*>(
MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
view_size_, NULL));
if (!mapped_begin_) {
status = IOErrorFromWindowsError(
"WindowsMmapFile failed to map file view: " + filename_,
GetLastError());
} else {
mapped_end_ = mapped_begin_ + view_size_;
dst_ = mapped_begin_;
last_sync_ = mapped_begin_;
pending_sync_ = false;
}
return status;
}
Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
return fallocate(filename_, hFile_, spaceToReserve);
}
WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
size_t allocation_granularity, const EnvOptions& options)
: WinFileData(fname, hFile, false),
hMap_(NULL),
page_size_(page_size),
allocation_granularity_(allocation_granularity),
reserved_size_(0),
mapping_size_(0),
view_size_(0),
mapped_begin_(nullptr),
mapped_end_(nullptr),
dst_(nullptr),
last_sync_(nullptr),
file_offset_(0),
pending_sync_(false) {
// Allocation granularity must be obtained from GetSystemInfo() and must be
// a power of two.
assert(allocation_granularity > 0);
assert((allocation_granularity & (allocation_granularity - 1)) == 0);
assert(page_size > 0);
assert((page_size & (page_size - 1)) == 0);
// Only for memory mapped writes
assert(options.use_mmap_writes);
// View size must be both the multiple of allocation_granularity AND the
// page size and the granularity is usually a multiple of a page size.
const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
view_size_ = Roundup(viewSize, allocation_granularity_);
}
WinMmapFile::~WinMmapFile() {
if (hFile_) {
this->Close();
}
}
Status WinMmapFile::Append(const Slice& data) {
const char* src = data.data();
size_t left = data.size();
while (left > 0) {
assert(mapped_begin_ <= dst_);
size_t avail = mapped_end_ - dst_;
if (avail == 0) {
Status s = UnmapCurrentRegion();
if (s.ok()) {
s = MapNewRegion();
}
if (!s.ok()) {
return s;
}
} else {
size_t n = std::min(left, avail);
memcpy(dst_, src, n);
dst_ += n;
src += n;
left -= n;
pending_sync_ = true;
}
}
// Now make sure that the last partial page is padded with zeros if needed
size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
if (bytesToPad > 0) {
memset(dst_, 0, bytesToPad);
}
return Status::OK();
}
// Means Close() will properly take care of truncate
// and it does not need any additional information
Status WinMmapFile::Truncate(uint64_t size) {
return Status::OK();
}
Status WinMmapFile::Close() {
Status s;
assert(NULL != hFile_);
// We truncate to the precise size so no
// uninitialized data at the end. SetEndOfFile
// which we use does not write zeros and it is good.
uint64_t targetSize = GetFileSize();
if (mapped_begin_ != nullptr) {
// Sync before unmapping to make sure everything
// is on disk and there is not a lazy writing
// so we are deterministic with the tests
Sync();
s = UnmapCurrentRegion();
}
if (NULL != hMap_) {
BOOL ret = ::CloseHandle(hMap_);
if (!ret && s.ok()) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError(
"Failed to Close mapping for file: " + filename_, lastError);
}
hMap_ = NULL;
}
if (hFile_ != NULL) {
TruncateFile(targetSize);
BOOL ret = ::CloseHandle(hFile_);
hFile_ = NULL;
if (!ret && s.ok()) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError(
"Failed to close file map handle: " + filename_, lastError);
}
}
return s;
}
Status WinMmapFile::Flush() { return Status::OK(); }
// Flush only data
Status WinMmapFile::Sync() {
Status s;
// Some writes occurred since last sync
if (dst_ > last_sync_) {
assert(mapped_begin_);
assert(dst_);
assert(dst_ > mapped_begin_);
assert(dst_ < mapped_end_);
size_t page_begin =
TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
size_t page_end =
TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
// Flush only the amount of that is a multiple of pages
if (!::FlushViewOfFile(mapped_begin_ + page_begin,
(page_end - page_begin) + page_size_)) {
s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
GetLastError());
} else {
last_sync_ = dst_;
}
}
return s;
}
/**
* Flush data as well as metadata to stable storage.
*/
Status WinMmapFile::Fsync() {
Status s = Sync();
// Flush metadata
if (s.ok() && pending_sync_) {
if (!::FlushFileBuffers(hFile_)) {
s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
GetLastError());
}
pending_sync_ = false;
}
return s;
}
/**
* Get the size of valid data in the file. This will not match the
* size that is returned from the filesystem because we use mmap
* to extend file by map_size every time.
*/
uint64_t WinMmapFile::GetFileSize() {
size_t used = dst_ - mapped_begin_;
return file_offset_ + used;
}
Status WinMmapFile::InvalidateCache(size_t offset, size_t length) {
return Status::OK();
}
Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) {
Status status;
TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds);
// Make sure that we reserve an aligned amount of space
// since the reservation block size is driven outside so we want
// to check if we are ok with reservation here
size_t spaceToReserve = Roundup(offset + len, view_size_);
// Nothing to do
if (spaceToReserve <= reserved_size_) {
return status;
}
IOSTATS_TIMER_GUARD(allocate_nanos);
status = PreallocateInternal(spaceToReserve);
if (status.ok()) {
reserved_size_ = spaceToReserve;
}
return status;
}
size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
return GetUniqueIdFromFile(hFile_, id, max_size);
}
//////////////////////////////////////////////////////////////////////////////////
// WinSequentialFile
WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
const EnvOptions& options)
: WinFileData(fname, f, options.use_direct_reads) {}
WinSequentialFile::~WinSequentialFile() {
assert(hFile_ != INVALID_HANDLE_VALUE);
}
Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) {
Status s;
size_t r = 0;
assert(result != nullptr);
if (WinFileData::use_direct_io()) {
return Status::NotSupported("Read() does not support direct_io");
}
// Windows ReadFile API accepts a DWORD.
// While it is possible to read in a loop if n is too big
// it is an unlikely case.
if (n > std::numeric_limits<DWORD>::max()) {
return Status::InvalidArgument("n is too big for a single ReadFile: "
+ filename_);
}
DWORD bytesToRead = static_cast<DWORD>(n); //cast is safe due to the check above
DWORD bytesRead = 0;
BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
if (ret != FALSE) {
r = bytesRead;
} else {
auto lastError = GetLastError();
if (lastError != ERROR_HANDLE_EOF) {
s = IOErrorFromWindowsError("ReadFile failed: " + filename_,
lastError);
}
}
*result = Slice(scratch, r);
return s;
}
Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
uint64_t offset, size_t& bytes_read) const {
return pread(this, src, numBytes, offset, bytes_read);
}
Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result,
char* scratch) {
Status s;
if (!WinFileData::use_direct_io()) {
return Status::NotSupported("This function is only used for direct_io");
}
if (!IsSectorAligned(offset) ||
!IsSectorAligned(n)) {
return Status::InvalidArgument(
"WinSequentialFile::PositionedRead: offset is not properly aligned");
}
size_t bytes_read = 0; // out param
s = PositionedReadInternal(scratch, n, offset, bytes_read);
*result = Slice(scratch, bytes_read);
return s;
}
Status WinSequentialFile::Skip(uint64_t n) {
// Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit
// integer. As such it is a highly unlikley case to have n so large.
if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" +
filename_);
}
LARGE_INTEGER li;
li.QuadPart = static_cast<LONGLONG>(n); //cast is safe due to the check above
BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
if (ret == FALSE) {
auto lastError = GetLastError();
return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
lastError);
}
return Status::OK();
}
Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
return Status::OK();
}
//////////////////////////////////////////////////////////////////////////////////////////////////
/// WinRandomAccessBase
inline
Status WinRandomAccessImpl::PositionedReadInternal(char* src,
size_t numBytes,
uint64_t offset,
size_t& bytes_read) const {
return pread(file_base_, src, numBytes, offset, bytes_read);
}
inline
WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
size_t alignment,
const EnvOptions& options) :
file_base_(file_base),
alignment_(alignment) {
assert(!options.use_mmap_reads);
}
inline
Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
Status s;
// Check buffer alignment
if (file_base_->use_direct_io()) {
if (!IsSectorAligned(offset) ||
!IsAligned(alignment_, scratch)) {
return Status::InvalidArgument(
"WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned");
}
}
if (n == 0) {
*result = Slice(scratch, 0);
return s;
}
size_t bytes_read = 0;
s = PositionedReadInternal(scratch, n, offset, bytes_read);
*result = Slice(scratch, bytes_read);
return s;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// WinRandomAccessFile
WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
size_t alignment,
const EnvOptions& options)
: WinFileData(fname, hFile, options.use_direct_reads),
WinRandomAccessImpl(this, alignment, options) {}
WinRandomAccessFile::~WinRandomAccessFile() {
}
Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
return ReadImpl(offset, n, result, scratch);
}
Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
return Status::OK();
}
size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
}
size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
return GetAlignment();
}
/////////////////////////////////////////////////////////////////////////////
// WinWritableImpl
//
inline
Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve);
}
inline
WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment)
: file_data_(file_data),
alignment_(alignment),
next_write_offset_(0),
reservedsize_(0) {
// Query current position in case ReopenWritableFile is called
// This position is only important for buffered writes
// for unbuffered writes we explicitely specify the position.
LARGE_INTEGER zero_move;
zero_move.QuadPart = 0; // Do not move
LARGE_INTEGER pos;
pos.QuadPart = 0;
BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
FILE_CURRENT);
// Querying no supped to fail
if (ret != 0) {
next_write_offset_ = pos.QuadPart;
} else {
assert(false);
}
}
inline
Status WinWritableImpl::AppendImpl(const Slice& data) {
Status s;
if (data.size() > std::numeric_limits<DWORD>::max()) {
return Status::InvalidArgument("data is too long for a single write" +
file_data_->GetName());
}
size_t bytes_written = 0; // out param
if (file_data_->use_direct_io()) {
// With no offset specified we are appending
// to the end of the file
assert(IsSectorAligned(next_write_offset_));
if (!IsSectorAligned(data.size()) ||
!IsAligned(GetAlignement(), data.data())) {
s = Status::InvalidArgument(
"WriteData must be page aligned, size must be sector aligned");
} else {
s = pwrite(file_data_, data, next_write_offset_, bytes_written);
}
} else {
DWORD bytesWritten = 0;
if (!WriteFile(file_data_->GetFileHandle(), data.data(),
static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError(
"Failed to WriteFile: " + file_data_->GetName(),
lastError);
} else {
bytes_written = bytesWritten;
}
}
if(s.ok()) {
if (bytes_written == data.size()) {
// This matters for direct_io cases where
// we rely on the fact that next_write_offset_
// is sector aligned
next_write_offset_ += bytes_written;
} else {
s = Status::IOError("Failed to write all bytes: " +
file_data_->GetName());
}
}
return s;
}
inline
Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) {
if(file_data_->use_direct_io()) {
if (!IsSectorAligned(offset) ||
!IsSectorAligned(data.size()) ||
!IsAligned(GetAlignement(), data.data())) {
return Status::InvalidArgument(
"Data and offset must be page aligned, size must be sector aligned");
}
}
size_t bytes_written = 0;
Status s = pwrite(file_data_, data, offset, bytes_written);
if(s.ok()) {
if (bytes_written == data.size()) {
// For sequential write this would be simple
// size extension by data.size()
uint64_t write_end = offset + bytes_written;
if (write_end >= next_write_offset_) {
next_write_offset_ = write_end;
}
} else {
s = Status::IOError("Failed to write all of the requested data: " +
file_data_->GetName());
}
}
return s;
}
inline
Status WinWritableImpl::TruncateImpl(uint64_t size) {
// It is tempting to check for the size for sector alignment
// but truncation may come at the end and there is not a requirement
// for this to be sector aligned so long as we do not attempt to write
// after that. The interface docs state that the behavior is undefined
// in that case.
Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(),
size);
if (s.ok()) {
next_write_offset_ = size;
}
return s;
}
inline
Status WinWritableImpl::CloseImpl() {
Status s;
auto hFile = file_data_->GetFileHandle();
assert(INVALID_HANDLE_VALUE != hFile);
if (!::FlushFileBuffers(hFile)) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " +
file_data_->GetName(),
lastError);
}
if(!file_data_->CloseFile() && s.ok()) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(),
lastError);
}
return s;
}
inline
Status WinWritableImpl::SyncImpl() {
Status s;
if (!::FlushFileBuffers (file_data_->GetFileHandle())) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError(
"FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError);
}
return s;
}
inline
Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
Status status;
TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds);
// Make sure that we reserve an aligned amount of space
// since the reservation block size is driven outside so we want
// to check if we are ok with reservation here
size_t spaceToReserve = Roundup(offset + len, alignment_);
// Nothing to do
if (spaceToReserve <= reservedsize_) {
return status;
}
IOSTATS_TIMER_GUARD(allocate_nanos);
status = PreallocateInternal(spaceToReserve);
if (status.ok()) {
reservedsize_ = spaceToReserve;
}
return status;
}
////////////////////////////////////////////////////////////////////////////////
/// WinWritableFile
WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
size_t alignment, size_t /* capacity */,
const EnvOptions& options)
: WinFileData(fname, hFile, options.use_direct_writes),
WinWritableImpl(this, alignment) {
assert(!options.use_mmap_writes);
}
WinWritableFile::~WinWritableFile() {
}
// Indicates if the class makes use of direct I/O
bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); }
size_t WinWritableFile::GetRequiredBufferAlignment() const {
return GetAlignement();
}
Status WinWritableFile::Append(const Slice& data) {
return AppendImpl(data);
}
Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
return PositionedAppendImpl(data, offset);
}
// Need to implement this so the file is truncated correctly
// when buffered and unbuffered mode
Status WinWritableFile::Truncate(uint64_t size) {
return TruncateImpl(size);
}
Status WinWritableFile::Close() {
return CloseImpl();
}
// write out the cached data to the OS cache
// This is now taken care of the WritableFileWriter
Status WinWritableFile::Flush() {
return Status::OK();
}
Status WinWritableFile::Sync() {
return SyncImpl();
}
Status WinWritableFile::Fsync() { return SyncImpl(); }
Disable onboard cache for compaction output Summary: FILE_FLAG_WRITE_THROUGH is for disabling device on-board cache in windows API, which should be disabled if user doesn't need system cache. There was a perf issue related with this, we found during memtable flush, the high percentile latency jumps significantly. During profiling, we found those high latency (P99.9) read requests got queue-jumped by write requests from memtable flush and takes 80ms or even more time to wait, even when SSD overall IO throughput is relatively low. After enabling FILE_FLAG_WRITE_THROUGH, we rerun the test found high percentile latency drops a lot without observable impact on writes. Scenario 1: 40MB/s + 40MB/s R/W compaction throughput  Original | FILE_FLAG_WRITE_THROUGH | Percentage reduction --------------------------------------------------------------- P99.9 | 56.897 ms | 35.593 ms | -37.4% P99 | 3.905 ms | 3.896 ms | -2.8% Scenario 2: 14MB/s + 14MB/s R/W compaction throughput, cohosted with 100+ other rocksdb instances have manually triggered memtable flush operations (memtable is tiny), creating a lot of randomized the small file writes operations during test. Original | FILE_FLAG_WRITE_THROUGH | Percentage reduction --------------------------------------------------------------- P99.9 | 86.227 ms | 50.436 ms | -41.5% P99 | 8.415 ms | 3.356 ms | -60.1% Closes https://github.com/facebook/rocksdb/pull/3225 Differential Revision: D6624174 Pulled By: miasantreble fbshipit-source-id: 321b86aee9d74470840c70e5d0d4fa9880660a91
2017-12-22 02:37:27 +00:00
bool WinWritableFile::IsSyncThreadSafe() const { return true; }
uint64_t WinWritableFile::GetFileSize() {
return GetFileNextWriteOffset();
}
Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) {
return AllocateImpl(offset, len);
}
size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
}
/////////////////////////////////////////////////////////////////////////
/// WinRandomRWFile
WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
size_t alignment, const EnvOptions& options)
: WinFileData(fname, hFile,
options.use_direct_reads && options.use_direct_writes),
WinRandomAccessImpl(this, alignment, options),
WinWritableImpl(this, alignment) {}
bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); }
size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
return GetAlignement();
}
Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) {
return PositionedAppendImpl(data, offset);
}
Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
return ReadImpl(offset, n, result, scratch);
}
Status WinRandomRWFile::Flush() {
return Status::OK();
}
Status WinRandomRWFile::Sync() {
return SyncImpl();
}
Status WinRandomRWFile::Close() {
return CloseImpl();
}
//////////////////////////////////////////////////////////////////////////
/// WinMemoryMappedBufer
WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
BOOL ret = FALSE;
if (base_ != nullptr) {
ret = ::UnmapViewOfFile(base_);
assert(ret);
base_ = nullptr;
}
if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
ret = ::CloseHandle(map_handle_);
assert(ret);
map_handle_ = NULL;
}
if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
ret = ::CloseHandle(file_handle_);
assert(ret);
file_handle_ = NULL;
}
}
//////////////////////////////////////////////////////////////////////////
/// WinDirectory
Status WinDirectory::Fsync() { return Status::OK(); }
size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
return GetUniqueIdFromFile(handle_, id, max_size);
}
//////////////////////////////////////////////////////////////////////////
/// WinFileLock
WinFileLock::~WinFileLock() {
BOOL ret __attribute__((__unused__));
ret = ::CloseHandle(hFile_);
assert(ret);
}
}
}