rocksdb/port/win/env_win.cc
Dmitri Smirnov d8586ab22b All of these are in the new code added past 3.10
1) Crash in env_win.cc that prevented db_test run to completion and some new tests
     2) Fix new corruption tests in DBTest by allowing a shared trunction of files. Note that this is generally needed ONLY for tests.
     3) Close database so WAL is closed prior to inducing corruption similar to what we did within Corruption tests.
2015-07-08 16:56:45 -07:00

2604 lines
73 KiB
C++

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include <deque>
#include <thread>
#include <ctime>
#include <errno.h>
#include <process.h>
#include <io.h>
#include <direct.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "port/port.h"
#include "port/dirent.h"
#include "port/win/win_logger.h"
#include "util/random.h"
#include "util/iostats_context_imp.h"
#include "util/rate_limiter.h"
#include "util/thread_status_updater.h"
#include "util/thread_status_util.h"
#include <Windows.h>
#include <Rpc.h> // For UUID generation
// This is only set from db_stress.cc and for testing only.
// If non-zero, kill at various points in source code with probability 1/this
int rocksdb_kill_odds = 0;
namespace rocksdb
{
std::string GetWindowsErrSz(DWORD err) {
LPSTR lpMsgBuf;
FormatMessageA(
FORMAT_MESSAGE_ALLOCATE_BUFFER |
FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS,
NULL,
err,
0, // Default language
reinterpret_cast<LPSTR>(&lpMsgBuf),
0,
NULL
);
std::string Err = lpMsgBuf;
LocalFree(lpMsgBuf);
return Err;
}
namespace
{
const size_t c_OneMB = (1 << 20);
ThreadStatusUpdater* CreateThreadStatusUpdater() {
return new ThreadStatusUpdater();
}
// A wrapper for fadvise, if the platform doesn't support fadvise,
// it will simply return Status::NotSupport.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
return 0; // simply do nothing.
}
inline
Status IOErrorFromWindowsError(const std::string& context, DWORD err) {
return Status::IOError(context, GetWindowsErrSz(err));
}
inline
Status IOErrorFromLastWindowsError(const std::string& context) {
return IOErrorFromWindowsError(context, GetLastError());
}
inline
Status IOError(const std::string& context, int err_number) {
return Status::IOError(context, strerror(err_number));
}
// TODO(sdong): temp logging. Need to help debugging. Remove it when
// the feature is proved to be stable.
inline void PrintThreadInfo(size_t thread_id, size_t terminatingId) {
fprintf(stdout, "Bg thread %Iu terminates %Iu\n", thread_id, terminatingId);
}
// returns the ID of the current process
inline
int current_process_id() {
return _getpid();
}
#ifdef NDEBUG
// empty in release build
#define TEST_KILL_RANDOM(rocksdb_kill_odds)
#else
// Kill the process with probablity 1/odds for testing.
void TestKillRandom(int odds, const std::string& srcfile, int srcline) {
time_t curtime = time(nullptr);
Random r((uint32_t)curtime);
assert(odds > 0);
bool crash = r.OneIn(odds);
if (crash)
{
fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
fflush(stdout);
std::string* p_str = nullptr;
p_str->c_str();
}
}
// To avoid crashing always at some frequently executed codepaths (during
// kill random test), use this factor to reduce odds
#define REDUCE_ODDS 2
#define REDUCE_ODDS2 4
#define TEST_KILL_RANDOM(rocksdb_kill_odds) { \
if (rocksdb_kill_odds > 0) { \
TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__); \
} \
}
#endif
// RAII helpers for HANDLEs
const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); };
typedef std::unique_ptr<void, decltype(CloseHandleFunc)> UniqueCloseHandlePtr;
// We preserve the original name of this interface to denote the original idea behind it.
// All reads happen by a specified offset and pwrite interface does not change
// the position of the file pointer. Judging from the man page and errno it does execute
// lseek atomically to return the position of the file back where it was. WriteFile() does not
// have this capability. Therefore, for both pread and pwrite the pointer is advanced to the next position
// which is fine for writes because they are (should be) sequential.
// Because all the reads/writes happen by the specified offset, the caller in theory should not
// rely on the current file offset.
SSIZE_T pwrite(HANDLE hFile, const char * src, size_t numBytes, uint64_t offset) {
OVERLAPPED overlapped = { 0 };
ULARGE_INTEGER offsetUnion;
offsetUnion.QuadPart = offset;
overlapped.Offset = offsetUnion.LowPart;
overlapped.OffsetHigh = offsetUnion.HighPart;
SSIZE_T result = 0;
unsigned long bytesWritten = 0;
if (FALSE == WriteFile(hFile, src, numBytes, &bytesWritten, &overlapped)) {
result = -1;
}
else {
result = bytesWritten;
}
return result;
}
// See comments for pwrite above
SSIZE_T pread(HANDLE hFile, char * src, size_t numBytes, uint64_t offset) {
OVERLAPPED overlapped = { 0 };
ULARGE_INTEGER offsetUnion;
offsetUnion.QuadPart = offset;
overlapped.Offset = offsetUnion.LowPart;
overlapped.OffsetHigh = offsetUnion.HighPart;
SSIZE_T result = 0;
unsigned long bytesRead = 0;
if (FALSE == ReadFile(hFile, src, numBytes, &bytesRead, &overlapped)) {
return -1;
}
else {
result = bytesRead;
}
return result;
}
// Note the below two do not set errno because they are used only here in this file
// on a Windows handle and, therefore, not necessary. Translating GetLastError() to errno
// is a sad business
inline
int fsync(HANDLE hFile) {
if (!FlushFileBuffers(hFile)) {
return -1;
}
return 0;
}
inline
size_t TruncateToPageBoundary(size_t page_size, size_t s) {
s -= (s & (page_size - 1));
assert((s % page_size) == 0);
return s;
}
// Roundup x to a multiple of y
inline
size_t Roundup(size_t x, size_t y) {
return ((x + y - 1) / y) * y;
}
// SetFileInformationByHandle() is capable of fast pre-allocates.
// However, this does not change the file end position unless the file is
// truncated and the pre-allocated space is not considered filled with zeros.
inline
Status fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size) {
Status status;
FILE_ALLOCATION_INFO alloc_info;
alloc_info.AllocationSize.QuadPart = to_size;
if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, sizeof(FILE_ALLOCATION_INFO))) {
auto lastError = GetLastError();
status = IOErrorFromWindowsError("Failed to pre-allocate space: " + filename, lastError);
}
return status;
}
inline
Status ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) {
Status status;
FILE_END_OF_FILE_INFO end_of_file;
end_of_file.EndOfFile.QuadPart = toSize;
if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, sizeof(FILE_END_OF_FILE_INFO))) {
auto lastError = GetLastError();
status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, lastError);
}
return status;
}
class WinRandomRWFile : public RandomRWFile {
const std::string filename_;
HANDLE hFile_;
bool pending_fsync_;
public:
WinRandomRWFile(const std::string& fname, HANDLE hFile, const EnvOptions& options)
: filename_(fname),
hFile_(hFile),
pending_fsync_(false) {
assert(!options.use_mmap_writes && !options.use_mmap_reads);
}
~WinRandomRWFile() {
if (hFile_ != INVALID_HANDLE_VALUE && hFile_ != NULL) {
::CloseHandle(hFile_);
}
}
virtual Status Write(uint64_t offset, const Slice& data) override {
const char* src = data.data();
size_t left = data.size();
pending_fsync_ = true;
SSIZE_T done = 0;
{
IOSTATS_TIMER_GUARD(write_nanos);
done = pwrite(hFile_, src, left, offset);
}
if (done < 0) {
return IOErrorFromWindowsError("pwrite failed to: " + filename_, GetLastError());
}
IOSTATS_ADD(bytes_written, done);
left -= done;
src += done;
offset += done;
return Status::OK();
}
virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override {
Status s;
SSIZE_T r = -1;
char* ptr = scratch;
size_t left = n;
while (left > 0) {
{
IOSTATS_TIMER_GUARD(read_nanos);
r = pread(hFile_, ptr, n, offset);
}
if (r <= 0) {
break;
}
ptr += r;
offset += r;
left -= r;
}
IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left);
*result = Slice(scratch, (r < 0) ? 0 : r);
if (r < 0) {
s = IOErrorFromWindowsError("pread failed from: " + filename_, GetLastError());
}
return s;
}
virtual Status Close() override {
Status s = Status::OK();
if (hFile_ != INVALID_HANDLE_VALUE && ::CloseHandle(hFile_) == FALSE) {
s = IOErrorFromWindowsError("Failed to close file: " + filename_, GetLastError());
}
hFile_ = INVALID_HANDLE_VALUE;
return s;
}
virtual Status Sync() override {
if (pending_fsync_ && fsync(hFile_) < 0) {
return IOErrorFromWindowsError("Failed to Sync() buffers for: " + filename_, GetLastError());
}
pending_fsync_ = false;
return Status::OK();
}
virtual Status Fsync() override {
if (pending_fsync_ && fsync(hFile_) < 0) {
return IOErrorFromWindowsError("Failed to Fsync() for: " + filename_, GetLastError());
}
pending_fsync_ = false;
return Status::OK();
}
virtual Status Allocate(off_t offset, off_t len) override {
IOSTATS_TIMER_GUARD(allocate_nanos);
return fallocate(filename_, hFile_, len);
}
};
// mmap() based random-access
class WinMmapReadableFile : public RandomAccessFile {
const std::string fileName_;
HANDLE hFile_;
HANDLE hMap_;
const void* mapped_region_;
const size_t length_;
public:
// mapped_region_[0,length-1] contains the mmapped contents of the file.
WinMmapReadableFile(const std::string &fileName, HANDLE hFile, HANDLE hMap, const void* mapped_region, size_t length)
: fileName_(fileName), hFile_(hFile), hMap_(hMap), mapped_region_(mapped_region), length_(length) {
}
~WinMmapReadableFile() {
BOOL ret = ::UnmapViewOfFile(mapped_region_);
assert(ret);
ret = ::CloseHandle(hMap_);
assert(ret);
ret = ::CloseHandle(hFile_);
assert(ret);
}
virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override {
Status s;
if (offset + n > length_) {
*result = Slice();
s = IOError(fileName_, EINVAL);
}
else {
*result = Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
}
return s;
}
virtual Status InvalidateCache(size_t offset, size_t length) override {
return Status::OK();
}
};
// We preallocate up to an extra megabyte and use memcpy to append new
// data to the file. This is safe since we either properly close the
// file before reading from it, or for log files, the reading code
// knows enough to skip zero suffixes.
class WinMmapFile : public WritableFile {
private:
const std::string filename_;
HANDLE hFile_;
HANDLE hMap_;
const size_t page_size_; // We flush the mapping view in page_size increments. We may decide if this is a memory page size or SSD page size
const size_t allocation_granularity_; // View must start at such a granularity
size_t mapping_size_; // We want file mapping to be of a specific size because then the file is expandable
size_t view_size_; // How much memory to map into a view at a time
char* mapped_begin_; // Must begin at the file offset that is aligned with allocation_granularity_
char* mapped_end_;
char* dst_; // Where to write next (in range [mapped_begin_,mapped_end_])
char* last_sync_; // Where have we synced up to
uint64_t file_offset_; // Offset of mapped_begin_ in file
// Do we have unsynced writes?
bool pending_sync_;
// Can only truncate or reserve to a sector size aligned if
// used on files that are opened with Unbuffered I/O
Status TruncateFile(uint64_t toSize) {
return ftruncate(filename_, hFile_, toSize);
}
// Can only truncate or reserve to a sector size aligned if
// used on files that are opened with Unbuffered I/O
// Normally it does not present a problem since in memory mapped files
// we do not disable buffering
Status ReserveFileSpace(uint64_t toSize) {
IOSTATS_TIMER_GUARD(allocate_nanos);
return fallocate(filename_, hFile_, toSize);
}
Status UnmapCurrentRegion() {
Status status;
if (mapped_begin_ != nullptr) {
if (!::UnmapViewOfFile(mapped_begin_)) {
status = IOErrorFromWindowsError("Failed to unmap file view: " + filename_, GetLastError());
}
// UnmapView automatically sends data to disk but not the metadata
// which is good and provides some equivalent of fdatasync() on Linux
// therefore, we donot need separate flag for metadata
pending_sync_ = false;
mapped_begin_ = nullptr;
mapped_end_ = nullptr;
dst_ = nullptr;
last_sync_ = nullptr;
// Move on to the next portion of the file
file_offset_ += view_size_;
// Increase the amount we map the next time, but capped at 1MB
view_size_ *= 2;
view_size_ = std::min(view_size_, c_OneMB);
}
return status;
}
Status MapNewRegion() {
Status status;
assert(mapped_begin_ == nullptr);
size_t minMappingSize = file_offset_ + view_size_;
// Check if we need to create a new mapping since we want to write beyond the current one
// If the mapping view is now too short
// CreateFileMapping will extend the size of the file automatically if the mapping size is greater than
// the current length of the file, which reserves the space and makes writing faster, except, windows can not map an empty file.
// Thus the first time around we must actually extend the file ourselves
if (hMap_ == NULL || minMappingSize > mapping_size_) {
if (NULL == hMap_) {
// Creating mapping for the first time so reserve the space on disk
status = ReserveFileSpace(minMappingSize);
if (!status.ok()) {
return status;
}
}
if (hMap_) {
// Unmap the previous one
BOOL ret = ::CloseHandle(hMap_);
assert(ret);
hMap_ = NULL;
}
// Calculate the new mapping size which will hopefully reserve space for several consecutive sliding views
// Query preallocation block size if set
size_t preallocationBlockSize = 0;
size_t lastAllocatedBlockSize = 0; // Not used
GetPreallocationStatus(&preallocationBlockSize, &lastAllocatedBlockSize);
if (preallocationBlockSize) {
preallocationBlockSize = Roundup(preallocationBlockSize, allocation_granularity_);
} else {
preallocationBlockSize = 2 * view_size_;
}
mapping_size_ += preallocationBlockSize;
ULARGE_INTEGER mappingSize;
mappingSize.QuadPart = mapping_size_;
hMap_ = CreateFileMappingA(
hFile_,
NULL, // Security attributes
PAGE_READWRITE, // There is not a write only mode for mapping
mappingSize.HighPart, // Enable mapping the whole file but the actual amount mapped is determined by MapViewOfFile
mappingSize.LowPart,
NULL); // Mapping name
if (NULL == hMap_) {
return IOErrorFromWindowsError("WindowsMmapFile failed to create file mapping for: " + filename_, GetLastError());
}
}
ULARGE_INTEGER offset;
offset.QuadPart = file_offset_;
// View must begin at the granularity aligned offset
mapped_begin_ = reinterpret_cast<char*>(MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, view_size_, NULL));
if (!mapped_begin_) {
status = IOErrorFromWindowsError("WindowsMmapFile failed to map file view: " + filename_, GetLastError());
} else {
mapped_end_ = mapped_begin_ + view_size_;
dst_ = mapped_begin_;
last_sync_ = mapped_begin_;
pending_sync_ = false;
}
return status;
}
public:
WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, size_t allocation_granularity, const EnvOptions& options)
: filename_(fname),
hFile_(hFile),
hMap_(NULL),
page_size_(page_size),
allocation_granularity_(allocation_granularity),
mapping_size_(0),
view_size_(0),
mapped_begin_(nullptr),
mapped_end_(nullptr),
dst_(nullptr),
last_sync_(nullptr),
file_offset_(0),
pending_sync_(false) {
// Allocation granularity must be obtained from GetSystemInfo() and must be a power of two.
assert(allocation_granularity > 0);
assert((allocation_granularity & (allocation_granularity - 1)) == 0);
assert(page_size > 0);
assert((page_size & (page_size - 1)) == 0);
// Only for memory mapped writes
assert(options.use_mmap_writes);
// Make sure buffering is not disabled. It is ignored for mapping
// purposes but also imposes restriction on moving file position
// it is not a problem so much with reserving space since it is probably a factor
// of allocation_granularity but we also want to truncate the file in Close() at
// arbitrary position so we do not have to feel this with zeros.
assert(options.use_os_buffer);
// View size must be both the multiple of allocation_granularity AND the page size
if ((allocation_granularity_ % page_size_) == 0) {
view_size_ = 2 * allocation_granularity;
} else if ((page_size_ % allocation_granularity_) == 0) {
view_size_ = 2 * page_size_;
} else {
// we can multiply them together
assert(false);
}
}
~WinMmapFile() {
if (hFile_) {
this->Close();
}
}
virtual Status Append(const Slice& data) override {
const char* src = data.data();
size_t left = data.size();
while (left > 0) {
assert(mapped_begin_ <= dst_);
size_t avail = mapped_end_ - dst_;
if (avail == 0) {
Status s = UnmapCurrentRegion();
if (s.ok()) {
s = MapNewRegion();
}
if (!s.ok()) {
return s;
}
}
size_t n = std::min(left, avail);
memcpy(dst_, src, n);
IOSTATS_ADD(bytes_written, n);
dst_ += n;
src += n;
left -= n;
pending_sync_ = true;
}
return Status::OK();
}
virtual Status Close() override {
Status s;
assert(NULL != hFile_);
// We truncate to the precise size so no
// uninitialized data at the end. SetEndOfFile
// which we use does not write zeros and it is good.
uint64_t targetSize = GetFileSize();
s = UnmapCurrentRegion();
if (NULL != hMap_ ) {
BOOL ret = ::CloseHandle(hMap_);
if (!ret && s.ok()) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("Failed to Close mapping for file: " + filename_, lastError);
}
hMap_ = NULL;
}
TruncateFile(targetSize);
BOOL ret = ::CloseHandle(hFile_);
hFile_ = NULL;
if (!ret && s.ok()) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("Failed to close file map handle: " + filename_, lastError);
}
return s;
}
virtual Status Flush() override {
return Status::OK();
}
// Flush only data
virtual Status Sync() override {
Status s;
// Some writes occurred since last sync
if (pending_sync_) {
assert(mapped_begin_);
assert(dst_);
assert(dst_ > mapped_begin_);
assert(dst_ < mapped_end_);
size_t page_begin = TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
size_t page_end = TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
last_sync_ = dst_;
// Flush only the amount of that is a multiple of pages
if(!::FlushViewOfFile(mapped_begin_ + page_begin, (page_end - page_begin) + page_size_)) {
s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, GetLastError());
}
pending_sync_ = false;
}
return s;
}
/**
* Flush data as well as metadata to stable storage.
*/
virtual Status Fsync() override {
Status s;
// Flush metadata if pending
const bool pending = pending_sync_;
s = Sync();
// Flush metadata
if (s.ok() && pending) {
if (!::FlushFileBuffers(hFile_)) {
s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, GetLastError());
}
}
return s;
}
/**
* Get the size of valid data in the file. This will not match the
* size that is returned from the filesystem because we use mmap
* to extend file by map_size every time.
*/
virtual uint64_t GetFileSize() override {
size_t used = dst_ - mapped_begin_;
return file_offset_ + used;
}
virtual Status InvalidateCache(size_t offset, size_t length) override {
return Status::OK();
}
virtual Status Allocate(off_t offset, off_t len) override {
return Status::OK();
}
};
// This class is to manage an aligned user
// allocated buffer for unbuffered I/O purposes
// though it does not make a difference if you need a buffer.
class AlignedBuffer {
const size_t alignment_;
std::unique_ptr<char[]> buf_;
size_t capacity_;
size_t cursize_;
char* bufstart_;
public:
explicit AlignedBuffer(size_t alignment) :
alignment_(alignment),
capacity_(0),
cursize_(0),
bufstart_(nullptr) {
assert(alignment > 0);
assert((alignment & (alignment - 1)) == 0);
}
size_t GetAlignment() const {
return alignment_;
}
size_t GetCapacity() const {
return capacity_;
}
size_t GetCurrentSize() const {
return cursize_;
}
const char* GetBufferStart() const {
return bufstart_;
}
void Clear() {
cursize_ = 0;
}
// Allocates a new buffer and sets bufstart_ to the aligned first byte
void AllocateNewBuffer(size_t requestedCapacity) {
size_t size = Roundup(requestedCapacity, alignment_);
buf_.reset(new char[size + alignment_]);
char* p = buf_.get();
bufstart_ = reinterpret_cast<char*>((reinterpret_cast<uintptr_t>(p)+(alignment_ - 1)) & ~static_cast<uintptr_t>(alignment_ - 1));
capacity_ = size;
cursize_ = 0;
}
// Used for write
// Returns the number of bytes appended
size_t Append(const char* src, size_t append_size) {
size_t buffer_remaining = capacity_ - cursize_;
size_t to_copy = std::min(append_size, buffer_remaining);
if (to_copy > 0) {
memcpy(bufstart_ + cursize_, src, to_copy);
cursize_ += to_copy;
}
return to_copy;
}
size_t Read(char* dest, size_t offset, size_t read_size) const {
assert(offset < cursize_);
size_t to_read = std::min(cursize_ - offset, read_size);
if (to_read > 0) {
memcpy(dest, bufstart_ + offset, to_read);
}
return to_read;
}
/// Pad to alignment
void PadToAlignmentWith(int padding) {
size_t total_size = Roundup(cursize_, alignment_);
size_t pad_size = total_size - cursize_;
if (pad_size > 0) {
assert((pad_size + cursize_) <= capacity_);
memset(bufstart_ + cursize_, padding, pad_size);
cursize_ += pad_size;
}
}
// After a partial flush move the tail to the beginning of the buffer
void RefitTail(size_t tail_offset, size_t tail_size) {
if (tail_size > 0) {
memmove(bufstart_, bufstart_ + tail_offset, tail_size);
}
cursize_ = tail_size;
}
// Returns place to start writing
char* GetDestination() {
return bufstart_ + cursize_;
}
void SetSize(size_t cursize) {
cursize_ = cursize;
}
};
class WinSequentialFile: public SequentialFile {
private:
const std::string filename_;
FILE* file_;
int fd_;
bool use_os_buffer_;
public:
WinSequentialFile(const std::string& fname, FILE* f, const EnvOptions& options) :
filename_(fname), file_(f), fd_(fileno(f)), use_os_buffer_(options.use_os_buffer) {
}
virtual ~WinSequentialFile() {
assert(file_ != nullptr);
fclose(file_);
}
virtual Status Read(size_t n, Slice* result, char* scratch) override {
Status s;
size_t r = 0;
// read() and fread() as well as write/fwrite do not guarantee
// to fullfil the entire request in one call thus the loop.
do {
r = fread(scratch, 1, n, file_);
} while (r == 0 && ferror(file_));
IOSTATS_ADD(bytes_read, r);
*result = Slice(scratch, r);
if (r < n) {
if (feof(file_)) {
// We leave status as ok if we hit the end of the file
// We also clear the error so that the reads can continue
// if a new data is written to the file
clearerr(file_);
} else {
// A partial read with an error: return a non-ok status
s = Status::IOError(filename_, strerror(errno));
}
}
return s;
}
virtual Status Skip(uint64_t n) override {
if (fseek(file_, n, SEEK_CUR)) {
return IOError(filename_, errno);
}
return Status::OK();
}
virtual Status InvalidateCache(size_t offset, size_t length) override {
return Status::OK();
}
};
// pread() based random-access
class WinRandomAccessFile: public RandomAccessFile {
const std::string filename_;
HANDLE hFile_;
const bool use_os_buffer_;
mutable std::mutex buffer_mut_;
mutable AlignedBuffer buffer_;
mutable uint64_t buffered_start_; // file offset set that is currently buffered
public:
WinRandomAccessFile(const std::string& fname,
HANDLE hFile,
size_t alignment,
const EnvOptions& options) :
filename_(fname),
hFile_(hFile),
use_os_buffer_(options.use_os_buffer),
buffer_(alignment),
buffered_start_(0) {
assert(!options.use_mmap_reads);
// Unbuffered access, use internal buffer for reads
if (!use_os_buffer_) {
// Random read, no need in a big buffer
// We read things in database blocks which are likely to be similar to
// the alignment we use.
buffer_.AllocateNewBuffer(alignment * 2);
}
}
virtual ~WinRandomAccessFile() {
if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) {
::CloseHandle(hFile_);
}
}
virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override {
Status s;
SSIZE_T r = -1;
size_t left = n;
char* dest = scratch;
// When in unbuffered mode we need to do the following changes:
// - use our own aligned buffer
// - always read at the offset of that is a multiple of alignment
if (!use_os_buffer_) {
std::lock_guard<std::mutex> lg(buffer_mut_);
// Let's see if at least some of the requested data is already
// in the buffer
if (offset >= buffered_start_ &&
offset < (buffered_start_ + buffer_.GetCurrentSize())) {
size_t buffer_offset = offset - buffered_start_;
r = buffer_.Read(dest, buffer_offset, left);
assert(r >= 0);
left -= size_t(r);
offset += r;
dest += r;
}
// Still some left or none was buffered
if (left > 0) {
// Figure out the start/end offset for reading and amount to read
const size_t alignment = buffer_.GetAlignment();
const size_t start_page_start = TruncateToPageBoundary(alignment, offset);
const size_t end_page_start = TruncateToPageBoundary(alignment, offset + left - 1);
const size_t actual_bytes_toread = (end_page_start - start_page_start) + alignment;
if (buffer_.GetCapacity() < actual_bytes_toread) {
buffer_.AllocateNewBuffer(actual_bytes_toread);
} else {
buffer_.Clear();
}
SSIZE_T read = 0;
{
IOSTATS_TIMER_GUARD(read_nanos);
read = pread(hFile_, buffer_.GetDestination(), actual_bytes_toread, start_page_start);
}
if (read > 0) {
buffer_.SetSize(read);
buffered_start_ = start_page_start;
// Let's figure out how much we read from the users standpoint
if ((buffered_start_ + uint64_t(read)) > offset) {
size_t buffer_offset = offset - buffered_start_;
r = buffer_.Read(dest, buffer_offset, left);
} else {
r = 0;
}
left -= r;
} else {
r = read;
}
}
} else {
r = pread(hFile_, scratch, left, offset);
if (r > 0) {
left -= r;
}
}
IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left);
*result = Slice(scratch, (r < 0) ? 0 : n - left);
if (r < 0) {
s = IOErrorFromLastWindowsError(filename_);
}
return s;
}
virtual void Hint(AccessPattern pattern) override {
}
virtual Status InvalidateCache(size_t offset, size_t length) override {
return Status::OK();
}
};
// This is a sequential write class. It has been mimicked (as others) after
// the original Posix class. We add support for unbuffered I/O on windows as well
// we utilize the original buffer as an alignment buffer to write directly to file with no buffering.
// No buffering requires that the provided buffer is aligned to the physical sector size (SSD page size) and
// that all SetFilePointer() operations to occur with such an alignment.
// We thus always write in sector/page size increments to the drive and leave
// the tail for the next write OR for Close() at which point we pad with zeros. No padding is required for
// buffered access.
class WinWritableFile : public WritableFile {
private:
const std::string filename_;
HANDLE hFile_;
AlignedBuffer buffer_;
uint64_t filesize_; // How much data is actually written disk
uint64_t reservedsize_; // how far we have reserved space
bool pending_sync_;
RateLimiter* rate_limiter_;
const bool use_os_buffer_; // Used to indicate unbuffered access, the file must be opened as unbuffered if false
public:
WinWritableFile(const std::string& fname,
HANDLE hFile,
size_t alignment,
size_t capacity,
const EnvOptions& options) :
filename_(fname),
hFile_(hFile),
buffer_(alignment),
filesize_(0),
reservedsize_(0),
pending_sync_(false),
rate_limiter_(options.rate_limiter),
use_os_buffer_(options.use_os_buffer) {
assert(!options.use_mmap_writes);
buffer_.AllocateNewBuffer(capacity);
}
~WinWritableFile() {
if (NULL != hFile_ && INVALID_HANDLE_VALUE != hFile_) {
WinWritableFile::Close();
}
}
virtual Status Append(const Slice& data) override {
const char* src = data.data();
assert(data.size() < INT_MAX);
size_t left = data.size();
Status s;
pending_sync_ = true;
// This would call Alloc() if we are out of blocks
PrepareWrite(GetFileSize(), left);
// Flush only when I/O is buffered
if (use_os_buffer_ &&
(buffer_.GetCapacity() - buffer_.GetCurrentSize()) < left) {
if (buffer_.GetCurrentSize() > 0) {
s = Flush();
if (!s.ok()) {
return s;
}
}
if (buffer_.GetCapacity() < c_OneMB) {
size_t desiredCapacity = buffer_.GetCapacity() * 2;
desiredCapacity = std::min(desiredCapacity, c_OneMB);
buffer_.AllocateNewBuffer(desiredCapacity);
}
}
// We always use the internal buffer for the unbuffered I/O
// or we simply use it for its original purpose to accumulate many small chunks
if (!use_os_buffer_ || (buffer_.GetCapacity() >= left)) {
while (left > 0) {
size_t appended = buffer_.Append(src, left);
left -= appended;
src += appended;
if (left > 0) {
s = Flush();
if (!s.ok()) {
break;
}
size_t cursize = buffer_.GetCurrentSize();
size_t capacity = buffer_.GetCapacity();
// We double the buffer here because
// Flush calls do not keep up with the incoming bytes
// This is the only place when buffer is changed with unbuffered I/O
if (cursize == 0 && capacity < c_OneMB) {
size_t desiredCapacity = capacity * 2;
desiredCapacity = std::min(desiredCapacity, c_OneMB);
buffer_.AllocateNewBuffer(desiredCapacity);
}
}
}
} else {
// Writing directly to file bypassing what is in the buffer
assert(buffer_.GetCurrentSize() == 0);
// Use rate limiter for normal I/O very large request if available
s = WriteBuffered(src, left);
}
return s;
}
virtual Status Close() override {
Status s;
// If there is any data in the cache not written we need to deal with it
const size_t cursize = buffer_.GetCurrentSize();
const uint64_t final_size = filesize_ + cursize;
if (cursize > 0) {
// If OS buffering is on, we just flush the remainder, otherwise need
if (!use_os_buffer_) {
s = WriteUnbuffered();
} else {
s = WriteBuffered(buffer_.GetBufferStart(), cursize);
}
}
if (s.ok()) {
s = ftruncate(filename_, hFile_, final_size);
}
// Sync data if buffer was flushed
if (s.ok() && (cursize > 0) && fsync(hFile_) < 0) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("fsync failed at Close() for: " + filename_, lastError);
}
if (FALSE == ::CloseHandle(hFile_)) {
if (s.ok()) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("CloseHandle failed for: " + filename_, lastError);
}
}
hFile_ = INVALID_HANDLE_VALUE;
return s;
}
// write out the cached data to the OS cache
virtual Status Flush() override {
Status status;
if (buffer_.GetCurrentSize() > 0) {
if(!use_os_buffer_) {
status = WriteUnbuffered();
} else {
status = WriteBuffered(buffer_.GetBufferStart(), buffer_.GetCurrentSize());
if (status.ok()) {
buffer_.SetSize(0);
}
}
}
return status;
}
virtual Status Sync() override {
Status s = Flush();
if (!s.ok()) {
return s;
}
// Calls flush buffers
if (pending_sync_ && fsync(hFile_) < 0) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("fsync failed at Sync() for: " + filename_, lastError);
} else {
pending_sync_ = false;
}
return s;
}
virtual Status Fsync() override {
return Sync();
}
virtual uint64_t GetFileSize() override {
return filesize_ + buffer_.GetCurrentSize();
}
virtual Status Allocate(off_t offset, off_t len) override {
Status status;
TEST_KILL_RANDOM(rocksdb_kill_odds);
// Make sure that we reserve an aligned amount of space
// since the reservation block size is driven outside so we want
// to check if we are ok with reservation here
size_t spaceToReserve = Roundup(offset + len, buffer_.GetAlignment());
// Nothing to do
if (spaceToReserve <= reservedsize_) {
return status;
}
IOSTATS_TIMER_GUARD(allocate_nanos);
status = fallocate(filename_, hFile_, spaceToReserve);
if (status.ok()) {
reservedsize_ = spaceToReserve;
}
return status;
}
private:
// This method writes to disk the specified data and makes use of the rate limiter
// if available
Status WriteBuffered(const char* data, size_t size) {
Status s;
assert(use_os_buffer_);
const char* src = data;
size_t left = size;
size_t actually_written = 0;
while (left > 0) {
size_t bytes_allowed = RequestToken(left, false);
DWORD bytesWritten = 0;
if (!WriteFile(hFile_, src, bytes_allowed, &bytesWritten, NULL)) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("Failed to write buffered via rate_limiter: " + filename_, lastError);
break;
} else {
actually_written += bytesWritten;
src += bytesWritten;
left -= bytesWritten;
}
}
IOSTATS_ADD(bytes_written, actually_written);
filesize_ += actually_written;
return s;
}
// This flushes the accumulated data in the buffer. We pad data with zeros if necessary to the whole page.
// However, during automatic flushes padding would not be necessary.
// We always use RateLimiter if available. We move (Refit) any buffer bytes that are left over the
// whole number of pages to be written again on the next flush because we can only write on aligned
// offsets.
Status WriteUnbuffered() {
Status s;
assert(!use_os_buffer_);
size_t alignment = buffer_.GetAlignment();
assert((filesize_ % alignment) == 0);
// Calculate whole page final file advance if all writes succeed
size_t file_advance = TruncateToPageBoundary(alignment, buffer_.GetCurrentSize());
// Calculate the leftover tail, we write it here padded with zeros BUT we will write
// it again in the future either on Close() OR when the current whole page fills out
size_t leftover_tail = buffer_.GetCurrentSize() - file_advance;
// Round up and pad
buffer_.PadToAlignmentWith(0);
const char* src = buffer_.GetBufferStart();
size_t left = buffer_.GetCurrentSize();
uint64_t file_offset = filesize_;
size_t actually_written = 0;
while (left > 0) {
// Request how much is allowed. If this is less than one alignment we may be blocking a lot on every write
// because we can not write less than one alignment (page) unit thus check the configuration.
size_t bytes_allowed = RequestToken(left, true);
SSIZE_T ret = pwrite(hFile_, buffer_.GetBufferStart() + actually_written, bytes_allowed, file_offset);
// Error break
if (ret < 0) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("Failed to pwrite for unbuffered: " + filename_, lastError);
buffer_.SetSize(file_advance + leftover_tail);
break;
}
actually_written += ret;
file_offset += ret;
left -= ret;
}
IOSTATS_ADD(bytes_written, actually_written);
if (s.ok()) {
// Move the tail to the beginning of the buffer
// This never happens during normal Append but rather during
// explicit call to Flush()/Sync() or Close()
buffer_.RefitTail(file_advance, leftover_tail);
// This is where we start writing next time which may or not be
// the actual file size on disk. They match if the buffer size
// is a multiple of whole pages otherwise filesize_ is leftover_tail behind
filesize_ += file_advance;
}
return s;
}
// This truncates the request to a single burst bytes
// and then goes through the request to make sure we are
// satisfied in the order of the I/O priority
size_t RequestToken(size_t bytes, bool align) const {
if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) {
bytes = std::min(bytes,
static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()));
if (align) {
// Here we may actually require more than burst and block
// but we can not write less than one page at a time on unbuffered
// thus we may want not to use ratelimiter s
size_t alignment = buffer_.GetAlignment();
bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
}
rate_limiter_->Request(bytes, io_priority_);
}
return bytes;
}
};
class WinDirectory : public Directory {
public:
WinDirectory() {
}
virtual Status Fsync() override {
return Status::OK();
}
};
class WinFileLock : public FileLock {
public:
explicit WinFileLock(HANDLE hFile) :
hFile_(hFile) {
assert(hFile != NULL);
assert(hFile != INVALID_HANDLE_VALUE);
}
~WinFileLock() {
BOOL ret = ::CloseHandle(hFile_);
assert(ret);
}
private:
HANDLE hFile_;
};
namespace
{
void WinthreadCall(const char* label, std::error_code result) {
if (0 != result.value()) {
fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value()));
abort();
}
}
}
class WinEnv : public Env {
public:
WinEnv();
virtual ~WinEnv() {
for (auto& th : threads_to_join_) {
th.join();
}
threads_to_join_.clear();
for (auto& thpool : thread_pools_) {
thpool.JoinAllThreads();
}
// All threads must be joined before the deletion of
// thread_status_updater_.
delete thread_status_updater_;
}
virtual Status DeleteFile(const std::string& fname) override {
Status result;
if (_unlink(fname.c_str())) {
result = IOError("Failed to delete: " + fname, errno);
}
return result;
}
Status GetCurrentTime(int64_t* unix_time) override {
time_t time = std::time(nullptr);
if (time == (time_t)(-1)) {
return Status::NotSupported("Failed to get time");
}
*unix_time = time;
return Status::OK();
}
virtual Status NewSequentialFile(const std::string& fname, std::unique_ptr<SequentialFile>* result, const EnvOptions& options) override {
Status s;
result->reset();
// Corruption test needs to rename and delete files of these kind
// while they are still open with another handle. For that reason we
// allow share_write and delete(allows rename).
HANDLE hFile = 0;
{
IOSTATS_TIMER_GUARD(open_nanos);
hFile = CreateFileA(fname.c_str(),
GENERIC_READ,
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
NULL,
OPEN_EXISTING, // Original fopen mode is "rb"
FILE_ATTRIBUTE_NORMAL,
NULL);
}
if (INVALID_HANDLE_VALUE == hFile) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("Failed to open NewSequentialFile" + fname, lastError);
} else {
int fd = _open_osfhandle(reinterpret_cast<intptr_t>(hFile), 0);
if (fd == -1) {
auto code = errno;
CloseHandle(hFile);
s = IOError("Failed to _open_osfhandle for NewSequentialFile: " + fname, code);
} else {
FILE * file = _fdopen(fd, "rb");
if (file == nullptr) {
auto code = errno;
_close(fd);
s = IOError("Failed to fdopen NewSequentialFile: " + fname, code);
} else {
result->reset(new WinSequentialFile(fname, file, options));
}
}
}
return s;
}
virtual Status NewRandomAccessFile(const std::string& fname, std::unique_ptr<RandomAccessFile>* result, const EnvOptions& options) override {
result->reset();
Status s;
// Open the file for read-only random access
// Random access is to disable read-ahead as the system reads too much data
DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
if (!options.use_os_buffer && !options.use_mmap_reads) {
fileFlags |= FILE_FLAG_NO_BUFFERING;
} else {
fileFlags |= FILE_FLAG_RANDOM_ACCESS;
}
/// Shared access is necessary for corruption test to pass
// almost all tests would work with a possible exception of fault_injection
HANDLE hFile = 0;
{
IOSTATS_TIMER_GUARD(open_nanos);
hFile = CreateFileA(
fname.c_str(),
GENERIC_READ,
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
NULL,
OPEN_EXISTING,
fileFlags,
NULL);
}
if (INVALID_HANDLE_VALUE == hFile) {
auto lastError = GetLastError();
return IOErrorFromWindowsError("NewRandomAccessFile failed to Create/Open: " + fname, lastError);
}
UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
// CAUTION! This will map the entire file into the process address space
if (options.use_mmap_reads && sizeof(void*) >= 8) {
// Use mmap when virtual address-space is plentiful.
uint64_t fileSize;
s = GetFileSize(fname, &fileSize);
if (s.ok()) {
// Will not map empty files
if (fileSize == 0) {
return IOError("NewRandomAccessFile failed to map empty file: " + fname, EINVAL);
}
HANDLE hMap = CreateFileMappingA(hFile,
NULL,
PAGE_READONLY,
0, // Whole file at its present length
0,
NULL); // Mapping name
if (!hMap) {
auto lastError = GetLastError();
return IOErrorFromWindowsError("Failed to create file mapping for NewRandomAccessFile: " + fname, lastError);
}
UniqueCloseHandlePtr mapGuard (hMap, CloseHandleFunc);
const void* mapped_region = MapViewOfFileEx(hMap,
FILE_MAP_READ,
0, // High DWORD of access start
0, // Low DWORD
fileSize,
NULL); // Let the OS choose the mapping
if (!mapped_region) {
auto lastError = GetLastError();
return IOErrorFromWindowsError("Failed to MapViewOfFile for NewRandomAccessFile: " + fname, lastError);
}
result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region, fileSize));
mapGuard.release();
fileGuard.release();
}
}
else {
result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options));
fileGuard.release();
}
return s;
}
virtual Status NewWritableFile(const std::string& fname, std::unique_ptr<WritableFile>* result, const EnvOptions& options) override {
const size_t c_BufferCapacity = 64 * 1024;
EnvOptions local_options(options);
result->reset();
Status s;
DWORD fileFlags = FILE_ATTRIBUTE_NORMAL;
if (!local_options.use_os_buffer && !local_options.use_mmap_writes) {
fileFlags = FILE_FLAG_NO_BUFFERING;
}
// Desired access. We are want to write only here but if we want to memory map
// the file then there is no write only mode so we have to create it Read/Write
// However, MapViewOfFile specifies only Write only
DWORD desired_access = GENERIC_WRITE;
DWORD shared_mode = FILE_SHARE_READ;
if (local_options.use_mmap_writes) {
desired_access |= GENERIC_READ;
} else {
// Adding this solely for tests to pass (fault_injection_test, wal_manager_test).
shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE);
}
HANDLE hFile = 0;
{
IOSTATS_TIMER_GUARD(open_nanos);
hFile = CreateFileA(fname.c_str(),
desired_access, // Access desired
shared_mode,
NULL, // Security attributes
CREATE_ALWAYS, // Posix env says O_CREAT | O_RDWR | O_TRUNC
fileFlags, // Flags
NULL); // Template File
}
if (INVALID_HANDLE_VALUE == hFile) {
auto lastError = GetLastError();
return IOErrorFromWindowsError("Failed to create a NewWriteableFile: " + fname, lastError);
}
if (options.use_mmap_writes) {
// We usually do not use mmmapping on SSD and thus we pass memory page_size
result->reset(new WinMmapFile(fname, hFile, page_size_, allocation_granularity_, local_options));
} else {
// Here we want the buffer allocation to be aligned by the SSD page size and to be a multiple of it
result->reset(new WinWritableFile(fname, hFile, page_size_, c_BufferCapacity, local_options));
}
return s;
}
virtual Status NewRandomRWFile(const std::string& fname, std::unique_ptr<RandomRWFile>* result,
const EnvOptions& options) override {
result->reset();
// no support for mmap yet (same as POSIX env)
if (options.use_mmap_writes || options.use_mmap_reads) {
return Status::NotSupported("No support for mmap read/write yet");
}
Status s;
HANDLE hFile = 0;
{
IOSTATS_TIMER_GUARD(open_nanos);
hFile = CreateFileA(fname.c_str(),
GENERIC_READ | GENERIC_WRITE,
FILE_SHARE_READ,
NULL,
OPEN_ALWAYS, // Posix env specifies O_CREAT, it will open existing file or create new
FILE_ATTRIBUTE_NORMAL,
NULL);
}
if (INVALID_HANDLE_VALUE == hFile) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("Failed to Open/Create NewRandomRWFile" + fname, lastError);
}
else {
result->reset(new WinRandomRWFile(fname, hFile, options));
}
return s;
}
virtual Status NewDirectory(const std::string& name, std::unique_ptr<Directory>* result) override {
Status s;
// Must be nullptr on failure
result->reset();
// Must fail if directory does not exist
if (!DirExists(name)) {
s = IOError("Directory does not exist: " + name, EEXIST);
} else {
IOSTATS_TIMER_GUARD(open_nanos);
result->reset(new WinDirectory);
}
return s;
}
virtual bool FileExists(const std::string& fname) override {
// F_OK == 0
const int F_OK_ = 0;
return _access(fname.c_str(), F_OK_) == 0;
}
virtual Status GetChildren(const std::string& dir, std::vector<std::string>* result) override {
std::vector<std::string> output;
Status status;
auto CloseDir = [](DIR* p) { closedir(p); };
std::unique_ptr<DIR, decltype(CloseDir)> dirp(opendir(dir.c_str()), CloseDir);
if (!dirp) {
status = IOError(dir, errno);
}
else {
if (result->capacity() > 0) {
output.reserve(result->capacity());
}
struct dirent* ent = readdir(dirp.get());
while (ent) {
output.push_back(ent->d_name);
ent = readdir(dirp.get());
}
}
output.swap(*result);
return status;
}
virtual Status CreateDir(const std::string& name) override {
Status result;
if (_mkdir(name.c_str()) != 0) {
auto code = errno;
result = IOError("Failed to create dir: " + name, code);
}
return result;
}
virtual Status CreateDirIfMissing(const std::string& name) override {
Status result;
if (DirExists(name)) {
return result;
}
if (_mkdir(name.c_str()) != 0) {
if (errno == EEXIST) {
result = Status::IOError("`" + name + "' exists but is not a directory");
} else {
auto code = errno;
result = IOError("Failed to create dir: " + name, code);
}
}
return result;
}
virtual Status DeleteDir(const std::string& name) override {
Status result;
if (_rmdir(name.c_str()) != 0) {
auto code = errno;
result = IOError("Failed to remove dir: " + name, code);
}
return result;
}
virtual Status GetFileSize(const std::string& fname, uint64_t* size) override {
Status s;
WIN32_FILE_ATTRIBUTE_DATA attrs;
if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) {
ULARGE_INTEGER file_size;
file_size.HighPart = attrs.nFileSizeHigh;
file_size.LowPart = attrs.nFileSizeLow;
*size = file_size.QuadPart;
} else {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("Can not get size for: " + fname, lastError);
}
return s;
}
static inline uint64_t FileTimeToUnixTime(const FILETIME& ftTime) {
const uint64_t c_FileTimePerSecond = 10000000U;
// UNIX epoch starts on 1970-01-01T00:00:00Z
// Windows FILETIME starts on 1601-01-01T00:00:00Z
// Therefore, we need to subtract the below number of seconds from
// the seconds that we obtain from FILETIME with an obvious loss of precision
const uint64_t c_SecondBeforeUnixEpoch = 11644473600U;
ULARGE_INTEGER li;
li.HighPart = ftTime.dwHighDateTime;
li.LowPart = ftTime.dwLowDateTime;
uint64_t result = (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch;
return result;
}
virtual Status GetFileModificationTime(const std::string& fname, uint64_t* file_mtime) override {
Status s;
WIN32_FILE_ATTRIBUTE_DATA attrs;
if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) {
*file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime);
} else {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("Can not get file modification time for: " + fname, lastError);
*file_mtime = 0;
}
return s;
}
virtual Status RenameFile(const std::string& src, const std::string& target) override {
Status result;
// rename() is not capable of replacing the existing file as on Linux
// so use OS API directly
if (!MoveFileExA(src.c_str(), target.c_str(), MOVEFILE_REPLACE_EXISTING)) {
DWORD lastError = GetLastError();
std::string text("Failed to rename: ");
text.append(src).append(" to: ").append(target);
result = IOErrorFromWindowsError(text, lastError);
}
return result;
}
virtual Status LinkFile(const std::string& src, const std::string& target) override {
Status result;
if (!CreateHardLinkA(target.c_str(), src.c_str(), NULL)) {
DWORD lastError = GetLastError();
std::string text("Failed to link: ");
text.append(src).append(" to: ").append(target);
result = IOErrorFromWindowsError(text, lastError);
}
return result;
}
virtual Status LockFile(const std::string& lockFname, FileLock** lock) override {
assert(lock != nullptr);
*lock = NULL;
Status result;
// No-sharing, this is a LOCK file
const DWORD ExclusiveAccessON = 0;
// Obtain exclusive access to the LOCK file
// Previously, instead of NORMAL attr we set DELETE on close and that worked
// well except with fault_injection test that insists on deleting it.
HANDLE hFile = 0;
{
IOSTATS_TIMER_GUARD(open_nanos);
hFile = CreateFileA(lockFname.c_str(), (GENERIC_READ | GENERIC_WRITE),
ExclusiveAccessON, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
}
if (INVALID_HANDLE_VALUE == hFile) {
auto lastError = GetLastError();
result = IOErrorFromWindowsError("Failed to create lock file: " + lockFname, lastError);
} else {
*lock = new WinFileLock(hFile);
}
return result;
}
virtual Status UnlockFile(FileLock* lock) override {
Status result;
assert(lock != nullptr);
delete lock;
return result;
}
virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW, void* tag = nullptr) override;
virtual int UnSchedule(void* arg, Priority pri) override;
virtual void StartThread(void (*function)(void* arg), void* arg) override;
virtual void WaitForJoin() override;
virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override;
virtual Status GetTestDirectory(std::string* result) override {
std::string output;
const char* env = getenv("TEST_TMPDIR");
if (env && env[0] != '\0') {
output = env;
CreateDir(output);
} else {
env = getenv("TMP");
if (env && env[0] != '\0') {
output = env;
} else {
output = "c:\\tmp";
}
CreateDir(output);
}
output.append("\\testrocksdb-");
output.append(std::to_string(_getpid()));
CreateDir(output);
output.swap(*result);
return Status::OK();
}
virtual Status GetThreadList(
std::vector<ThreadStatus>* thread_list) override {
assert(thread_status_updater_);
return thread_status_updater_->GetThreadList(thread_list);
}
static uint64_t gettid() {
uint64_t thread_id = GetCurrentThreadId();
return thread_id;
}
virtual uint64_t GetThreadID() const override {
return gettid();
}
virtual Status NewLogger(const std::string& fname, std::shared_ptr<Logger>* result) override {
Status s;
result->reset();
HANDLE hFile = 0;
{
IOSTATS_TIMER_GUARD(open_nanos);
hFile = CreateFileA(fname.c_str(),
GENERIC_WRITE,
FILE_SHARE_READ | FILE_SHARE_DELETE, // In RocksDb log files are renamed and deleted before they are closed. This enables doing so.
NULL,
CREATE_ALWAYS, // Original fopen mode is "w"
FILE_ATTRIBUTE_NORMAL,
NULL);
}
if (INVALID_HANDLE_VALUE == hFile) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("Failed to open LogFile" + fname, lastError);
} else {
{
// With log files we want to set the true creation time as of now because the system
// for some reason caches the attributes of the previous file that just been renamed from
// this name so auto_roll_logger_test fails
FILETIME ft;
GetSystemTimeAsFileTime(&ft);
// Set creation, last access and last write time to the same value
SetFileTime(hFile, &ft, &ft, &ft);
}
int fd = _open_osfhandle(reinterpret_cast<intptr_t>(hFile), 0);
if (fd == -1) {
auto code = errno;
CloseHandle(hFile);
s = IOError("Failed to _open_osfhandle: " + fname, code);
} else {
FILE * file = _fdopen(fd, "w");
if (file == nullptr) {
auto code = errno;
_close(fd);
s = IOError("Failed to fdopen: " + fname, code);
} else {
result->reset(new WinLogger(&WinEnv::gettid, this, file));
}
}
}
return s;
}
virtual uint64_t NowMicros() override {
using namespace std::chrono;
return duration_cast<microseconds>(system_clock::now().time_since_epoch()).count();
}
virtual uint64_t NowNanos() override {
// all std::chrono clocks on windows have the same resolution that is only
// good enough for microseconds but not nanoseconds
// On Windows 8 and Windows 2012 Server
// GetSystemTimePreciseAsFileTime(&current_time) can be used
LARGE_INTEGER li;
QueryPerformanceCounter(&li);
// Convert to nanoseconds first to avoid loss of precision
// and divide by frequency
li.QuadPart *= std::nano::den;
li.QuadPart /= perf_counter_frequency_;
return li.QuadPart;
}
virtual void SleepForMicroseconds(int micros) override {
std::this_thread::sleep_for(std::chrono::microseconds(micros));
}
virtual Status GetHostName(char* name, uint64_t len) override {
Status s;
DWORD nSize = len;
if (!::GetComputerNameA(name, &nSize)) {
auto lastError = GetLastError();
s = IOErrorFromWindowsError("GetHostName", lastError);
} else {
name[nSize] = 0;
}
return s;
}
virtual Status GetCurrTime(int64_t* unix_time) {
Status s;
time_t ret = time(nullptr);
if (ret == (time_t) -1) {
*unix_time = 0;
s = IOError("GetCurrTime", errno);
} else {
*unix_time = (int64_t)ret;
}
return s;
}
virtual Status GetAbsolutePath(const std::string& db_path, std::string* output_path) override {
// Check if we already have an absolute path
// that starts with non dot and has a semicolon in it
if ((!db_path.empty() &&
(db_path[0] == '/' || db_path[0] == '\\')) ||
(
db_path.size() > 2 &&
db_path[0] != '.' &&
((db_path[1] == ':' && db_path[2] == '\\') ||
(db_path[1] == ':' && db_path[2] == '/'))
)
) {
*output_path = db_path;
return Status::OK();
}
std::string result;
result.resize(_MAX_PATH);
char* ret = _getcwd(&result[0], _MAX_PATH);
if (ret == nullptr) {
return Status::IOError("Failed to get current working directory", strerror(errno));
}
result.resize(strlen(result.data()));
result.swap(*output_path);
return Status::OK();
}
// Allow increasing the number of worker threads.
virtual void SetBackgroundThreads(int num, Priority pri) override {
assert(pri >= Priority::LOW && pri <= Priority::HIGH);
thread_pools_[pri].SetBackgroundThreads(num);
}
virtual void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
assert(pri >= Priority::LOW && pri <= Priority::HIGH);
thread_pools_[pri].IncBackgroundThreadsIfNeeded(num);
}
virtual std::string TimeToString(uint64_t secondsSince1970) override {
std::string result;
const time_t seconds = secondsSince1970;
const int maxsize = 64;
struct tm t;
errno_t ret = localtime_s(&t, &seconds);
if (ret) {
result = std::to_string(seconds);
} else {
result.resize(maxsize);
char* p = &result[0];
int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
assert(len > 0);
result.resize(len);
}
return result;
}
EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, const DBOptions& db_options) const override {
EnvOptions optimized = env_options;
optimized.use_mmap_writes = false;
optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
optimized.use_os_buffer = true; // This is because we flush only whole pages on unbuffered io and the last records are not guaranteed to be flushed.
// TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
// test and make this false
optimized.fallocate_with_keep_size = true;
return optimized;
}
EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const override{
EnvOptions optimized = env_options;
optimized.use_mmap_writes = false;
optimized.use_os_buffer = true;
optimized.fallocate_with_keep_size = true;
return optimized;
}
private:
// Returns true iff the named directory exists and is a directory.
virtual bool DirExists(const std::string& dname) {
WIN32_FILE_ATTRIBUTE_DATA attrs;
if (GetFileAttributesExA(dname.c_str(), GetFileExInfoStandard, &attrs)) {
return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY);
}
return false;
}
bool SupportsFastAllocate(const std::string& /* path */) {
return false;
}
class ThreadPool {
public:
ThreadPool() :
total_threads_limit_(1),
bgthreads_(0),
queue_(),
queue_len_(0U),
exit_all_threads_(false),
low_io_priority_(false),
env_(nullptr) {
}
~ThreadPool() {
assert(bgthreads_.size() == 0U);
}
void JoinAllThreads() {
{
std::lock_guard<std::mutex> lock(mu_);
assert(!exit_all_threads_);
exit_all_threads_ = true;
bgsignal_.notify_all();
}
for (std::thread& th : bgthreads_) {
th.join();
}
// Subject to assert in the __dtor
bgthreads_.clear();
}
void SetHostEnv(Env* env) {
env_ = env;
}
// Return true if there is at least one thread needs to terminate.
bool HasExcessiveThread() const {
return bgthreads_.size() > total_threads_limit_;
}
// Return true iff the current thread is the excessive thread to terminate.
// Always terminate the running thread that is added last, even if there are
// more than one thread to terminate.
bool IsLastExcessiveThread(size_t thread_id) const {
return HasExcessiveThread() && thread_id == bgthreads_.size() - 1;
}
// Is one of the threads to terminate.
bool IsExcessiveThread(size_t thread_id) const {
return thread_id >= total_threads_limit_;
}
// Return the thread priority.
// This would allow its member-thread to know its priority.
Env::Priority GetThreadPriority() {
return priority_;
}
// Set the thread priority.
void SetThreadPriority(Env::Priority priority) {
priority_ = priority;
}
void BGThread(size_t thread_id) {
while (true) {
// Wait until there is an item that is ready to run
std::unique_lock<std::mutex> uniqueLock(mu_);
// Stop waiting if the thread needs to do work or needs to terminate.
while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) && (queue_.empty() || IsExcessiveThread(thread_id))) {
bgsignal_.wait(uniqueLock);
}
if (exit_all_threads_) {
// mechanism to let BG threads exit safely
uniqueLock.unlock();
break;
}
if (IsLastExcessiveThread(thread_id)) {
// Current thread is the last generated one and is excessive.
// We always terminate excessive thread in the reverse order of
// generation time.
std::thread& terminating_thread = bgthreads_.back();
auto tid = terminating_thread.get_id();
// Ensure that that this thread is ours
assert(tid == std::this_thread::get_id());
terminating_thread.detach();
bgthreads_.pop_back();
if (HasExcessiveThread()) {
// There is still at least more excessive thread to terminate.
WakeUpAllThreads();
}
uniqueLock.unlock();
PrintThreadInfo(thread_id, gettid());
break;
}
void(*function)(void*) = queue_.front().function;
void* arg = queue_.front().arg;
queue_.pop_front();
queue_len_.store(queue_.size(), std::memory_order_relaxed);
uniqueLock.unlock();
(*function)(arg);
}
}
// Helper struct for passing arguments when creating threads.
struct BGThreadMetadata {
ThreadPool* thread_pool_;
size_t thread_id_; // Thread count in the thread.
explicit BGThreadMetadata(ThreadPool* thread_pool, size_t thread_id) : thread_pool_(thread_pool), thread_id_(thread_id) {}
};
static void* BGThreadWrapper(void* arg) {
std::unique_ptr<BGThreadMetadata> meta(reinterpret_cast<BGThreadMetadata*>(arg));
size_t thread_id = meta->thread_id_;
ThreadPool* tp = meta->thread_pool_;
#if ROCKSDB_USING_THREAD_STATUS
// for thread-status
ThreadStatusUtil::RegisterThread(tp->env_,
(tp->GetThreadPriority() == Env::Priority::HIGH ?
ThreadStatus::HIGH_PRIORITY :
ThreadStatus::LOW_PRIORITY));
#endif
tp->BGThread(thread_id);
#if ROCKSDB_USING_THREAD_STATUS
ThreadStatusUtil::UnregisterThread();
#endif
return nullptr;
}
void WakeUpAllThreads() {
bgsignal_.notify_all();
}
void SetBackgroundThreadsInternal(size_t num, bool allow_reduce) {
std::lock_guard<std::mutex> lg(mu_);
if (exit_all_threads_) {
return;
}
if (num > total_threads_limit_ ||
(num < total_threads_limit_ && allow_reduce)) {
total_threads_limit_ = std::max(size_t(1), num);
WakeUpAllThreads();
StartBGThreads();
}
assert(total_threads_limit_ > 0);
}
void IncBackgroundThreadsIfNeeded(int num) {
SetBackgroundThreadsInternal(num, false);
}
void SetBackgroundThreads(int num) {
SetBackgroundThreadsInternal(num, true);
}
void StartBGThreads() {
// Start background thread if necessary
while (bgthreads_.size() < total_threads_limit_) {
std::thread p_t(&ThreadPool::BGThreadWrapper, new BGThreadMetadata(this, bgthreads_.size()));
bgthreads_.push_back(std::move(p_t));
}
}
void Schedule(void (*function)(void* arg1), void* arg, void* tag) {
std::lock_guard<std::mutex> lg(mu_);
if (exit_all_threads_) {
return;
}
StartBGThreads();
// Add to priority queue
queue_.push_back(BGItem());
queue_.back().function = function;
queue_.back().arg = arg;
queue_.back().tag = tag;
queue_len_.store(queue_.size(), std::memory_order_relaxed);
if (!HasExcessiveThread()) {
// Wake up at least one waiting thread.
bgsignal_.notify_one();
} else {
// Need to wake up all threads to make sure the one woken
// up is not the one to terminate.
WakeUpAllThreads();
}
}
int UnSchedule(void* arg) {
int count = 0;
std::lock_guard<std::mutex> lg(mu_);
// Remove from priority queue
BGQueue::iterator it = queue_.begin();
while (it != queue_.end()) {
if (arg == (*it).tag) {
it = queue_.erase(it);
count++;
} else {
++it;
}
}
queue_len_.store(queue_.size(), std::memory_order_relaxed);
return count;
}
unsigned int GetQueueLen() const {
return static_cast<unsigned int>(queue_len_.load(std::memory_order_relaxed));
}
private:
// Entry per Schedule() call
struct BGItem {
void* arg;
void (*function)(void*);
void* tag;
};
typedef std::deque<BGItem> BGQueue;
std::mutex mu_;
std::condition_variable bgsignal_;
size_t total_threads_limit_;
std::vector<std::thread> bgthreads_;
BGQueue queue_;
std::atomic_size_t queue_len_; // Queue length. Used for stats reporting
bool exit_all_threads_;
bool low_io_priority_;
Env::Priority priority_;
Env* env_;
};
bool checkedDiskForMmap_;
bool forceMmapOff; // do we override Env options?
size_t page_size_;
size_t allocation_granularity_;
uint64_t perf_counter_frequency_;
std::vector<ThreadPool> thread_pools_;
mutable std::mutex mu_;
std::vector<std::thread> threads_to_join_;
};
WinEnv::WinEnv() :
checkedDiskForMmap_(false),
forceMmapOff(false),
page_size_(4 * 1012),
allocation_granularity_(page_size_),
perf_counter_frequency_(0),
thread_pools_(Priority::TOTAL) {
SYSTEM_INFO sinfo;
GetSystemInfo(&sinfo);
page_size_ = sinfo.dwPageSize;
allocation_granularity_ = sinfo.dwAllocationGranularity;
{
LARGE_INTEGER qpf;
BOOL ret = QueryPerformanceFrequency(&qpf);
assert(ret == TRUE);
perf_counter_frequency_ = qpf.QuadPart;
}
for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
thread_pools_[pool_id].SetThreadPriority(
static_cast<Env::Priority>(pool_id));
// This allows later initializing the thread-local-env of each thread.
thread_pools_[pool_id].SetHostEnv(this);
}
// Protected member of the base class
thread_status_updater_ = CreateThreadStatusUpdater();
}
void WinEnv::Schedule(void(*function)(void*), void* arg, Priority pri, void* tag) {
assert(pri >= Priority::LOW && pri <= Priority::HIGH);
thread_pools_[pri].Schedule(function, arg, tag);
}
int WinEnv::UnSchedule(void* arg, Priority pri) {
return thread_pools_[pri].UnSchedule(arg);
}
unsigned int WinEnv::GetThreadPoolQueueLen(Priority pri) const {
assert(pri >= Priority::LOW && pri <= Priority::HIGH);
return thread_pools_[pri].GetQueueLen();
}
namespace
{
struct StartThreadState {
void (*user_function)(void*);
void* arg;
};
}
static void* StartThreadWrapper(void* arg) {
std::unique_ptr<StartThreadState> state(reinterpret_cast<StartThreadState*>(arg));
state->user_function(state->arg);
return nullptr;
}
void WinEnv::StartThread(void (*function)(void* arg), void* arg) {
StartThreadState* state = new StartThreadState;
state->user_function = function;
state->arg = arg;
try {
std::thread th(&StartThreadWrapper, state);
std::lock_guard<std::mutex> lg(mu_);
threads_to_join_.push_back(std::move(th));
} catch (const std::system_error& ex) {
WinthreadCall("start thread", ex.code());
}
}
void WinEnv::WaitForJoin() {
for (auto& th : threads_to_join_) {
th.join();
}
threads_to_join_.clear();
}
} // namespace
std::string Env::GenerateUniqueId() {
std::string result;
UUID uuid;
UuidCreateSequential(&uuid);
RPC_CSTR rpc_str;
auto status = UuidToStringA(&uuid, &rpc_str);
assert(status == RPC_S_OK);
result = reinterpret_cast<char*>(rpc_str);
status = RpcStringFreeA(&rpc_str);
assert(status == RPC_S_OK);
return result;
}
// We choose to create this on the heap and using std::once for the following reasons
// 1) Currently available MS compiler does not implement atomic C++11 initialization of
// function local statics
// 2) We choose not to destroy the env because joining the threads from the system loader
// which destroys the statics (same as from DLLMain) creates a system loader dead-lock.
// in this manner any remaining threads are terminated OK.
namespace {
std::once_flag winenv_once_flag;
Env* envptr;
};
Env* Env::Default() {
std::call_once(winenv_once_flag, []() { envptr = new WinEnv(); });
return envptr;
}
} // namespace rocksdb