From 5421c9728b0ccc31437a954001c60c5ad5fce208 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 10 Nov 2015 12:58:39 -0800 Subject: [PATCH 1/2] Make use of portable `uint64_t` type to make possible file access in 64-bit. Currently, a signed off_t type is being used for the following interfaces for both offset and the length in bytes: * `Allocate` * `RangeSync` On Linux `off_t` is automatically either 32 or 64-bit depending on the platform. On Windows it is always a 32-bit signed long which limits file access and in particular space pre-allocation to effectively 2 Gb. Proposal is to replace off_t with uint64_t as a portable type always access files with 64-bit interfaces. May need to modify posix code but lack resources to test it. --- include/rocksdb/env.h | 12 ++++++------ port/win/env_win.cc | 10 ++-------- util/env_test.cc | 4 ++-- util/file_reader_writer.cc | 2 +- util/file_reader_writer.h | 2 +- util/file_reader_writer_test.cc | 4 ++-- util/io_posix.cc | 21 +++++++++++++++------ util/io_posix.h | 6 +++--- 8 files changed, 32 insertions(+), 29 deletions(-) diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index bbc2de579c..eb811ad02d 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -570,7 +570,7 @@ class WritableFile { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. - virtual Status RangeSync(off_t offset, off_t nbytes) { return Status::OK(); } + virtual Status RangeSync(uint64_t offset, uint64_t nbytes) { return Status::OK(); } // PrepareWrite performs any necessary preparation for a write // before the write actually occurs. This allows for pre-allocation @@ -590,8 +590,8 @@ class WritableFile { if (new_last_preallocated_block > last_preallocated_block_) { size_t num_spanned_blocks = new_last_preallocated_block - last_preallocated_block_; - Allocate(static_cast(block_size * last_preallocated_block_), - static_cast(block_size * num_spanned_blocks)); + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks); last_preallocated_block_ = new_last_preallocated_block; } } @@ -600,7 +600,7 @@ class WritableFile { /* * Pre-allocate space for a file. */ - virtual Status Allocate(off_t offset, off_t len) { + virtual Status Allocate(uint64_t offset, uint64_t len) { return Status::OK(); } @@ -920,10 +920,10 @@ class WritableFileWrapper : public WritableFile { } protected: - Status Allocate(off_t offset, off_t len) override { + Status Allocate(uint64_t offset, uint64_t len) override { return target_->Allocate(offset, len); } - Status RangeSync(off_t offset, off_t nbytes) override { + Status RangeSync(uint64_t offset, uint64_t nbytes) override { return target_->RangeSync(offset, nbytes); } diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 95796554f0..9f81d9db22 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -61,12 +61,6 @@ ThreadStatusUpdater* CreateThreadStatusUpdater() { return new ThreadStatusUpdater(); } -// A wrapper for fadvise, if the platform doesn't support fadvise, -// it will simply return Status::NotSupport. -int Fadvise(int fd, off_t offset, size_t len, int advice) { - return 0; // simply do nothing. -} - inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { return Status::IOError(context, GetWindowsErrSz(err)); } @@ -605,7 +599,7 @@ class WinMmapFile : public WritableFile { return Status::OK(); } - virtual Status Allocate(off_t offset, off_t len) override { + virtual Status Allocate(uint64_t offset, uint64_t len) override { return Status::OK(); } }; @@ -1053,7 +1047,7 @@ class WinWritableFile : public WritableFile { return filesize_; } - virtual Status Allocate(off_t offset, off_t len) override { + virtual Status Allocate(uint64_t offset, uint64_t len) override { Status status; TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds); diff --git a/util/env_test.cc b/util/env_test.cc index 7f5e4b93b1..e5fa37099c 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -971,11 +971,11 @@ TEST_F(EnvPosixTest, WritableFileWrapper) { } protected: - Status Allocate(off_t offset, off_t len) override { + Status Allocate(uint64_t offset, uint64_t len) override { inc(11); return Status::OK(); } - Status RangeSync(off_t offset, off_t nbytes) override { + Status RangeSync(uint64_t offset, uint64_t nbytes) override { inc(12); return Status::OK(); } diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index f5c1788966..6d548c449d 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -248,7 +248,7 @@ Status WritableFileWriter::SyncInternal(bool use_fsync) { return s; } -Status WritableFileWriter::RangeSync(off_t offset, off_t nbytes) { +Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { IOSTATS_TIMER_GUARD(range_sync_nanos); TEST_SYNC_POINT("WritableFileWriter::RangeSync:0"); return writable_file_->RangeSync(offset, nbytes); diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 720979099f..c10cde2abe 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -162,7 +162,7 @@ class WritableFileWriter { Status WriteUnbuffered(); // Normal write Status WriteBuffered(const char* data, size_t size); - Status RangeSync(off_t offset, off_t nbytes); + Status RangeSync(uint64_t offset, uint64_t nbytes); size_t RequestToken(size_t bytes, bool align); Status SyncInternal(bool use_fsync); }; diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 370f523926..69b8cfea88 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -47,8 +47,8 @@ TEST_F(WritableFileWriterTest, RangeSync) { } protected: - Status Allocate(off_t offset, off_t len) override { return Status::OK(); } - Status RangeSync(off_t offset, off_t nbytes) override { + Status Allocate(uint64_t offset, uint64_t len) override { return Status::OK(); } + Status RangeSync(uint64_t offset, uint64_t nbytes) override { EXPECT_EQ(offset % 4096, 0u); EXPECT_EQ(nbytes % 4096, 0u); diff --git a/util/io_posix.cc b/util/io_posix.cc index 0854ab013f..dd41e2a035 100644 --- a/util/io_posix.cc +++ b/util/io_posix.cc @@ -478,12 +478,15 @@ Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) { } #ifdef ROCKSDB_FALLOCATE_PRESENT -Status PosixMmapFile::Allocate(off_t offset, off_t len) { +Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) { + assert(offset <= std::numeric_limits::max()); + assert(len <= std::numeric_limits::max()); TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); int alloc_status = 0; if (allow_fallocate_) { alloc_status = fallocate( - fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -606,13 +609,16 @@ Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { } #ifdef ROCKSDB_FALLOCATE_PRESENT -Status PosixWritableFile::Allocate(off_t offset, off_t len) { +Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { + assert(offset <= std::numeric_limits::max()); + assert(len <= std::numeric_limits::max()); TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; if (allow_fallocate_) { alloc_status = fallocate( - fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -621,8 +627,11 @@ Status PosixWritableFile::Allocate(off_t offset, off_t len) { } } -Status PosixWritableFile::RangeSync(off_t offset, off_t nbytes) { - if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) { +Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { + assert(offset <= std::numeric_limits::max()); + assert(nbytes <= std::numeric_limits::max()); + if (sync_file_range(fd_, static_cast(offset), + static_cast(nbytes), SYNC_FILE_RANGE_WRITE) == 0) { return Status::OK(); } else { return IOError(filename_, errno); diff --git a/util/io_posix.h b/util/io_posix.h index 0e5da39d6b..2a45d10ffe 100644 --- a/util/io_posix.h +++ b/util/io_posix.h @@ -90,8 +90,8 @@ class PosixWritableFile : public WritableFile { virtual uint64_t GetFileSize() override; virtual Status InvalidateCache(size_t offset, size_t length) override; #ifdef ROCKSDB_FALLOCATE_PRESENT - virtual Status Allocate(off_t offset, off_t len) override; - virtual Status RangeSync(off_t offset, off_t nbytes) override; + virtual Status Allocate(uint64_t offset, uint64_t len) override; + virtual Status RangeSync(uint64_t offset, uint64_t nbytes) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; #endif }; @@ -157,7 +157,7 @@ class PosixMmapFile : public WritableFile { virtual uint64_t GetFileSize() override; virtual Status InvalidateCache(size_t offset, size_t length) override; #ifdef ROCKSDB_FALLOCATE_PRESENT - virtual Status Allocate(off_t offset, off_t len) override; + virtual Status Allocate(uint64_t offset, uint64_t len) override; #endif }; From 5270b33bd38762376268395a3e243086953518ae Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 10 Nov 2015 17:03:42 -0800 Subject: [PATCH 2/2] Make use of portable `uint64_t` type to make possible file access in 64-bit. Currently, a signed off_t type is being used for the following interfaces for both offset and the length in bytes: * `Allocate` * `RangeSync` On Linux `off_t` is automatically either 32 or 64-bit depending on the platform. On Windows it is always a 32-bit signed long which limits file access and in particular space pre-allocation to effectively 2 Gb. Proposal is to replace off_t with uint64_t as a portable type always access files with 64-bit interfaces. May need to modify posix code but lack resources to test it. --- include/rocksdb/env.h | 12 ++++++------ port/win/env_win.cc | 10 ++-------- util/env_test.cc | 4 ++-- util/file_reader_writer.cc | 2 +- util/file_reader_writer.h | 2 +- util/file_reader_writer_test.cc | 4 ++-- util/io_posix.cc | 21 +++++++++++++++------ util/io_posix.h | 6 +++--- 8 files changed, 32 insertions(+), 29 deletions(-) diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index bbc2de579c..eb811ad02d 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -570,7 +570,7 @@ class WritableFile { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. - virtual Status RangeSync(off_t offset, off_t nbytes) { return Status::OK(); } + virtual Status RangeSync(uint64_t offset, uint64_t nbytes) { return Status::OK(); } // PrepareWrite performs any necessary preparation for a write // before the write actually occurs. This allows for pre-allocation @@ -590,8 +590,8 @@ class WritableFile { if (new_last_preallocated_block > last_preallocated_block_) { size_t num_spanned_blocks = new_last_preallocated_block - last_preallocated_block_; - Allocate(static_cast(block_size * last_preallocated_block_), - static_cast(block_size * num_spanned_blocks)); + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks); last_preallocated_block_ = new_last_preallocated_block; } } @@ -600,7 +600,7 @@ class WritableFile { /* * Pre-allocate space for a file. */ - virtual Status Allocate(off_t offset, off_t len) { + virtual Status Allocate(uint64_t offset, uint64_t len) { return Status::OK(); } @@ -920,10 +920,10 @@ class WritableFileWrapper : public WritableFile { } protected: - Status Allocate(off_t offset, off_t len) override { + Status Allocate(uint64_t offset, uint64_t len) override { return target_->Allocate(offset, len); } - Status RangeSync(off_t offset, off_t nbytes) override { + Status RangeSync(uint64_t offset, uint64_t nbytes) override { return target_->RangeSync(offset, nbytes); } diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 95796554f0..9f81d9db22 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -61,12 +61,6 @@ ThreadStatusUpdater* CreateThreadStatusUpdater() { return new ThreadStatusUpdater(); } -// A wrapper for fadvise, if the platform doesn't support fadvise, -// it will simply return Status::NotSupport. -int Fadvise(int fd, off_t offset, size_t len, int advice) { - return 0; // simply do nothing. -} - inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { return Status::IOError(context, GetWindowsErrSz(err)); } @@ -605,7 +599,7 @@ class WinMmapFile : public WritableFile { return Status::OK(); } - virtual Status Allocate(off_t offset, off_t len) override { + virtual Status Allocate(uint64_t offset, uint64_t len) override { return Status::OK(); } }; @@ -1053,7 +1047,7 @@ class WinWritableFile : public WritableFile { return filesize_; } - virtual Status Allocate(off_t offset, off_t len) override { + virtual Status Allocate(uint64_t offset, uint64_t len) override { Status status; TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds); diff --git a/util/env_test.cc b/util/env_test.cc index 7f5e4b93b1..e5fa37099c 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -971,11 +971,11 @@ TEST_F(EnvPosixTest, WritableFileWrapper) { } protected: - Status Allocate(off_t offset, off_t len) override { + Status Allocate(uint64_t offset, uint64_t len) override { inc(11); return Status::OK(); } - Status RangeSync(off_t offset, off_t nbytes) override { + Status RangeSync(uint64_t offset, uint64_t nbytes) override { inc(12); return Status::OK(); } diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index f5c1788966..6d548c449d 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -248,7 +248,7 @@ Status WritableFileWriter::SyncInternal(bool use_fsync) { return s; } -Status WritableFileWriter::RangeSync(off_t offset, off_t nbytes) { +Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { IOSTATS_TIMER_GUARD(range_sync_nanos); TEST_SYNC_POINT("WritableFileWriter::RangeSync:0"); return writable_file_->RangeSync(offset, nbytes); diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 720979099f..c10cde2abe 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -162,7 +162,7 @@ class WritableFileWriter { Status WriteUnbuffered(); // Normal write Status WriteBuffered(const char* data, size_t size); - Status RangeSync(off_t offset, off_t nbytes); + Status RangeSync(uint64_t offset, uint64_t nbytes); size_t RequestToken(size_t bytes, bool align); Status SyncInternal(bool use_fsync); }; diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 370f523926..69b8cfea88 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -47,8 +47,8 @@ TEST_F(WritableFileWriterTest, RangeSync) { } protected: - Status Allocate(off_t offset, off_t len) override { return Status::OK(); } - Status RangeSync(off_t offset, off_t nbytes) override { + Status Allocate(uint64_t offset, uint64_t len) override { return Status::OK(); } + Status RangeSync(uint64_t offset, uint64_t nbytes) override { EXPECT_EQ(offset % 4096, 0u); EXPECT_EQ(nbytes % 4096, 0u); diff --git a/util/io_posix.cc b/util/io_posix.cc index 0854ab013f..dd41e2a035 100644 --- a/util/io_posix.cc +++ b/util/io_posix.cc @@ -478,12 +478,15 @@ Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) { } #ifdef ROCKSDB_FALLOCATE_PRESENT -Status PosixMmapFile::Allocate(off_t offset, off_t len) { +Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) { + assert(offset <= std::numeric_limits::max()); + assert(len <= std::numeric_limits::max()); TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); int alloc_status = 0; if (allow_fallocate_) { alloc_status = fallocate( - fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -606,13 +609,16 @@ Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { } #ifdef ROCKSDB_FALLOCATE_PRESENT -Status PosixWritableFile::Allocate(off_t offset, off_t len) { +Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { + assert(offset <= std::numeric_limits::max()); + assert(len <= std::numeric_limits::max()); TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; if (allow_fallocate_) { alloc_status = fallocate( - fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -621,8 +627,11 @@ Status PosixWritableFile::Allocate(off_t offset, off_t len) { } } -Status PosixWritableFile::RangeSync(off_t offset, off_t nbytes) { - if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) { +Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { + assert(offset <= std::numeric_limits::max()); + assert(nbytes <= std::numeric_limits::max()); + if (sync_file_range(fd_, static_cast(offset), + static_cast(nbytes), SYNC_FILE_RANGE_WRITE) == 0) { return Status::OK(); } else { return IOError(filename_, errno); diff --git a/util/io_posix.h b/util/io_posix.h index 0e5da39d6b..2a45d10ffe 100644 --- a/util/io_posix.h +++ b/util/io_posix.h @@ -90,8 +90,8 @@ class PosixWritableFile : public WritableFile { virtual uint64_t GetFileSize() override; virtual Status InvalidateCache(size_t offset, size_t length) override; #ifdef ROCKSDB_FALLOCATE_PRESENT - virtual Status Allocate(off_t offset, off_t len) override; - virtual Status RangeSync(off_t offset, off_t nbytes) override; + virtual Status Allocate(uint64_t offset, uint64_t len) override; + virtual Status RangeSync(uint64_t offset, uint64_t nbytes) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; #endif }; @@ -157,7 +157,7 @@ class PosixMmapFile : public WritableFile { virtual uint64_t GetFileSize() override; virtual Status InvalidateCache(size_t offset, size_t length) override; #ifdef ROCKSDB_FALLOCATE_PRESENT - virtual Status Allocate(off_t offset, off_t len) override; + virtual Status Allocate(uint64_t offset, uint64_t len) override; #endif };