mirror of https://github.com/facebook/rocksdb.git
Fix block cache ID uniqueness for Windows builds (#5844)
Summary: Since we do not evict a file's blocks from block cache before that file is deleted, we require a file's cache ID prefix is both unique and non-reusable. However, the Windows functionality we were relying on only guaranteed uniqueness. That meant a newly created file could be assigned the same cache ID prefix as a deleted file. If the newly created file had block offsets matching the deleted file, full cache keys could be exactly the same, resulting in obsolete data blocks returned from cache when trying to read from the new file. We noticed this when running on FAT32 where compaction was writing out of order keys due to reading obsolete blocks from its input files. The functionality is documented as behaving the same on NTFS, although I wasn't able to repro it there. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5844 Test Plan: we had a reliable repro of out-of-order keys on FAT32 that was fixed by this change Differential Revision: D17752442 fbshipit-source-id: 95d983f9196cf415f269e19293b97341edbf7e00
This commit is contained in:
parent
bc8b05cb77
commit
b00761eea6
|
@ -1942,7 +1942,10 @@ TEST_F(DBTest2, TestPerfContextIterCpuTime) {
|
|||
}
|
||||
#endif // OS_LINUX
|
||||
|
||||
#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented
|
||||
// GetUniqueIdFromFile is not implemented on these platforms. Persistent cache
|
||||
// breaks when that function is not implemented and no regular block cache is
|
||||
// provided.
|
||||
#if !defined(OS_SOLARIS) && !defined(OS_WIN)
|
||||
TEST_F(DBTest2, PersistentCache) {
|
||||
int num_iter = 80;
|
||||
|
||||
|
@ -2006,7 +2009,7 @@ TEST_F(DBTest2, PersistentCache) {
|
|||
}
|
||||
}
|
||||
}
|
||||
#endif // !OS_SOLARIS
|
||||
#endif // !defined(OS_SOLARIS) && !defined(OS_WIN)
|
||||
|
||||
namespace {
|
||||
void CountSyncPoint() {
|
||||
|
|
|
@ -883,7 +883,9 @@ TEST_F(EnvPosixTest, PositionedAppend) {
|
|||
}
|
||||
#endif // !ROCKSDB_LITE
|
||||
|
||||
// Only works in linux platforms
|
||||
// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can
|
||||
// handle a return value of zero but this test case cannot.
|
||||
#ifndef OS_WIN
|
||||
TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) {
|
||||
// Create file.
|
||||
if (env_ == Env::Default()) {
|
||||
|
@ -926,6 +928,7 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) {
|
|||
env_->DeleteFile(fname);
|
||||
}
|
||||
}
|
||||
#endif // !defined(OS_WIN)
|
||||
|
||||
// only works in linux platforms
|
||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||
|
@ -1016,7 +1019,9 @@ bool HasPrefix(const std::unordered_set<std::string>& ss) {
|
|||
return false;
|
||||
}
|
||||
|
||||
// Only works in linux and WIN platforms
|
||||
// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can
|
||||
// handle a return value of zero but this test case cannot.
|
||||
#ifndef OS_WIN
|
||||
TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) {
|
||||
if (env_ == Env::Default()) {
|
||||
// Check whether a bunch of concurrently existing files have unique IDs.
|
||||
|
@ -1058,7 +1063,6 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) {
|
|||
}
|
||||
}
|
||||
|
||||
// Only works in linux and WIN platforms
|
||||
TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) {
|
||||
if (env_ == Env::Default()) {
|
||||
EnvOptions soptions;
|
||||
|
@ -1098,6 +1102,7 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) {
|
|||
ASSERT_TRUE(!HasPrefix(ids));
|
||||
}
|
||||
}
|
||||
#endif // !defined(OS_WIN)
|
||||
|
||||
TEST_P(EnvPosixTestWithParam, MultiRead) {
|
||||
EnvOptions soptions;
|
||||
|
|
|
@ -175,60 +175,18 @@ Status ftruncate(const std::string& filename, HANDLE hFile,
|
|||
return status;
|
||||
}
|
||||
|
||||
size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) {
|
||||
|
||||
if (max_size < kMaxVarint64Length * 3) {
|
||||
return 0;
|
||||
}
|
||||
#if (_WIN32_WINNT == _WIN32_WINNT_VISTA)
|
||||
// MINGGW as defined by CMake file.
|
||||
// yuslepukhin: I hate the guts of the above macros.
|
||||
// This impl does not guarantee uniqueness everywhere
|
||||
// is reasonably good
|
||||
BY_HANDLE_FILE_INFORMATION FileInfo;
|
||||
|
||||
BOOL result = GetFileInformationByHandle(hFile, &FileInfo);
|
||||
|
||||
TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
|
||||
|
||||
if (!result) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
char* rid = id;
|
||||
rid = EncodeVarint64(rid, uint64_t(FileInfo.dwVolumeSerialNumber));
|
||||
rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexHigh));
|
||||
rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexLow));
|
||||
|
||||
assert(rid >= id);
|
||||
return static_cast<size_t>(rid - id);
|
||||
#else
|
||||
FILE_ID_INFO FileInfo;
|
||||
BOOL result = GetFileInformationByHandleEx(hFile, FileIdInfo, &FileInfo,
|
||||
sizeof(FileInfo));
|
||||
|
||||
TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
|
||||
|
||||
if (!result) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static_assert(sizeof(uint64_t) == sizeof(FileInfo.VolumeSerialNumber),
|
||||
"Wrong sizeof expectations");
|
||||
// FileId.Identifier is an array of 16 BYTEs, we encode them as two uint64_t
|
||||
static_assert(sizeof(uint64_t) * 2 == sizeof(FileInfo.FileId.Identifier),
|
||||
"Wrong sizeof expectations");
|
||||
|
||||
char* rid = id;
|
||||
rid = EncodeVarint64(rid, uint64_t(FileInfo.VolumeSerialNumber));
|
||||
uint64_t* file_id = reinterpret_cast<uint64_t*>(&FileInfo.FileId.Identifier[0]);
|
||||
rid = EncodeVarint64(rid, *file_id);
|
||||
++file_id;
|
||||
rid = EncodeVarint64(rid, *file_id);
|
||||
|
||||
assert(rid >= id);
|
||||
return static_cast<size_t>(rid - id);
|
||||
#endif
|
||||
size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
|
||||
size_t /*max_size*/) {
|
||||
// Returning 0 is safe as it causes the table reader to generate a unique ID.
|
||||
// This is suboptimal for performance as it prevents multiple table readers
|
||||
// for the same file from sharing cached blocks. For example, if users have
|
||||
// a low value for `max_open_files`, there can be many table readers opened
|
||||
// for the same file.
|
||||
//
|
||||
// TODO: this is a temporarily solution as it is safe but not optimal for
|
||||
// performance. For more details see discussion in
|
||||
// https://github.com/facebook/rocksdb/pull/5844.
|
||||
return 0;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -981,7 +981,7 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file,
|
|||
|
||||
// If the prefix wasn't generated or was too long,
|
||||
// create one from the cache.
|
||||
if (cc && *size == 0) {
|
||||
if (cc != nullptr && *size == 0) {
|
||||
char* end = EncodeVarint64(buffer, cc->NewId());
|
||||
*size = static_cast<size_t>(end - buffer);
|
||||
}
|
||||
|
@ -994,7 +994,7 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file,
|
|||
|
||||
// If the prefix wasn't generated or was too long,
|
||||
// create one from the cache.
|
||||
if (*size == 0) {
|
||||
if (cc != nullptr && *size == 0) {
|
||||
char* end = EncodeVarint64(buffer, cc->NewId());
|
||||
*size = static_cast<size_t>(end - buffer);
|
||||
}
|
||||
|
|
|
@ -6,7 +6,10 @@
|
|||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
// GetUniqueIdFromFile is not implemented on Windows. Persistent cache
|
||||
// breaks when that function is not implemented
|
||||
#if !defined(ROCKSDB_LITE) && !defined(OS_WIN)
|
||||
|
||||
#include "utilities/persistent_cache/persistent_cache_test.h"
|
||||
|
||||
|
@ -466,6 +469,6 @@ int main(int argc, char** argv) {
|
|||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
#else
|
||||
#else // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
|
||||
int main() { return 0; }
|
||||
#endif
|
||||
#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN)
|
||||
|
|
Loading…
Reference in New Issue