2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 21:59:46 +00:00
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
//
|
|
|
|
// File names used by DB code
|
|
|
|
|
2013-10-05 05:32:05 +00:00
|
|
|
#pragma once
|
2011-03-18 22:37:00 +00:00
|
|
|
#include <stdint.h>
|
2022-10-25 01:34:52 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
#include <string>
|
2022-10-25 01:34:52 +00:00
|
|
|
#include <unordered_map>
|
2014-07-02 16:54:20 +00:00
|
|
|
#include <vector>
|
2015-01-09 20:57:11 +00:00
|
|
|
|
2017-04-06 02:02:00 +00:00
|
|
|
#include "options/db_options.h"
|
2015-01-09 20:57:11 +00:00
|
|
|
#include "port/port.h"
|
2020-03-03 00:14:00 +00:00
|
|
|
#include "rocksdb/file_system.h"
|
2015-01-09 20:57:11 +00:00
|
|
|
#include "rocksdb/options.h"
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/slice.h"
|
|
|
|
#include "rocksdb/status.h"
|
2013-10-24 06:39:23 +00:00
|
|
|
#include "rocksdb/transaction_log.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
class Env;
|
2014-05-06 21:51:33 +00:00
|
|
|
class Directory;
|
2021-01-26 06:07:26 +00:00
|
|
|
class SystemClock;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-17 23:16:11 +00:00
|
|
|
class WritableFileWriter;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-03-21 02:17:54 +00:00
|
|
|
#ifdef OS_WIN
|
2021-10-16 17:03:19 +00:00
|
|
|
constexpr char kFilePathSeparator = '\\';
|
2020-03-21 02:17:54 +00:00
|
|
|
#else
|
2021-10-16 17:03:19 +00:00
|
|
|
constexpr char kFilePathSeparator = '/';
|
2020-03-21 02:17:54 +00:00
|
|
|
#endif
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Return the name of the log file with the specified number
|
|
|
|
// in the db named by "dbname". The result will be prefixed with
|
|
|
|
// "dbname".
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string LogFileName(const std::string& dbname, uint64_t number);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string LogFileName(uint64_t number);
|
2019-08-01 22:45:19 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string BlobFileName(uint64_t number);
|
2020-05-07 16:29:21 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string BlobFileName(const std::string& bdirname, uint64_t number);
|
2017-04-18 19:00:36 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string BlobFileName(const std::string& dbname, const std::string& blob_dir,
|
|
|
|
uint64_t number);
|
2018-08-31 18:59:49 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string ArchivalDirectory(const std::string& dbname);
|
2012-12-08 00:30:22 +00:00
|
|
|
|
2012-11-30 01:28:37 +00:00
|
|
|
// Return the name of the archived log file with the specified number
|
|
|
|
// in the db named by "dbname". The result will be prefixed with "dbname".
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string ArchivedLogFileName(const std::string& dbname, uint64_t num);
|
2012-11-30 01:28:37 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string MakeTableFileName(const std::string& name, uint64_t number);
|
2014-07-02 16:54:20 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string MakeTableFileName(uint64_t number);
|
2019-08-01 22:45:19 +00:00
|
|
|
|
2015-10-07 00:46:22 +00:00
|
|
|
// Return the name of sstable with LevelDB suffix
|
|
|
|
// created from RocksDB sstable suffixed name
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string Rocks2LevelTableFileName(const std::string& fullname);
|
2015-10-07 00:46:22 +00:00
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* These APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happen.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 22:45:18 +00:00
|
|
|
// the reverse function of MakeTableFileName
|
|
|
|
// TODO(yhchiang): could merge this function with ParseFileName()
|
2024-01-29 18:38:08 +00:00
|
|
|
uint64_t TableFileNameToNumber(const std::string& name);
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* These APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happen.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 22:45:18 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Return the name of the sstable with the specified number
|
|
|
|
// under the path selected from "db_paths" by "path_id". The result will be
|
|
|
|
// prefixed with that path.
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number,
|
|
|
|
uint32_t path_id);
|
2014-07-02 16:54:20 +00:00
|
|
|
|
2014-08-13 18:57:40 +00:00
|
|
|
// Sufficient buffer size for FormatFileNumber.
|
2015-07-01 23:13:49 +00:00
|
|
|
const size_t kFormatFileNumberBufSize = 38;
|
2014-08-13 18:57:40 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
|
|
|
|
size_t out_buf_size);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Return the name of the descriptor file for the db named by
|
|
|
|
// "dbname" and the specified incarnation number. The result will be
|
|
|
|
// prefixed with "dbname".
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string DescriptorFileName(const std::string& dbname, uint64_t number);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string DescriptorFileName(uint64_t number);
|
2021-10-16 17:03:19 +00:00
|
|
|
|
|
|
|
extern const std::string kCurrentFileName; // = "CURRENT"
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Return the name of the current file. This file contains the name
|
|
|
|
// of the current manifest file. The result will be prefixed with
|
|
|
|
// "dbname".
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string CurrentFileName(const std::string& dbname);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Return the name of the lock file for the db named by
|
|
|
|
// "dbname". The result will be prefixed with "dbname".
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string LockFileName(const std::string& dbname);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Return the name of a temporary file owned by the db named "dbname".
|
|
|
|
// The result will be prefixed with "dbname".
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string TempFileName(const std::string& dbname, uint64_t number);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2014-08-14 17:05:16 +00:00
|
|
|
// A helper structure for prefix of info log names.
|
|
|
|
// NOTE(review): `prefix` presumably refers to bytes stored in `buf`; if so, a
// copy of this struct would still reference the original object's buffer —
// confirm against the constructor definitions before copying instances.
struct InfoLogPrefix {
|
|
|
|
// Fixed-size (260-char) storage owned by this struct.
char buf[260];
|
|
|
|
// The info log name prefix, exposed to callers as a Slice.
Slice prefix;
|
|
|
|
// Prefix with DB absolute path encoded
// NOTE(review): `has_log_dir` presumably indicates whether a separate info
// log directory is configured — confirm in the definition.
explicit InfoLogPrefix(bool has_log_dir, const std::string& db_absolute_path);
|
|
|
|
// Default Prefix
// (The concrete prefix value is set in the definition, which is not part of
// this header.)
explicit InfoLogPrefix();
|
|
|
|
};
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Return the name of the info log file for "dbname".
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string InfoLogFileName(const std::string& dbname,
|
|
|
|
const std::string& db_path = "",
|
|
|
|
const std::string& log_dir = "");
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Return the name of the old info log file for "dbname".
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
|
|
|
|
const std::string& db_path = "",
|
|
|
|
const std::string& log_dir = "");
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2021-10-16 17:03:19 +00:00
|
|
|
extern const std::string kOptionsFileNamePrefix; // = "OPTIONS-"
|
|
|
|
extern const std::string kTempFileNameSuffix; // = "dbtmp"
|
2015-11-11 06:58:01 +00:00
|
|
|
|
|
|
|
// Return an options file name given the "dbname" and file number.
|
|
|
|
// Format: OPTIONS-[number]
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string OptionsFileName(const std::string& dbname, uint64_t file_num);
|
|
|
|
std::string OptionsFileName(uint64_t file_num);
|
2015-11-11 06:58:01 +00:00
|
|
|
|
|
|
|
// Return a temp options file name given the "dbname" and file number.
|
|
|
|
// Format: OPTIONS-[number].dbtmp
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num);
|
2015-11-11 06:58:01 +00:00
|
|
|
|
2012-12-17 19:26:59 +00:00
|
|
|
// Return the name to use for a metadatabase. The result will be prefixed with
|
|
|
|
// "dbname".
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string MetaDatabaseName(const std::string& dbname, uint64_t number);
|
2012-12-17 19:26:59 +00:00
|
|
|
|
2013-10-18 21:50:54 +00:00
|
|
|
// Return the name of the Identity file which stores a unique number for the db
|
|
|
|
// that will get regenerated if the db loses all its data and is recreated fresh
|
|
|
|
// either from a backup-image or empty
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string IdentityFileName(const std::string& dbname);
|
2013-10-18 21:50:54 +00:00
|
|
|
|
2013-10-05 05:32:05 +00:00
|
|
|
// If filename is a rocksdb file, store the type of the file in *type.
|
2011-04-20 22:48:11 +00:00
|
|
|
// The number encoded in the filename is stored in *number. If the
|
|
|
|
// filename was successfully parsed, returns true. Else return false.
|
2014-08-14 17:05:16 +00:00
|
|
|
// info_log_name_prefix is the path of info logs.
|
2024-01-29 18:38:08 +00:00
|
|
|
bool ParseFileName(const std::string& filename, uint64_t* number,
|
|
|
|
const Slice& info_log_name_prefix, FileType* type,
|
|
|
|
WalFileType* log_type = nullptr);
|
2014-08-14 17:05:16 +00:00
|
|
|
// Same as previous function, but skip info log files.
|
2024-01-29 18:38:08 +00:00
|
|
|
bool ParseFileName(const std::string& filename, uint64_t* number,
|
|
|
|
FileType* type, WalFileType* log_type = nullptr);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Make the CURRENT file point to the descriptor file with the
|
Sync dir containing CURRENT after RenameFile on CURRENT as much as possible (#10573)
Summary:
**Context:**
The crash test below revealed a bug: the directory containing the CURRENT file (`dir_contains_current_file` for short below) did not always get synced after a new CURRENT was created and renamed into place with `RenameFile` as part of the creation.
This bug exposes a risk that such un-synced directory containing the updated CURRENT can’t survive a host crash (e.g, power loss) hence get corrupted. This then will be followed by a recovery from a corrupted CURRENT that we don't want.
The root-cause is that a nullptr `FSDirectory* dir_contains_current_file` sometimes gets passed-down to `SetCurrentFile()` hence in those case `dir_contains_current_file->FSDirectory::FsyncWithDirOptions()` will be skipped (which otherwise will internally call`Env/FS::SyncDic()` )
```
./db_stress --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --allow_data_in_errors=True --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=8 --block_size=16384 --bloom_bits=134.8015470676662 --bottommost_compression_type=disable --cache_size=8388608 --checkpoint_one_in=1000000 --checksum_type=kCRC32c --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_pri=2 --compaction_ttl=100 --compression_max_dict_buffer_bytes=511 --compression_max_dict_bytes=16384 --compression_type=zstd --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=65536 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --disable_wal=0 --enable_compaction_filter=0 --enable_pipelined_write=1 --expected_values_dir=$exp --fail_if_options_file_error=1 --file_checksum_impl=none --flush_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=4 --ingest_external_file_one_in=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --mark_for_compaction_one_file_in=10 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=16384 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=1 --memtable_whole_key_filtering=1 --mmap_read=1 --nooverwritepercent=1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_pinning=2 --pause_background_one_in=1000000 --periodic_compaction_seconds=0 
--prefix_size=5 --prefixpercent=5 --prepopulate_block_cache=1 --progress_reports=0 --read_fault_one_in=1000 --readpercent=45 --recycle_log_file_num=0 --reopen=0 --ribbon_starting_level=999 --secondary_cache_fault_one_in=32 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --subcompactions=3 --sync_fault_injection=1 --target_file_size_base=2097 --target_file_size_multiplier=2 --test_batches_snapshots=1 --top_level_index_pinning=1 --use_full_merge_v1=1 --use_merge=1 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --write_buffer_size=4194 --writepercent=35
```
```
stderr:
WARNING: prefix_size is non-zero but memtablerep != prefix_hash
db_stress: utilities/fault_injection_fs.cc:748: virtual rocksdb::IOStatus rocksdb::FaultInjectionTestFS::RenameFile(const std::string &, const std::string &, const rocksdb::IOOptions &, rocksdb::IODebugContext *): Assertion `tlist.find(tdn.second) == tlist.end()' failed.`
```
**Summary:**
The PR ensured the non-test path pass down a non-null dir containing CURRENT (which is by current RocksDB assumption just db_dir) by doing the following:
- Renamed `directory_to_fsync` as `dir_contains_current_file` in `SetCurrentFile()` to tighten the association between this directory and CURRENT file
- Changed `SetCurrentFile()` API to require `dir_contains_current_file` being passed-in, instead of making it by default nullptr.
- Because `SetCurrentFile()`'s `dir_contains_current_file` is passed down from `VersionSet::LogAndApply()` then `VersionSet::ProcessManifestWrites()` (i.e, think about this as a chain of 3 functions related to MANIFEST update), these 2 functions also got refactored to require `dir_contains_current_file`
- Updated the non-test-path callers of these 3 functions to obtain and pass in non-nullptr `dir_contains_current_file`, which by current assumption of RocksDB, is the `FSDirectory* db_dir`.
- `db_impl` path will obtain `DBImpl::directories_.getDbDir()` while others with no access to such `directories_` are obtained on the fly by creating such object `FileSystem::NewDirectory(..)` and manage it by unique pointers to ensure short life time.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10573
Test Plan:
- `make check`
- Passed the repro db_stress command
- For future improvement, since we currently don't assert dir containing CURRENT to be non-nullptr due to https://github.com/facebook/rocksdb/pull/10573#pullrequestreview-1087698899, there is still chances that future developers mistakenly pass down nullptr dir containing CURRENT thus resulting skipped sync dir and cause the bug again. Therefore a smarter test (e.g, such as quoted from ajkr "(make) unsynced data loss to be dropping files corresponding to unsynced directory entries") is still needed.
Reviewed By: ajkr
Differential Revision: D39005886
Pulled By: hx235
fbshipit-source-id: 336fb9090d0cfa6ca3dd580db86268007dde7f5a
2022-08-30 00:35:21 +00:00
|
|
|
// specified number. On its success and when dir_contains_current_file is not
|
|
|
|
// nullptr, the function will fsync the directory containing the CURRENT file
|
|
|
|
// once the new CURRENT file has been renamed into place.
|
2024-01-29 18:38:08 +00:00
|
|
|
IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
|
|
|
|
const std::string& dbname, uint64_t descriptor_number,
|
|
|
|
FSDirectory* dir_contains_current_file);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2013-10-18 21:50:54 +00:00
|
|
|
// Make the IDENTITY file for the db
|
2024-01-29 18:38:08 +00:00
|
|
|
Status SetIdentityFile(const WriteOptions& write_options, Env* env,
|
|
|
|
const std::string& dbname,
|
|
|
|
const std::string& db_id = {});
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2015-01-22 19:43:38 +00:00
|
|
|
// Sync manifest file `file`.
|
2024-01-29 18:38:08 +00:00
|
|
|
IOStatus SyncManifest(const ImmutableDBOptions* db_options,
|
|
|
|
const WriteOptions& write_options,
|
|
|
|
WritableFileWriter* file);
|
2015-01-22 19:43:38 +00:00
|
|
|
|
2019-05-31 17:45:20 +00:00
|
|
|
// Return list of file names of info logs in `file_names`.
|
|
|
|
// The list only contains file name. The parent directory name is stored
|
|
|
|
// in `parent_dir`.
|
|
|
|
// `db_log_dir` should be the one as in options.db_log_dir
|
2024-01-29 18:38:08 +00:00
|
|
|
Status GetInfoLogFiles(const std::shared_ptr<FileSystem>& fs,
|
|
|
|
const std::string& db_log_dir, const std::string& dbname,
|
|
|
|
std::string* parent_dir,
|
|
|
|
std::vector<std::string>* file_names);
|
2020-03-21 02:17:54 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
std::string NormalizePath(const std::string& path);
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|