2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 21:59:46 +00:00
|
|
|
//
|
2012-09-15 00:11:35 +00:00
|
|
|
|
2014-04-15 20:39:26 +00:00
|
|
|
|
2013-08-06 19:54:37 +00:00
|
|
|
#include <algorithm>
|
2021-10-16 17:03:19 +00:00
|
|
|
#include <cstdint>
|
|
|
|
#include <memory>
|
2012-09-15 00:11:35 +00:00
|
|
|
#include <string>
|
2021-10-16 17:03:19 +00:00
|
|
|
#include <vector>
|
2021-09-29 11:01:57 +00:00
|
|
|
|
2019-05-31 18:52:59 +00:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2014-11-14 23:43:10 +00:00
|
|
|
#include "db/job_context.h"
|
2012-09-15 00:11:35 +00:00
|
|
|
#include "db/version_set.h"
|
2019-05-30 03:44:08 +00:00
|
|
|
#include "file/file_util.h"
|
|
|
|
#include "file/filename.h"
|
2021-09-29 11:01:57 +00:00
|
|
|
#include "logging/logging.h"
|
2016-06-10 02:03:10 +00:00
|
|
|
#include "port/port.h"
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
2021-10-16 17:03:19 +00:00
|
|
|
#include "rocksdb/metadata.h"
|
|
|
|
#include "rocksdb/types.h"
|
2019-05-30 18:21:38 +00:00
|
|
|
#include "test_util/sync_point.h"
|
2021-10-16 17:03:19 +00:00
|
|
|
#include "util/file_checksum_helper.h"
|
2019-05-31 00:39:43 +00:00
|
|
|
#include "util/mutexlock.h"
|
2012-09-15 00:11:35 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2012-09-15 00:11:35 +00:00
|
|
|
|
2021-10-16 17:03:19 +00:00
|
|
|
// Flush step shared by the live-file listing functions in this file.
// Delegates to FlushAllColumnFamilies() with default-constructed FlushOptions
// and tags the request with FlushReason::kGetLiveFiles. Both callers in this
// file invoke it while holding mutex_.
//
// Returns whatever status FlushAllColumnFamilies() reports.
Status DBImpl::FlushForGetLiveFiles() {
  const FlushOptions default_flush_options = FlushOptions();
  return DBImpl::FlushAllColumnFamilies(default_flush_options,
                                        FlushReason::kGetLiveFiles);
}
|
|
|
|
|
2012-11-29 00:42:36 +00:00
|
|
|
// Lists the files that currently make up the DB: live table files, live blob
// files, CURRENT, the MANIFEST and, when one is recorded, the OPTIONS file.
//
// ret                 Cleared and then filled with the file names. The names
//                     are not absolute paths; they are relative to dbname_.
// manifest_file_size  Reset to 0 on entry and, on success, set to the size of
//                     the MANIFEST as read while mutex_ is held. It is
//                     dereferenced unconditionally, so it must not be null.
// flush_memtable      When true, FlushForGetLiveFiles() runs first; if it
//                     fails, the failure is logged and returned and `ret` is
//                     left untouched.
//
// Returns Status::OK() unless the optional flush fails.
Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
                            uint64_t* manifest_file_size, bool flush_memtable) {
  *manifest_file_size = 0;

  mutex_.Lock();

  if (flush_memtable) {
    Status flush_status = FlushForGetLiveFiles();
    if (!flush_status.ok()) {
      // Release the mutex before logging and bailing out.
      mutex_.Unlock();
      ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
                      flush_status.ToString().c_str());
      return flush_status;
    }
  }

  // Gather the numbers of every live table and blob file. Column families
  // that have been dropped contribute nothing.
  std::vector<uint64_t> table_numbers;
  std::vector<uint64_t> blob_numbers;
  for (auto cfd : *versions_->GetColumnFamilySet()) {
    if (!cfd->IsDropped()) {
      cfd->current()->AddLiveFiles(&table_numbers, &blob_numbers);
    }
  }

  ret.clear();
  // The three extra slots are for CURRENT + MANIFEST + OPTIONS.
  ret.reserve(table_numbers.size() + blob_numbers.size() + 3);

  // Turn the numbers into file names, built against an empty directory
  // argument.
  for (const uint64_t number : table_numbers) {
    ret.push_back(MakeTableFileName("", number));
  }
  for (const uint64_t number : blob_numbers) {
    ret.push_back(BlobFileName("", number));
  }

  ret.push_back(CurrentFileName(""));
  ret.push_back(DescriptorFileName("", versions_->manifest_file_number()));

  // The OPTIONS file number is zero in read-write mode when OPTIONS file
  // writing failed and the DB was configured with
  // `fail_if_options_file_error == false`. In read-only mode it is zero when
  // no OPTIONS file exists at all. Either way there is nothing to list.
  const auto options_number = versions_->options_file_number();
  if (options_number != 0) {
    ret.push_back(OptionsFileName("", options_number));
  }

  // The MANIFEST length has to be sampled before the mutex is released.
  *manifest_file_size = versions_->manifest_file_size();

  mutex_.Unlock();
  return Status::OK();
}
|
|
|
|
|
2024-05-28 16:24:49 +00:00
|
|
|
// Fills `files` with the WAL files reported by
// wal_manager_.GetSortedWalFiles() and cross-checks that list against the
// WALs tracked in the MANIFEST.
//
// File deletions are disabled for the duration of the directory scan and
// re-enabled afterwards (when disabling succeeded).
//
// Returns the WalManager status on scan failure, Status::Corruption if a WAL
// tracked in the MANIFEST is absent from the scanned list, and OK otherwise.
Status DBImpl::GetSortedWalFiles(VectorWalPtr& files) {
  // WAL numbers tracked in the MANIFEST, recorded as a (minimum) cross-check
  // for the directory scan.
  std::vector<uint64_t> manifest_wal_numbers;

  // If the caller disabled deletions, this function should return files that
  // are guaranteed not to be deleted until deletions are re-enabled. Pending
  // purges must finish first since WalManager doesn't know which files are
  // going to be purged. No further purges get scheduled while deletions are
  // disabled, so the wait loop below must terminate.
  // Deletions are disabled here regardless, to avoid a file vanishing in the
  // middle of the scan and causing an IO error.
  Status disable_status = DisableFileDeletions();
  {
    InstrumentedMutexLock guard(&mutex_);
    while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
      bg_cv_.Wait();
    }

    // Snapshot the tracked WAL numbers while the mutex is held.
    const auto& tracked_wals = versions_->GetWalSet().GetWals();
    manifest_wal_numbers.reserve(tracked_wals.size());
    for (const auto& entry : tracked_wals) {
      manifest_wal_numbers.push_back(entry.first);
    }
  }

  Status s = wal_manager_.GetSortedWalFiles(files);

  // DisableFileDeletions / EnableFileDeletions not supported in read-only DB
  if (!disable_status.ok()) {
    assert(disable_status.IsNotSupported());
  } else {
    Status enable_status = EnableFileDeletions();
    assert(enable_status.ok());
    enable_status.PermitUncheckedError();
  }

  if (!s.ok()) {
    return s;
  }

  // Verify that the scanned list includes every WAL required by the manifest
  // (one sorted list is a superset of the other). Walk the scanned list
  // forward once, consuming one entry per required WAL number.
  auto scanned = files.begin();
  for (const uint64_t required_number : manifest_wal_numbers) {
    // Skip scanned files that precede the required one.
    while (scanned != files.end() &&
           (*scanned)->LogNumber() < required_number) {
      ++scanned;
    }
    if (scanned == files.end() || required_number < (*scanned)->LogNumber()) {
      // FAIL - did not find
      return Status::Corruption(
          "WAL file " + std::to_string(required_number) +
          " required by manifest but not in directory list");
    }
    // Exact match: this scanned entry is consumed.
    ++scanned;
  }

  ROCKS_LOG_INFO(immutable_db_options_.info_log,
                 "Number of WAL files %" ROCKSDB_PRIszt " (%" ROCKSDB_PRIszt
                 " required by manifest)",
                 files.size(), manifest_wal_numbers.size());
#ifndef NDEBUG
  // Debug builds additionally log the individual names on both sides.
  std::ostringstream scanned_names;
  for (const auto& wal : files) {
    scanned_names << wal->PathName() << " ";
  }

  std::ostringstream required_names;
  for (const auto& number : manifest_wal_numbers) {
    required_names << number << ".log ";
  }

  ROCKS_LOG_INFO(immutable_db_options_.info_log,
                 "Log files : %s .Log files required by manifest: %s.",
                 scanned_names.str().c_str(), required_names.str().c_str());
#endif  // NDEBUG

  return s;
}
|
2014-11-14 19:38:26 +00:00
|
|
|
|
2024-05-28 16:24:49 +00:00
|
|
|
// Looks up the WAL file identified by logfile_number_.
//
// logfile_number_ is read under mutex_; the mutex is released before
// wal_manager_.GetLiveWalFile() is called with that number.
//
// current_log_file  Output handle, passed straight through to
//                   WalManager::GetLiveWalFile().
// Returns the status reported by WalManager::GetLiveWalFile().
Status DBImpl::GetCurrentWalFile(std::unique_ptr<WalFile>* current_log_file) {
  const uint64_t live_log_number = [this]() -> uint64_t {
    InstrumentedMutexLock guard(&mutex_);
    return logfile_number_;
  }();

  return wal_manager_.GetLiveWalFile(live_log_number, current_log_file);
}
|
2021-10-16 17:03:19 +00:00
|
|
|
|
|
|
|
Status DBImpl::GetLiveFilesStorageInfo(
|
|
|
|
const LiveFilesStorageInfoOptions& opts,
|
|
|
|
std::vector<LiveFileStorageInfo>* files) {
|
2022-04-01 23:06:14 +00:00
|
|
|
// To avoid returning partial results, only move results to files on success.
|
2021-10-16 17:03:19 +00:00
|
|
|
assert(files);
|
|
|
|
files->clear();
|
|
|
|
std::vector<LiveFileStorageInfo> results;
|
|
|
|
|
|
|
|
// NOTE: This implementation was largely migrated from Checkpoint.
|
|
|
|
|
|
|
|
Status s;
|
2024-05-28 16:24:49 +00:00
|
|
|
VectorWalPtr live_wal_files;
|
2021-10-16 17:03:19 +00:00
|
|
|
bool flush_memtable = true;
|
|
|
|
if (!immutable_db_options_.allow_2pc) {
|
2022-05-05 20:08:21 +00:00
|
|
|
if (opts.wal_size_for_flush == std::numeric_limits<uint64_t>::max()) {
|
2021-10-16 17:03:19 +00:00
|
|
|
flush_memtable = false;
|
|
|
|
} else if (opts.wal_size_for_flush > 0) {
|
2024-05-28 16:24:49 +00:00
|
|
|
// If the outstanding WAL files are small, we skip the flush.
|
2021-10-16 17:03:19 +00:00
|
|
|
s = GetSortedWalFiles(live_wal_files);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Don't flush column families if total log size is smaller than
|
|
|
|
// log_size_for_flush. We copy the log files instead.
|
|
|
|
// We may be able to cover 2PC case too.
|
|
|
|
uint64_t total_wal_size = 0;
|
|
|
|
for (auto& wal : live_wal_files) {
|
|
|
|
total_wal_size += wal->SizeFileBytes();
|
|
|
|
}
|
|
|
|
if (total_wal_size < opts.wal_size_for_flush) {
|
|
|
|
flush_memtable = false;
|
|
|
|
}
|
|
|
|
live_wal_files.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// This is a modified version of GetLiveFiles, to get access to more
|
|
|
|
// metadata.
|
|
|
|
mutex_.Lock();
|
|
|
|
if (flush_memtable) {
|
2024-05-21 17:17:34 +00:00
|
|
|
bool wal_locked = lock_wal_count_ > 0;
|
|
|
|
if (wal_locked) {
|
|
|
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
|
|
"Can't FlushForGetLiveFiles while WAL is locked");
|
|
|
|
} else {
|
|
|
|
Status status = FlushForGetLiveFiles();
|
|
|
|
if (!status.ok()) {
|
|
|
|
mutex_.Unlock();
|
|
|
|
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
|
|
|
|
"Cannot Flush data %s\n", status.ToString().c_str());
|
|
|
|
return status;
|
|
|
|
}
|
2021-10-16 17:03:19 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Make a set of all of the live table and blob files
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
VersionStorageInfo& vsi = *cfd->current()->storage_info();
|
|
|
|
auto& cf_paths = cfd->ioptions()->cf_paths;
|
|
|
|
|
|
|
|
auto GetDir = [&](size_t path_id) {
|
|
|
|
// Matching TableFileName() behavior
|
|
|
|
if (path_id >= cf_paths.size()) {
|
|
|
|
assert(false);
|
|
|
|
return cf_paths.back().path;
|
|
|
|
} else {
|
|
|
|
return cf_paths[path_id].path;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
for (int level = 0; level < vsi.num_levels(); ++level) {
|
|
|
|
const auto& level_files = vsi.LevelFiles(level);
|
|
|
|
for (const auto& meta : level_files) {
|
|
|
|
assert(meta);
|
|
|
|
|
|
|
|
results.emplace_back();
|
|
|
|
LiveFileStorageInfo& info = results.back();
|
|
|
|
|
|
|
|
info.relative_filename = MakeTableFileName(meta->fd.GetNumber());
|
|
|
|
info.directory = GetDir(meta->fd.GetPathId());
|
|
|
|
info.file_number = meta->fd.GetNumber();
|
|
|
|
info.file_type = kTableFile;
|
|
|
|
info.size = meta->fd.GetFileSize();
|
|
|
|
if (opts.include_checksum_info) {
|
|
|
|
info.file_checksum_func_name = meta->file_checksum_func_name;
|
|
|
|
info.file_checksum = meta->file_checksum;
|
|
|
|
if (info.file_checksum_func_name.empty()) {
|
|
|
|
info.file_checksum_func_name = kUnknownFileChecksumFuncName;
|
|
|
|
info.file_checksum = kUnknownFileChecksum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
info.temperature = meta->temperature;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
const auto& blob_files = vsi.GetBlobFiles();
|
Use a sorted vector instead of a map to store blob file metadata (#9526)
Summary:
The patch replaces `std::map` with a sorted `std::vector` for
`VersionStorageInfo::blob_files_` and preallocates the space
for the `vector` before saving the `BlobFileMetaData` into the
new `VersionStorageInfo` in `VersionBuilder::Rep::SaveBlobFilesTo`.
These changes reduce the time the DB mutex is held while
saving new `Version`s, and using a sorted `vector` also makes
lookups faster thanks to better memory locality.
In addition, the patch introduces helper methods
`VersionStorageInfo::GetBlobFileMetaData` and
`VersionStorageInfo::GetBlobFileMetaDataLB` that can be used by
clients to perform lookups in the `vector`, and does some general
cleanup in the parts of code where blob file metadata are used.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9526
Test Plan:
Ran `make check` and the crash test script for a while.
Performance was tested using a load-optimized benchmark (`fillseq` with vector memtable, no WAL) and small file sizes so that a significant number of files are produced:
```
numactl --interleave=all ./db_bench --benchmarks=fillseq --allow_concurrent_memtable_write=false --level0_file_num_compaction_trigger=4 --level0_slowdown_writes_trigger=20 --level0_stop_writes_trigger=30 --max_background_jobs=8 --max_write_buffer_number=8 --db=/data/ltamasi-dbbench --wal_dir=/data/ltamasi-dbbench --num=800000000 --num_levels=8 --key_size=20 --value_size=400 --block_size=8192 --cache_size=51539607552 --cache_numshardbits=6 --compression_max_dict_bytes=0 --compression_ratio=0.5 --compression_type=lz4 --bytes_per_sync=8388608 --cache_index_and_filter_blocks=1 --cache_high_pri_pool_ratio=0.5 --benchmark_write_rate_limit=0 --write_buffer_size=16777216 --target_file_size_base=16777216 --max_bytes_for_level_base=67108864 --verify_checksum=1 --delete_obsolete_files_period_micros=62914560 --max_bytes_for_level_multiplier=8 --statistics=0 --stats_per_interval=1 --stats_interval_seconds=20 --histogram=1 --memtablerep=skip_list --bloom_bits=10 --open_files=-1 --subcompactions=1 --compaction_style=0 --min_level_to_compress=3 --level_compaction_dynamic_level_bytes=true --pin_l0_filter_and_index_blocks_in_cache=1 --soft_pending_compaction_bytes_limit=167503724544 --hard_pending_compaction_bytes_limit=335007449088 --min_level_to_compress=0 --use_existing_db=0 --sync=0 --threads=1 --memtablerep=vector --allow_concurrent_memtable_write=false --disable_wal=1 --enable_blob_files=1 --blob_file_size=16777216 --min_blob_size=0 --blob_compression_type=lz4 --enable_blob_garbage_collection=1 --seed=<some value>
```
Final statistics before the patch:
```
Cumulative writes: 0 writes, 700M keys, 0 commit groups, 0.0 writes per commit group, ingest: 284.62 GB, 121.27 MB/s
Interval writes: 0 writes, 334K keys, 0 commit groups, 0.0 writes per commit group, ingest: 139.28 MB, 72.46 MB/s
```
With the patch:
```
Cumulative writes: 0 writes, 760M keys, 0 commit groups, 0.0 writes per commit group, ingest: 308.66 GB, 131.52 MB/s
Interval writes: 0 writes, 445K keys, 0 commit groups, 0.0 writes per commit group, ingest: 185.35 MB, 93.15 MB/s
```
Total time to complete the benchmark is 2611 seconds with the patch, down from 2986 secs.
Reviewed By: riversand963
Differential Revision: D34082728
Pulled By: ltamasi
fbshipit-source-id: fc598abf676dce436734d06bb9d2d99a26a004fc
2022-02-09 20:35:39 +00:00
|
|
|
for (const auto& meta : blob_files) {
|
2021-10-16 17:03:19 +00:00
|
|
|
assert(meta);
|
|
|
|
|
|
|
|
results.emplace_back();
|
|
|
|
LiveFileStorageInfo& info = results.back();
|
|
|
|
|
|
|
|
info.relative_filename = BlobFileName(meta->GetBlobFileNumber());
|
Use a sorted vector instead of a map to store blob file metadata (#9526)
Summary:
The patch replaces `std::map` with a sorted `std::vector` for
`VersionStorageInfo::blob_files_` and preallocates the space
for the `vector` before saving the `BlobFileMetaData` into the
new `VersionStorageInfo` in `VersionBuilder::Rep::SaveBlobFilesTo`.
These changes reduce the time the DB mutex is held while
saving new `Version`s, and using a sorted `vector` also makes
lookups faster thanks to better memory locality.
In addition, the patch introduces helper methods
`VersionStorageInfo::GetBlobFileMetaData` and
`VersionStorageInfo::GetBlobFileMetaDataLB` that can be used by
clients to perform lookups in the `vector`, and does some general
cleanup in the parts of code where blob file metadata are used.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9526
Test Plan:
Ran `make check` and the crash test script for a while.
Performance was tested using a load-optimized benchmark (`fillseq` with vector memtable, no WAL) and small file sizes so that a significant number of files are produced:
```
numactl --interleave=all ./db_bench --benchmarks=fillseq --allow_concurrent_memtable_write=false --level0_file_num_compaction_trigger=4 --level0_slowdown_writes_trigger=20 --level0_stop_writes_trigger=30 --max_background_jobs=8 --max_write_buffer_number=8 --db=/data/ltamasi-dbbench --wal_dir=/data/ltamasi-dbbench --num=800000000 --num_levels=8 --key_size=20 --value_size=400 --block_size=8192 --cache_size=51539607552 --cache_numshardbits=6 --compression_max_dict_bytes=0 --compression_ratio=0.5 --compression_type=lz4 --bytes_per_sync=8388608 --cache_index_and_filter_blocks=1 --cache_high_pri_pool_ratio=0.5 --benchmark_write_rate_limit=0 --write_buffer_size=16777216 --target_file_size_base=16777216 --max_bytes_for_level_base=67108864 --verify_checksum=1 --delete_obsolete_files_period_micros=62914560 --max_bytes_for_level_multiplier=8 --statistics=0 --stats_per_interval=1 --stats_interval_seconds=20 --histogram=1 --memtablerep=skip_list --bloom_bits=10 --open_files=-1 --subcompactions=1 --compaction_style=0 --min_level_to_compress=3 --level_compaction_dynamic_level_bytes=true --pin_l0_filter_and_index_blocks_in_cache=1 --soft_pending_compaction_bytes_limit=167503724544 --hard_pending_compaction_bytes_limit=335007449088 --min_level_to_compress=0 --use_existing_db=0 --sync=0 --threads=1 --memtablerep=vector --allow_concurrent_memtable_write=false --disable_wal=1 --enable_blob_files=1 --blob_file_size=16777216 --min_blob_size=0 --blob_compression_type=lz4 --enable_blob_garbage_collection=1 --seed=<some value>
```
Final statistics before the patch:
```
Cumulative writes: 0 writes, 700M keys, 0 commit groups, 0.0 writes per commit group, ingest: 284.62 GB, 121.27 MB/s
Interval writes: 0 writes, 334K keys, 0 commit groups, 0.0 writes per commit group, ingest: 139.28 MB, 72.46 MB/s
```
With the patch:
```
Cumulative writes: 0 writes, 760M keys, 0 commit groups, 0.0 writes per commit group, ingest: 308.66 GB, 131.52 MB/s
Interval writes: 0 writes, 445K keys, 0 commit groups, 0.0 writes per commit group, ingest: 185.35 MB, 93.15 MB/s
```
Total time to complete the benchmark is 2611 seconds with the patch, down from 2986 secs.
Reviewed By: riversand963
Differential Revision: D34082728
Pulled By: ltamasi
fbshipit-source-id: fc598abf676dce436734d06bb9d2d99a26a004fc
2022-02-09 20:35:39 +00:00
|
|
|
info.directory = GetDir(/* path_id */ 0);
|
2021-10-16 17:03:19 +00:00
|
|
|
info.file_number = meta->GetBlobFileNumber();
|
|
|
|
info.file_type = kBlobFile;
|
|
|
|
info.size = meta->GetBlobFileSize();
|
|
|
|
if (opts.include_checksum_info) {
|
|
|
|
info.file_checksum_func_name = meta->GetChecksumMethod();
|
|
|
|
info.file_checksum = meta->GetChecksumValue();
|
|
|
|
if (info.file_checksum_func_name.empty()) {
|
|
|
|
info.file_checksum_func_name = kUnknownFileChecksumFuncName;
|
|
|
|
info.file_checksum = kUnknownFileChecksum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// TODO?: info.temperature
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Capture some final info before releasing mutex
|
|
|
|
const uint64_t manifest_number = versions_->manifest_file_number();
|
|
|
|
const uint64_t manifest_size = versions_->manifest_file_size();
|
|
|
|
const uint64_t options_number = versions_->options_file_number();
|
|
|
|
const uint64_t options_size = versions_->options_file_size_;
|
|
|
|
const uint64_t min_log_num = MinLogNumberToKeep();
|
|
|
|
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
std::string manifest_fname = DescriptorFileName(manifest_number);
|
|
|
|
{ // MANIFEST
|
|
|
|
results.emplace_back();
|
|
|
|
LiveFileStorageInfo& info = results.back();
|
|
|
|
|
|
|
|
info.relative_filename = manifest_fname;
|
|
|
|
info.directory = GetName();
|
|
|
|
info.file_number = manifest_number;
|
|
|
|
info.file_type = kDescriptorFile;
|
|
|
|
info.size = manifest_size;
|
|
|
|
info.trim_to_size = true;
|
|
|
|
if (opts.include_checksum_info) {
|
|
|
|
info.file_checksum_func_name = kUnknownFileChecksumFuncName;
|
|
|
|
info.file_checksum = kUnknownFileChecksum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
{ // CURRENT
|
|
|
|
results.emplace_back();
|
|
|
|
LiveFileStorageInfo& info = results.back();
|
|
|
|
|
|
|
|
info.relative_filename = kCurrentFileName;
|
|
|
|
info.directory = GetName();
|
|
|
|
info.file_type = kCurrentFile;
|
2022-04-01 23:06:14 +00:00
|
|
|
// CURRENT could be replaced so we have to record the contents as needed.
|
2021-10-16 17:03:19 +00:00
|
|
|
info.replacement_contents = manifest_fname + "\n";
|
|
|
|
info.size = manifest_fname.size() + 1;
|
|
|
|
if (opts.include_checksum_info) {
|
|
|
|
info.file_checksum_func_name = kUnknownFileChecksumFuncName;
|
|
|
|
info.file_checksum = kUnknownFileChecksum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// The OPTIONS file number is zero in read-write mode when OPTIONS file
|
|
|
|
// writing failed and the DB was configured with
|
|
|
|
// `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
|
|
|
|
// number is zero when no OPTIONS file exist at all. In those cases we do not
|
|
|
|
// record any OPTIONS file in the live file list.
|
|
|
|
if (options_number != 0) {
|
|
|
|
results.emplace_back();
|
|
|
|
LiveFileStorageInfo& info = results.back();
|
|
|
|
|
|
|
|
info.relative_filename = OptionsFileName(options_number);
|
|
|
|
info.directory = GetName();
|
|
|
|
info.file_number = options_number;
|
|
|
|
info.file_type = kOptionsFile;
|
|
|
|
info.size = options_size;
|
|
|
|
if (opts.include_checksum_info) {
|
|
|
|
info.file_checksum_func_name = kUnknownFileChecksumFuncName;
|
|
|
|
info.file_checksum = kUnknownFileChecksum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Some legacy testing stuff TODO: carefully clean up obsolete parts
|
|
|
|
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone");
|
|
|
|
|
|
|
|
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
|
|
|
|
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
|
|
|
|
|
|
|
|
if (s.ok()) {
|
2022-06-01 18:02:27 +00:00
|
|
|
// To maximize the effectiveness of track_and_verify_wals_in_manifest,
|
|
|
|
// sync WAL when it is enabled.
|
|
|
|
s = FlushWAL(
|
|
|
|
immutable_db_options_.track_and_verify_wals_in_manifest /* sync */);
|
|
|
|
if (s.IsNotSupported()) { // read-only DB or similar
|
|
|
|
s = Status::OK();
|
|
|
|
}
|
2021-10-16 17:03:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
|
|
|
|
TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2");
|
|
|
|
|
2022-04-01 23:06:14 +00:00
|
|
|
// If we have more than one column family, we also need to get WAL files.
|
2021-10-16 17:03:19 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
s = GetSortedWalFiles(live_wal_files);
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2024-05-28 16:24:49 +00:00
|
|
|
size_t wal_count = live_wal_files.size();
|
2021-10-16 17:03:19 +00:00
|
|
|
// Link WAL files. Copy exact size of last one because it is the only one
|
|
|
|
// that has changes after the last flush.
|
|
|
|
auto wal_dir = immutable_db_options_.GetWalDir();
|
2024-05-28 16:24:49 +00:00
|
|
|
for (size_t i = 0; s.ok() && i < wal_count; ++i) {
|
2021-10-16 17:03:19 +00:00
|
|
|
if ((live_wal_files[i]->Type() == kAliveLogFile) &&
|
|
|
|
(!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num)) {
|
|
|
|
results.emplace_back();
|
|
|
|
LiveFileStorageInfo& info = results.back();
|
|
|
|
auto f = live_wal_files[i]->PathName();
|
|
|
|
assert(!f.empty() && f[0] == '/');
|
|
|
|
info.relative_filename = f.substr(1);
|
|
|
|
info.directory = wal_dir;
|
|
|
|
info.file_number = live_wal_files[i]->LogNumber();
|
|
|
|
info.file_type = kWalFile;
|
|
|
|
info.size = live_wal_files[i]->SizeFileBytes();
|
2024-03-21 19:29:35 +00:00
|
|
|
// Trim the log either if its the last one, or log file recycling is
|
|
|
|
// enabled. In the latter case, a hard link doesn't prevent the file
|
|
|
|
// from being renamed and recycled. So we need to copy it instead.
|
2024-05-28 16:24:49 +00:00
|
|
|
info.trim_to_size = (i + 1 == wal_count) ||
|
2024-03-21 19:29:35 +00:00
|
|
|
(immutable_db_options_.recycle_log_file_num > 0);
|
2021-10-16 17:03:19 +00:00
|
|
|
if (opts.include_checksum_info) {
|
|
|
|
info.file_checksum_func_name = kUnknownFileChecksumFuncName;
|
|
|
|
info.file_checksum = kUnknownFileChecksum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
2022-04-01 23:06:14 +00:00
|
|
|
// Only move results to output on success.
|
2021-10-16 17:03:19 +00:00
|
|
|
*files = std::move(results);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|