2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 21:59:46 +00:00
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
2013-10-05 05:32:05 +00:00
|
|
|
#pragma once
|
2014-01-02 19:26:57 +00:00
|
|
|
|
2013-05-24 19:52:45 +00:00
|
|
|
#include <atomic>
|
2012-03-09 00:23:21 +00:00
|
|
|
#include <deque>
|
2016-04-18 18:11:51 +00:00
|
|
|
#include <functional>
|
2014-07-03 22:47:02 +00:00
|
|
|
#include <limits>
|
2014-11-07 19:50:34 +00:00
|
|
|
#include <list>
|
2016-11-12 04:45:47 +00:00
|
|
|
#include <map>
|
2015-09-02 20:58:22 +00:00
|
|
|
#include <set>
|
2014-02-26 18:03:34 +00:00
|
|
|
#include <string>
|
2023-01-24 17:54:04 +00:00
|
|
|
#include <unordered_map>
|
2015-09-02 20:58:22 +00:00
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
2013-12-31 02:33:57 +00:00
|
|
|
|
2014-01-24 22:30:28 +00:00
|
|
|
#include "db/column_family.h"
|
Memtable "MemPurge" prototype (#8454)
Summary:
Implement an experimental feature called "MemPurge", which consists in purging "garbage" bytes out of a memtable and reuse the memtable struct instead of making it immutable and eventually flushing its content to storage.
The prototype is by default deactivated and is not intended for use. It is intended for correctness and validation testing. At the moment, the "MemPurge" feature can be switched on by using the `options.experimental_allow_mempurge` flag. For this early stage, when the allow_mempurge flag is set to `true`, all the flush operations will be rerouted to perform a MemPurge. This is a temporary design decision that will give us the time to explore meaningful heuristics to use MemPurge at the right time for relevant workloads . Moreover, the current MemPurge operation only supports `Puts`, `Deletes`, `DeleteRange` operations, and handles `Iterators` as well as `CompactionFilter`s that are invoked at flush time .
Three unit tests are added to `db_flush_test.cc` to test if MemPurge works correctly (and checks that the previously mentioned operations are fully supported thoroughly tested).
One noticeable design decision is the timing of the MemPurge operation in the memtable workflow: for this prototype, the mempurge happens when the memtable is switched (and usually made immutable). This is an inefficient process because it implies that the entirety of the MemPurge operation happens while holding the db_mutex. Future commits will make the MemPurge operation a background task (akin to the regular flush operation) and aim at drastically enhancing the performance of this operation. The MemPurge is also not fully "WAL-compatible" yet, but when the WAL is full, or when the regular MemPurge operation fails (or when the purged memtable still needs to be flushed), a regular flush operation takes place. Later commits will also correct these behaviors.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8454
Reviewed By: anand1976
Differential Revision: D29433971
Pulled By: bjlemaire
fbshipit-source-id: 6af48213554e35048a7e03816955100a80a26dc5
2021-07-02 12:22:03 +00:00
|
|
|
#include "db/compaction/compaction_iterator.h"
|
2019-05-31 18:52:59 +00:00
|
|
|
#include "db/compaction/compaction_job.h"
|
2018-06-28 19:23:57 +00:00
|
|
|
#include "db/error_handler.h"
|
|
|
|
#include "db/event_helpers.h"
|
2018-08-23 17:04:10 +00:00
|
|
|
#include "db/external_sst_file_ingestion_job.h"
|
2015-06-02 21:12:23 +00:00
|
|
|
#include "db/flush_job.h"
|
2015-09-02 20:58:22 +00:00
|
|
|
#include "db/flush_scheduler.h"
|
Export Import sst files (#5495)
Summary:
Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135
This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469
"Add support for taking snapshot of a column family and creating column family from a given CF snapshot"
We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality.
(1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below.
// Exports all live SST files of a specified Column Family onto export_dir,
// returning SST files information in metadata.
// - SST files will be created as hard links when the directory specified
// is in the same partition as the db directory, copied otherwise.
// - export_dir should not already exist and will be created by this API.
// - Always triggers a flush.
virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
const std::string& export_dir,
ExportImportFilesMetaData** metadata);
Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through
metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by
returning the list of file metadata.
(2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below.
// CreateColumnFamilyWithImport() will create a new column family with
// column_family_name and import external SST files specified in metadata into
// this column family.
// (1) External SST files can be created using SstFileWriter.
// (2) External SST files can be exported from a particular column family in
// an existing DB.
// Option in import_options specifies whether the external files are copied or
// moved (default is copy). When option specifies copy, managing files at
// external_file_path is caller's responsibility. When option specifies a
// move, the call ensures that the specified files at external_file_path are
// deleted on successful return and files are not modified on any error
// return.
// On error return, column family handle returned will be nullptr.
// ColumnFamily will be present on successful return and will not be present
// on error return. ColumnFamily may be present on any crash during this call.
virtual Status CreateColumnFamilyWithImport(
const ColumnFamilyOptions& options, const std::string& column_family_name,
const ImportColumnFamilyOptions& import_options,
const ExportImportFilesMetaData& metadata,
ColumnFamilyHandle** handle);
Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported.
If incoming sequence number is higher than current local sequence number, local sequence
number is updated to reflect this.
Note, as the sst files is are being moved across Column Families, Column Family name in sst file
will no longer match the actual column family on destination DB. The API does not modify Column
Family name or id in the sst files being imported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495
Differential Revision: D16018881
fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 19:22:21 +00:00
|
|
|
#include "db/import_column_family_job.h"
|
2015-09-02 20:58:22 +00:00
|
|
|
#include "db/internal_stats.h"
|
2015-08-07 00:59:05 +00:00
|
|
|
#include "db/log_writer.h"
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
#include "db/logs_with_prep_tracker.h"
|
2019-05-31 22:21:36 +00:00
|
|
|
#include "db/memtable_list.h"
|
2022-08-26 01:52:37 +00:00
|
|
|
#include "db/periodic_task_scheduler.h"
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
#include "db/post_memtable_callback.h"
|
2017-12-01 07:39:56 +00:00
|
|
|
#include "db/pre_release_callback.h"
|
2018-12-18 01:26:56 +00:00
|
|
|
#include "db/range_del_aggregator.h"
|
2017-09-11 15:58:52 +00:00
|
|
|
#include "db/read_callback.h"
|
2022-07-15 04:49:34 +00:00
|
|
|
#include "db/seqno_to_time_mapping.h"
|
2017-10-06 17:26:38 +00:00
|
|
|
#include "db/snapshot_checker.h"
|
2015-08-07 00:59:05 +00:00
|
|
|
#include "db/snapshot_impl.h"
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
#include "db/trim_history_scheduler.h"
|
2013-11-12 19:53:26 +00:00
|
|
|
#include "db/version_edit.h"
|
2014-10-30 00:43:37 +00:00
|
|
|
#include "db/wal_manager.h"
|
2015-09-02 20:58:22 +00:00
|
|
|
#include "db/write_controller.h"
|
|
|
|
#include "db/write_thread.h"
|
2019-06-01 00:19:43 +00:00
|
|
|
#include "logging/event_logger.h"
|
2017-04-06 02:02:00 +00:00
|
|
|
#include "monitoring/instrumented_mutex.h"
|
|
|
|
#include "options/db_options.h"
|
2014-01-02 19:26:57 +00:00
|
|
|
#include "port/port.h"
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/memtablerep.h"
|
2017-04-06 00:14:05 +00:00
|
|
|
#include "rocksdb/status.h"
|
2018-08-01 07:14:43 +00:00
|
|
|
#include "rocksdb/trace_reader_writer.h"
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/transaction_log.h"
|
2021-08-12 02:31:44 +00:00
|
|
|
#include "rocksdb/utilities/replayer.h"
|
2016-06-21 01:01:03 +00:00
|
|
|
#include "rocksdb/write_buffer_manager.h"
|
Memtable "MemPurge" prototype (#8454)
Summary:
Implement an experimental feature called "MemPurge", which consists in purging "garbage" bytes out of a memtable and reuse the memtable struct instead of making it immutable and eventually flushing its content to storage.
The prototype is by default deactivated and is not intended for use. It is intended for correctness and validation testing. At the moment, the "MemPurge" feature can be switched on by using the `options.experimental_allow_mempurge` flag. For this early stage, when the allow_mempurge flag is set to `true`, all the flush operations will be rerouted to perform a MemPurge. This is a temporary design decision that will give us the time to explore meaningful heuristics to use MemPurge at the right time for relevant workloads . Moreover, the current MemPurge operation only supports `Puts`, `Deletes`, `DeleteRange` operations, and handles `Iterators` as well as `CompactionFilter`s that are invoked at flush time .
Three unit tests are added to `db_flush_test.cc` to test if MemPurge works correctly (and checks that the previously mentioned operations are fully supported thoroughly tested).
One noticeable design decision is the timing of the MemPurge operation in the memtable workflow: for this prototype, the mempurge happens when the memtable is switched (and usually made immutable). This is an inefficient process because it implies that the entirety of the MemPurge operation happens while holding the db_mutex. Future commits will make the MemPurge operation a background task (akin to the regular flush operation) and aim at drastically enhancing the performance of this operation. The MemPurge is also not fully "WAL-compatible" yet, but when the WAL is full, or when the regular MemPurge operation fails (or when the purged memtable still needs to be flushed), a regular flush operation takes place. Later commits will also correct these behaviors.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8454
Reviewed By: anand1976
Differential Revision: D29433971
Pulled By: bjlemaire
fbshipit-source-id: 6af48213554e35048a7e03816955100a80a26dc5
2021-07-02 12:22:03 +00:00
|
|
|
#include "table/merging_iterator.h"
|
2014-01-14 22:49:31 +00:00
|
|
|
#include "util/autovector.h"
|
EventLogger
Summary:
Here's my proposal for making our LOGs easier to read by machines.
The idea is to dump all events as JSON objects. JSON is easy to read by humans, but more importantly, it's easy to read by machines. That way, we can parse this, load into SQLite/mongo and then query or visualize.
I started with table_create and table_delete events, but if everybody agrees, I'll continue by adding more events (flush/compaction/etc etc)
Test Plan:
Ran db_bench. Observed:
2015/01/15-14:13:25.788019 1105ef000 EVENT_LOG_v1 {"time_micros": 1421360005788015, "event": "table_file_creation", "file_number": 12, "file_size": 1909699}
2015/01/15-14:13:25.956500 110740000 EVENT_LOG_v1 {"time_micros": 1421360005956498, "event": "table_file_deletion", "file_number": 12}
Reviewers: yhchiang, rven, dhruba, MarkCallaghan, lgalanis, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31647
2015-03-13 17:15:54 +00:00
|
|
|
#include "util/hash.h"
|
move dump stats to a separate thread (#4382)
Summary:
Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all.
We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior.
Tested with db_bench using `--stats_dump_period_sec=20` in command line:
> LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG content:
> 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606]
** DB Stats **
Uptime(secs): 20.0 total, 20.0 interval
Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s
Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s
Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent
Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s
Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s
Interval stall: 00:00:0.012 H:M:S, 0.1 percent
** Compaction Stats [default] **
Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382
Differential Revision: D9933051
Pulled By: miasantreble
fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 05:52:58 +00:00
|
|
|
#include "util/repeatable_thread.h"
|
2015-09-02 20:58:22 +00:00
|
|
|
#include "util/stop_watch.h"
|
|
|
|
#include "util/thread_local.h"
|
2012-08-14 22:20:36 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2017-10-06 17:26:38 +00:00
|
|
|
class Arena;
|
2017-10-03 16:08:07 +00:00
|
|
|
class ArenaWrappedDBIter;
|
2019-02-20 23:46:59 +00:00
|
|
|
class InMemoryStatsHistoryIterator;
|
2011-03-18 22:37:00 +00:00
|
|
|
class MemTable;
|
2019-06-17 22:17:43 +00:00
|
|
|
class PersistentStatsHistoryIterator;
|
2011-03-18 22:37:00 +00:00
|
|
|
class TableCache;
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
2018-12-13 21:16:04 +00:00
|
|
|
class TaskLimiterToken;
|
2011-03-18 22:37:00 +00:00
|
|
|
class Version;
|
|
|
|
class VersionEdit;
|
|
|
|
class VersionSet;
|
2015-05-29 21:36:35 +00:00
|
|
|
class WriteCallback;
|
2014-11-14 23:43:10 +00:00
|
|
|
struct JobContext;
|
2015-09-23 19:42:43 +00:00
|
|
|
struct ExternalSstFileInfo;
|
2016-06-02 18:57:31 +00:00
|
|
|
struct MemTableInfo;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2019-06-21 17:12:29 +00:00
|
|
|
// Class to maintain directories for all database paths other than main one.
|
|
|
|
class Directories {
|
|
|
|
public:
|
2020-03-03 00:14:00 +00:00
|
|
|
IOStatus SetDirectories(FileSystem* fs, const std::string& dbname,
|
|
|
|
const std::string& wal_dir,
|
|
|
|
const std::vector<DbPath>& data_paths);
|
2019-06-21 17:12:29 +00:00
|
|
|
|
2020-03-03 00:14:00 +00:00
|
|
|
FSDirectory* GetDataDir(size_t path_id) const {
|
2019-06-21 17:12:29 +00:00
|
|
|
assert(path_id < data_dirs_.size());
|
2020-03-03 00:14:00 +00:00
|
|
|
FSDirectory* ret_dir = data_dirs_[path_id].get();
|
2019-06-21 17:12:29 +00:00
|
|
|
if (ret_dir == nullptr) {
|
|
|
|
// Should use db_dir_
|
|
|
|
return db_dir_.get();
|
|
|
|
}
|
|
|
|
return ret_dir;
|
|
|
|
}
|
|
|
|
|
2020-03-03 00:14:00 +00:00
|
|
|
FSDirectory* GetWalDir() {
|
2019-06-21 17:12:29 +00:00
|
|
|
if (wal_dir_) {
|
|
|
|
return wal_dir_.get();
|
|
|
|
}
|
|
|
|
return db_dir_.get();
|
|
|
|
}
|
|
|
|
|
2020-03-03 00:14:00 +00:00
|
|
|
FSDirectory* GetDbDir() { return db_dir_.get(); }
|
2019-06-21 17:12:29 +00:00
|
|
|
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
IOStatus Close(const IOOptions& options, IODebugContext* dbg) {
|
|
|
|
// close all directories for all database paths
|
|
|
|
IOStatus s = IOStatus::OK();
|
2022-06-07 16:49:31 +00:00
|
|
|
|
|
|
|
// The default implementation for Close() in Directory/FSDirectory class
|
|
|
|
// "NotSupported" status, the upper level interface should be able to
|
|
|
|
// handle this error so that Close() does not fail after upgrading when
|
|
|
|
// run on FileSystems that have not implemented `Directory::Close()` or
|
|
|
|
// `FSDirectory::Close()` yet
|
|
|
|
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
if (db_dir_) {
|
Fix serious FSDirectory use-after-Close bug (missing fsync) (#10460)
Summary:
TL;DR: due to a recent change, if you drop a column family,
often that DB will no longer fsync after writing new SST files
to remaining or new column families, which could lead to data
loss on power loss.
More bug detail:
The intent of https://github.com/facebook/rocksdb/issues/10049 was to Close FSDirectory objects at
DB::Close time rather than waiting for DB object destruction.
Unfortunately, it also closes shared FSDirectory objects on
DropColumnFamily (& destroy remaining handles), which can lead
to use-after-Close on FSDirectory shared with remaining column
families. Those "uses" are only Fsyncs (or redundant Closes). In
the default Posix filesystem, an Fsync on a closed FSDirectory is a
quiet no-op. Consequently (under most configurations), if you drop
a column family, that DB will no longer fsync after writing new SST
files to column families sharing the same directory (true under most
configurations).
More fix detail:
Basically, this removes unnecessary Close ops on destroying
ColumnFamilyData. We let `shared_ptr` take care of calling the
destructor at the right time. If the intent was to require Close be
called before destroying FSDirectory, that was not made clear by the
author of FileSystem and was not at all enforced by https://github.com/facebook/rocksdb/issues/10049, which
could have added `assert(fd_ == -1)` to `~PosixDirectory()` but did
not. To keep this fix simple, we relax the unit test for https://github.com/facebook/rocksdb/issues/10049 to allow
timely destruction of FSDirectory to suffice as Close (in
CountedFileSystem). Added a TODO to revisit that.
Also in this PR:
* Added a TODO to share FSDirectory instances between DB and its column
families. (Already shared among column families.)
* Made DB::Close attempt to close all its open FSDirectory objects even
if there is a failure in closing one. Also code clean-up around this
logic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10460
Test Plan:
add an assert to check for use-after-Close. With that
existing tests can detect the misuse. With fix, tests pass (except noted
relaxing of unit test for https://github.com/facebook/rocksdb/issues/10049)
Reviewed By: ajkr
Differential Revision: D38357922
Pulled By: pdillinger
fbshipit-source-id: d42079cadbedf0a969f03389bf586b3b4e1f9137
2022-08-02 17:54:32 +00:00
|
|
|
IOStatus temp_s = db_dir_->Close(options, dbg);
|
|
|
|
if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
|
|
|
|
s = std::move(temp_s);
|
2022-06-07 16:49:31 +00:00
|
|
|
}
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
}
|
|
|
|
|
Fix serious FSDirectory use-after-Close bug (missing fsync) (#10460)
Summary:
TL;DR: due to a recent change, if you drop a column family,
often that DB will no longer fsync after writing new SST files
to remaining or new column families, which could lead to data
loss on power loss.
More bug detail:
The intent of https://github.com/facebook/rocksdb/issues/10049 was to Close FSDirectory objects at
DB::Close time rather than waiting for DB object destruction.
Unfortunately, it also closes shared FSDirectory objects on
DropColumnFamily (& destroy remaining handles), which can lead
to use-after-Close on FSDirectory shared with remaining column
families. Those "uses" are only Fsyncs (or redundant Closes). In
the default Posix filesystem, an Fsync on a closed FSDirectory is a
quiet no-op. Consequently (under most configurations), if you drop
a column family, that DB will no longer fsync after writing new SST
files to column families sharing the same directory (true under most
configurations).
More fix detail:
Basically, this removes unnecessary Close ops on destroying
ColumnFamilyData. We let `shared_ptr` take care of calling the
destructor at the right time. If the intent was to require Close be
called before destroying FSDirectory, that was not made clear by the
author of FileSystem and was not at all enforced by https://github.com/facebook/rocksdb/issues/10049, which
could have added `assert(fd_ == -1)` to `~PosixDirectory()` but did
not. To keep this fix simple, we relax the unit test for https://github.com/facebook/rocksdb/issues/10049 to allow
timely destruction of FSDirectory to suffice as Close (in
CountedFileSystem). Added a TODO to revisit that.
Also in this PR:
* Added a TODO to share FSDirectory instances between DB and its column
families. (Already shared among column families.)
* Made DB::Close attempt to close all its open FSDirectory objects even
if there is a failure in closing one. Also code clean-up around this
logic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10460
Test Plan:
add an assert to check for use-after-Close. With that
existing tests can detect the misuse. With fix, tests pass (except noted
relaxing of unit test for https://github.com/facebook/rocksdb/issues/10049)
Reviewed By: ajkr
Differential Revision: D38357922
Pulled By: pdillinger
fbshipit-source-id: d42079cadbedf0a969f03389bf586b3b4e1f9137
2022-08-02 17:54:32 +00:00
|
|
|
// Attempt to close everything even if one fails
|
|
|
|
s.PermitUncheckedError();
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
|
|
|
|
if (wal_dir_) {
|
Fix serious FSDirectory use-after-Close bug (missing fsync) (#10460)
Summary:
TL;DR: due to a recent change, if you drop a column family,
often that DB will no longer fsync after writing new SST files
to remaining or new column families, which could lead to data
loss on power loss.
More bug detail:
The intent of https://github.com/facebook/rocksdb/issues/10049 was to Close FSDirectory objects at
DB::Close time rather than waiting for DB object destruction.
Unfortunately, it also closes shared FSDirectory objects on
DropColumnFamily (& destroy remaining handles), which can lead
to use-after-Close on FSDirectory shared with remaining column
families. Those "uses" are only Fsyncs (or redundant Closes). In
the default Posix filesystem, an Fsync on a closed FSDirectory is a
quiet no-op. Consequently (under most configurations), if you drop
a column family, that DB will no longer fsync after writing new SST
files to column families sharing the same directory (true under most
configurations).
More fix detail:
Basically, this removes unnecessary Close ops on destroying
ColumnFamilyData. We let `shared_ptr` take care of calling the
destructor at the right time. If the intent was to require Close be
called before destroying FSDirectory, that was not made clear by the
author of FileSystem and was not at all enforced by https://github.com/facebook/rocksdb/issues/10049, which
could have added `assert(fd_ == -1)` to `~PosixDirectory()` but did
not. To keep this fix simple, we relax the unit test for https://github.com/facebook/rocksdb/issues/10049 to allow
timely destruction of FSDirectory to suffice as Close (in
CountedFileSystem). Added a TODO to revisit that.
Also in this PR:
* Added a TODO to share FSDirectory instances between DB and its column
families. (Already shared among column families.)
* Made DB::Close attempt to close all its open FSDirectory objects even
if there is a failure in closing one. Also code clean-up around this
logic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10460
Test Plan:
add an assert to check for use-after-Close. With that
existing tests can detect the misuse. With fix, tests pass (except noted
relaxing of unit test for https://github.com/facebook/rocksdb/issues/10049)
Reviewed By: ajkr
Differential Revision: D38357922
Pulled By: pdillinger
fbshipit-source-id: d42079cadbedf0a969f03389bf586b3b4e1f9137
2022-08-02 17:54:32 +00:00
|
|
|
IOStatus temp_s = wal_dir_->Close(options, dbg);
|
|
|
|
if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
|
|
|
|
s = std::move(temp_s);
|
2022-06-07 16:49:31 +00:00
|
|
|
}
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
}
|
|
|
|
|
Fix serious FSDirectory use-after-Close bug (missing fsync) (#10460)
Summary:
TL;DR: due to a recent change, if you drop a column family,
often that DB will no longer fsync after writing new SST files
to remaining or new column families, which could lead to data
loss on power loss.
More bug detail:
The intent of https://github.com/facebook/rocksdb/issues/10049 was to Close FSDirectory objects at
DB::Close time rather than waiting for DB object destruction.
Unfortunately, it also closes shared FSDirectory objects on
DropColumnFamily (& destroy remaining handles), which can lead
to use-after-Close on FSDirectory shared with remaining column
families. Those "uses" are only Fsyncs (or redundant Closes). In
the default Posix filesystem, an Fsync on a closed FSDirectory is a
quiet no-op. Consequently (under most configurations), if you drop
a column family, that DB will no longer fsync after writing new SST
files to column families sharing the same directory (true under most
configurations).
More fix detail:
Basically, this removes unnecessary Close ops on destroying
ColumnFamilyData. We let `shared_ptr` take care of calling the
destructor at the right time. If the intent was to require Close be
called before destroying FSDirectory, that was not made clear by the
author of FileSystem and was not at all enforced by https://github.com/facebook/rocksdb/issues/10049, which
could have added `assert(fd_ == -1)` to `~PosixDirectory()` but did
not. To keep this fix simple, we relax the unit test for https://github.com/facebook/rocksdb/issues/10049 to allow
timely destruction of FSDirectory to suffice as Close (in
CountedFileSystem). Added a TODO to revisit that.
Also in this PR:
* Added a TODO to share FSDirectory instances between DB and its column
families. (Already shared among column families.)
* Made DB::Close attempt to close all its open FSDirectory objects even
if there is a failure in closing one. Also code clean-up around this
logic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10460
Test Plan:
add an assert to check for use-after-Close. With that
existing tests can detect the misuse. With fix, tests pass (except noted
relaxing of unit test for https://github.com/facebook/rocksdb/issues/10049)
Reviewed By: ajkr
Differential Revision: D38357922
Pulled By: pdillinger
fbshipit-source-id: d42079cadbedf0a969f03389bf586b3b4e1f9137
2022-08-02 17:54:32 +00:00
|
|
|
s.PermitUncheckedError();
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
|
Fix serious FSDirectory use-after-Close bug (missing fsync) (#10460)
Summary:
TL;DR: due to a recent change, if you drop a column family,
often that DB will no longer fsync after writing new SST files
to remaining or new column families, which could lead to data
loss on power loss.
More bug detail:
The intent of https://github.com/facebook/rocksdb/issues/10049 was to Close FSDirectory objects at
DB::Close time rather than waiting for DB object destruction.
Unfortunately, it also closes shared FSDirectory objects on
DropColumnFamily (& destroy remaining handles), which can lead
to use-after-Close on FSDirectory shared with remaining column
families. Those "uses" are only Fsyncs (or redundant Closes). In
the default Posix filesystem, an Fsync on a closed FSDirectory is a
quiet no-op. Consequently (under most configurations), if you drop
a column family, that DB will no longer fsync after writing new SST
files to column families sharing the same directory (true under most
configurations).
More fix detail:
Basically, this removes unnecessary Close ops on destroying
ColumnFamilyData. We let `shared_ptr` take care of calling the
destructor at the right time. If the intent was to require Close be
called before destroying FSDirectory, that was not made clear by the
author of FileSystem and was not at all enforced by https://github.com/facebook/rocksdb/issues/10049, which
could have added `assert(fd_ == -1)` to `~PosixDirectory()` but did
not. To keep this fix simple, we relax the unit test for https://github.com/facebook/rocksdb/issues/10049 to allow
timely destruction of FSDirectory to suffice as Close (in
CountedFileSystem). Added a TODO to revisit that.
Also in this PR:
* Added a TODO to share FSDirectory instances between DB and its column
families. (Already shared among column families.)
* Made DB::Close attempt to close all its open FSDirectory objects even
if there is a failure in closing one. Also code clean-up around this
logic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10460
Test Plan:
add an assert to check for use-after-Close. With that
existing tests can detect the misuse. With fix, tests pass (except noted
relaxing of unit test for https://github.com/facebook/rocksdb/issues/10049)
Reviewed By: ajkr
Differential Revision: D38357922
Pulled By: pdillinger
fbshipit-source-id: d42079cadbedf0a969f03389bf586b3b4e1f9137
2022-08-02 17:54:32 +00:00
|
|
|
for (auto& data_dir_ptr : data_dirs_) {
|
|
|
|
if (data_dir_ptr) {
|
|
|
|
IOStatus temp_s = data_dir_ptr->Close(options, dbg);
|
|
|
|
if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
|
|
|
|
s = std::move(temp_s);
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Fix serious FSDirectory use-after-Close bug (missing fsync) (#10460)
Summary:
TL;DR: due to a recent change, if you drop a column family,
often that DB will no longer fsync after writing new SST files
to remaining or new column families, which could lead to data
loss on power loss.
More bug detail:
The intent of https://github.com/facebook/rocksdb/issues/10049 was to Close FSDirectory objects at
DB::Close time rather than waiting for DB object destruction.
Unfortunately, it also closes shared FSDirectory objects on
DropColumnFamily (& destroy remaining handles), which can lead
to use-after-Close on FSDirectory shared with remaining column
families. Those "uses" are only Fsyncs (or redundant Closes). In
the default Posix filesystem, an Fsync on a closed FSDirectory is a
quiet no-op. Consequently (under most configurations), if you drop
a column family, that DB will no longer fsync after writing new SST
files to column families sharing the same directory (true under most
configurations).
More fix detail:
Basically, this removes unnecessary Close ops on destroying
ColumnFamilyData. We let `shared_ptr` take care of calling the
destructor at the right time. If the intent was to require Close be
called before destroying FSDirectory, that was not made clear by the
author of FileSystem and was not at all enforced by https://github.com/facebook/rocksdb/issues/10049, which
could have added `assert(fd_ == -1)` to `~PosixDirectory()` but did
not. To keep this fix simple, we relax the unit test for https://github.com/facebook/rocksdb/issues/10049 to allow
timely destruction of FSDirectory to suffice as Close (in
CountedFileSystem). Added a TODO to revisit that.
Also in this PR:
* Added a TODO to share FSDirectory instances between DB and its column
families. (Already shared among column families.)
* Made DB::Close attempt to close all its open FSDirectory objects even
if there is a failure in closing one. Also code clean-up around this
logic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10460
Test Plan:
add an assert to check for use-after-Close. With that
existing tests can detect the misuse. With fix, tests pass (except noted
relaxing of unit test for https://github.com/facebook/rocksdb/issues/10049)
Reviewed By: ajkr
Differential Revision: D38357922
Pulled By: pdillinger
fbshipit-source-id: d42079cadbedf0a969f03389bf586b3b4e1f9137
2022-08-02 17:54:32 +00:00
|
|
|
// Ready for caller
|
|
|
|
s.MustCheck();
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2019-06-21 17:12:29 +00:00
|
|
|
private:
|
2020-03-03 00:14:00 +00:00
|
|
|
std::unique_ptr<FSDirectory> db_dir_;
|
|
|
|
std::vector<std::unique_ptr<FSDirectory>> data_dirs_;
|
|
|
|
std::unique_ptr<FSDirectory> wal_dir_;
|
2019-06-21 17:12:29 +00:00
|
|
|
};
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
// While DB is the public interface of RocksDB, and DBImpl is the actual
|
|
|
|
// class implementing it. It's the entrance of the core RocksdB engine.
|
|
|
|
// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a
|
|
|
|
// DBImpl internally.
|
|
|
|
// Other than functions implementing the DB interface, some public
|
|
|
|
// functions are there for other internal components to call. For
|
|
|
|
// example, TransactionDB directly calls DBImpl::WriteImpl() and
|
|
|
|
// BlobDB directly calls DBImpl::GetImpl(). Some other functions
|
|
|
|
// are for sub-components to call. For example, ColumnFamilyHandleImpl
|
|
|
|
// calls DBImpl::FindObsoleteFiles().
|
|
|
|
//
|
|
|
|
// Since it's a very large class, the definition of the functions is
|
|
|
|
// divided in several db_impl_*.cc files, besides db_impl.cc.
|
2011-03-18 22:37:00 +00:00
|
|
|
class DBImpl : public DB {
|
|
|
|
public:
|
2017-11-11 01:18:01 +00:00
|
|
|
DBImpl(const DBOptions& options, const std::string& dbname,
|
Make backups openable as read-only DBs (#8142)
Summary:
A current limitation of backups is that you don't know the
exact database state of when the backup was taken. With this new
feature, you can at least inspect the backup's DB state without
restoring it by opening it as a read-only DB.
Rather than add something like OpenAsReadOnlyDB to the BackupEngine API,
which would inhibit opening stackable DB implementations read-only
(if/when their APIs support it), we instead provide a DB name and Env
that can be used to open as a read-only DB.
Possible follow-up work:
* Add a version of GetBackupInfo for a single backup.
* Let CreateNewBackup return the BackupID of the newly-created backup.
Implementation details:
Refactored ChrootFileSystem to split off new base class RemapFileSystem,
which allows more general remapping of files. We use this base class to
implement BackupEngineImpl::RemapSharedFileSystem.
To minimize API impact, I decided to just add these fields `name_for_open`
and `env_for_open` to those set by GetBackupInfo when
include_file_details=true. Creating the RemapSharedFileSystem adds a bit
to the memory consumption, perhaps unnecessarily in some cases, but this
has been mitigated by (a) only initialize the RemapSharedFileSystem
lazily when GetBackupInfo with include_file_details=true is called, and
(b) using the existing `shared_ptr<FileInfo>` objects to hold most of the
mapping data.
To enhance API safety, RemapSharedFileSystem is wrapped by new
ReadOnlyFileSystem which rejects any attempts to write. This uncovered a
couple of places in which DB::OpenForReadOnly would write to the
filesystem, so I fixed these. Added a release note because this affects
logging.
Additional minor refactoring in backupable_db.cc to support the new
functionality.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8142
Test Plan:
new test (run with ASAN and UBSAN), added to stress test and
ran it for a while with amplified backup_one_in
Reviewed By: ajkr
Differential Revision: D27535408
Pulled By: pdillinger
fbshipit-source-id: 04666d310aa0261ef6b2385c43ca793ce1dfd148
2021-04-06 21:36:45 +00:00
|
|
|
const bool seq_per_batch = false, const bool batch_per_txn = true,
|
|
|
|
bool read_only = false);
|
2019-09-12 01:07:12 +00:00
|
|
|
// No copying allowed
|
|
|
|
DBImpl(const DBImpl&) = delete;
|
|
|
|
void operator=(const DBImpl&) = delete;
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
virtual ~DBImpl();
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
// ---- Implementations of the DB interface ----
|
|
|
|
|
2018-06-28 19:23:57 +00:00
|
|
|
using DB::Resume;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status Resume() override;
|
2018-06-28 19:23:57 +00:00
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Put;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& value) override;
|
|
|
|
Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& ts, const Slice& value) override;
|
|
|
|
|
2022-06-25 22:30:47 +00:00
|
|
|
using DB::PutEntity;
|
|
|
|
Status PutEntity(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const WideColumns& columns) override;
|
2023-11-07 00:52:51 +00:00
|
|
|
Status PutEntity(const WriteOptions& options, const Slice& key,
|
|
|
|
const AttributeGroups& attribute_groups) override;
|
2022-06-25 22:30:47 +00:00
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Merge;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& value) override;
|
2022-11-01 05:28:58 +00:00
|
|
|
Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& ts, const Slice& value) override;
|
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Delete;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) override;
|
|
|
|
Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& ts) override;
|
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
using DB::SingleDelete;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status SingleDelete(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) override;
|
|
|
|
Status SingleDelete(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& ts) override;
|
|
|
|
|
|
|
|
using DB::DeleteRange;
|
|
|
|
Status DeleteRange(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& begin_key,
|
|
|
|
const Slice& end_key) override;
|
2022-09-30 23:13:03 +00:00
|
|
|
Status DeleteRange(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& begin_key,
|
|
|
|
const Slice& end_key, const Slice& ts) override;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Write;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status Write(const WriteOptions& options, WriteBatch* updates) override;
|
2015-05-29 21:36:35 +00:00
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Get;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status Get(const ReadOptions& _read_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
PinnableSlice* value, std::string* timestamp) override;
|
2017-10-18 00:24:25 +00:00
|
|
|
|
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
2022-08-19 18:51:12 +00:00
|
|
|
using DB::GetEntity;
|
|
|
|
Status GetEntity(const ReadOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
PinnableWideColumns* columns) override;
|
2023-11-06 23:04:41 +00:00
|
|
|
Status GetEntity(const ReadOptions& options, const Slice& key,
|
|
|
|
PinnableAttributeGroups* result) override;
|
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
2022-08-19 18:51:12 +00:00
|
|
|
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 21:22:34 +00:00
|
|
|
using DB::GetMergeOperands;
|
|
|
|
Status GetMergeOperands(const ReadOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
PinnableSlice* merge_operands,
|
|
|
|
GetMergeOperandsOptions* get_merge_operands_options,
|
|
|
|
int* number_of_operands) override {
|
|
|
|
GetImplOptions get_impl_options;
|
|
|
|
get_impl_options.column_family = column_family;
|
|
|
|
get_impl_options.merge_operands = merge_operands;
|
|
|
|
get_impl_options.get_merge_operands_options = get_merge_operands_options;
|
|
|
|
get_impl_options.number_of_operands = number_of_operands;
|
|
|
|
get_impl_options.get_value = false;
|
|
|
|
return GetImpl(options, key, get_impl_options);
|
|
|
|
}
|
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::MultiGet;
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
// This MultiGet is a batched version, which may be faster than calling Get
|
|
|
|
// multiple times, especially if the keys have some spatial locality that
|
|
|
|
// enables them to be queried in the same SST files/set of files. The larger
|
|
|
|
// the batch size, the more scope for batching and performance improvement
|
|
|
|
// The values and statuses parameters are arrays with number of elements
|
|
|
|
// equal to keys.size(). This allows the storage for those to be alloacted
|
|
|
|
// by the caller on the stack for small batches
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
void MultiGet(const ReadOptions& _read_options, const size_t num_keys,
|
2023-02-15 17:34:17 +00:00
|
|
|
ColumnFamilyHandle** column_families, const Slice* keys,
|
|
|
|
PinnableSlice* values, std::string* timestamps,
|
|
|
|
Status* statuses, const bool sorted_input = false) override;
|
|
|
|
|
|
|
|
void MultiGetWithCallback(
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
const ReadOptions& _read_options, ColumnFamilyHandle* column_family,
|
2019-11-12 21:51:18 +00:00
|
|
|
ReadCallback* callback,
|
|
|
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
|
2023-02-15 17:34:17 +00:00
|
|
|
using DB::MultiGetEntity;
|
|
|
|
|
|
|
|
void MultiGetEntity(const ReadOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, size_t num_keys,
|
|
|
|
const Slice* keys, PinnableWideColumns* results,
|
|
|
|
Status* statuses, bool sorted_input) override;
|
|
|
|
|
|
|
|
void MultiGetEntity(const ReadOptions& options, size_t num_keys,
|
|
|
|
ColumnFamilyHandle** column_families, const Slice* keys,
|
|
|
|
PinnableWideColumns* results, Status* statuses,
|
|
|
|
bool sorted_input) override;
|
AttributeGroups - MultiGetEntity Implementation (#11925)
Summary:
Introducing the notion of AttributeGroup by adding the `MultiGetEntity()` API retrieving `PinnableAttributeGroups`.
An "attribute group" refers to a logical grouping of wide-column entities within RocksDB. These attribute groups are implemented using column families.
Users can store WideColumns in different CFs for various reasons (e.g. similar access patterns, same types, etc.). This new API `MultiGetEntity()` takes keys and `PinnableAttributeGroups` per key. `PinnableAttributeGroups` is just a list of `PinnableAttributeGroup`s in which we have `ColumnFamilyHandle*`, `Status`, and `PinnableWideColumns`.
Let's say a user stored "hot" wide columns in column family "hot_data_cf" and "cold" wide columns in column family "cold_data_cf" and all other columns in "common_cf".
Prior to this PR, if the user wants to query for two keys, "key_1" and "key_2" and but only interested in "common_cf" and "hot_data_cf" for "key_1", and "common_cf" and "cold_data_cf" for "key_2", the user would have to construct input like `keys = ["key_1", "key_1", "key_2", "key_2"]`, `column_families = ["common_cf", "hot_data_cf", "common_cf", "cold_data_cf"]` and get the flat list of `PinnableWideColumns` to find the corresponding <key,CF> combo.
With the new `MultiGetEntity()` introduced in this PR, users can now query only `["common_cf", "hot_data_cf"]` for `"key_1"`, and only `["common_cf", "cold_data_cf"]` for `"key_2"`. The user will get `PinnableAttributeGroups` for each key, and `PinnableAttributeGroups` gives a list of `PinnableAttributeGroup`s where the user can find column family and corresponding `PinnableWideColumns` and the `Status`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11925
Test Plan:
- `DBWideBasicTest::MultiCFMultiGetEntityAsPinnableAttributeGroups` added
will enable this new API in the `db_stress` in a separate PR
Reviewed By: ltamasi
Differential Revision: D50017414
Pulled By: jaykorean
fbshipit-source-id: 643611d1273c574bc81b94c6f5aeea24b40c4586
2023-10-13 22:58:03 +00:00
|
|
|
void MultiGetEntity(const ReadOptions& options, size_t num_keys,
|
|
|
|
const Slice* keys,
|
|
|
|
PinnableAttributeGroups* results) override;
|
2023-02-15 17:34:17 +00:00
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
|
|
|
|
const std::string& column_family,
|
|
|
|
ColumnFamilyHandle** handle) override {
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
|
|
return CreateColumnFamily(ReadOptions(), WriteOptions(), cf_options,
|
|
|
|
column_family, handle);
|
|
|
|
}
|
|
|
|
virtual Status CreateColumnFamily(const ReadOptions& read_options,
|
|
|
|
const WriteOptions& write_options,
|
|
|
|
const ColumnFamilyOptions& cf_options,
|
|
|
|
const std::string& column_family,
|
|
|
|
ColumnFamilyHandle** handle);
|
2024-01-31 21:14:42 +00:00
|
|
|
Status CreateColumnFamilies(
|
2017-05-08 05:12:55 +00:00
|
|
|
const ColumnFamilyOptions& cf_options,
|
|
|
|
const std::vector<std::string>& column_family_names,
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
std::vector<ColumnFamilyHandle*>* handles) override {
|
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
|
|
return CreateColumnFamilies(ReadOptions(), WriteOptions(), cf_options,
|
|
|
|
column_family_names, handles);
|
|
|
|
}
|
|
|
|
virtual Status CreateColumnFamilies(
|
|
|
|
const ReadOptions& read_options, const WriteOptions& write_options,
|
|
|
|
const ColumnFamilyOptions& cf_options,
|
|
|
|
const std::vector<std::string>& column_family_names,
|
|
|
|
std::vector<ColumnFamilyHandle*>* handles);
|
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status CreateColumnFamilies(
|
2017-05-08 05:12:55 +00:00
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
std::vector<ColumnFamilyHandle*>* handles) override {
|
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
|
|
return CreateColumnFamilies(ReadOptions(), WriteOptions(), column_families,
|
|
|
|
handles);
|
|
|
|
}
|
|
|
|
virtual Status CreateColumnFamilies(
|
|
|
|
const ReadOptions& read_options, const WriteOptions& write_options,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
|
|
std::vector<ColumnFamilyHandle*>* handles);
|
2024-01-31 21:14:42 +00:00
|
|
|
Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
|
|
|
|
Status DropColumnFamilies(
|
2017-05-08 05:12:55 +00:00
|
|
|
const std::vector<ColumnFamilyHandle*>& column_families) override;
|
2014-01-02 17:08:12 +00:00
|
|
|
|
2013-07-26 19:57:01 +00:00
|
|
|
// Returns false if key doesn't exist in the database and true if it may.
|
|
|
|
// If value_found is not passed in as null, then return the value if found in
|
|
|
|
// memory. On return, if value was found, then value_found will be set to true
|
|
|
|
// , otherwise false.
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::KeyMayExist;
|
2024-01-31 21:14:42 +00:00
|
|
|
bool KeyMayExist(const ReadOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
std::string* value, std::string* timestamp,
|
|
|
|
bool* value_found = nullptr) override;
|
2017-10-03 16:08:07 +00:00
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::NewIterator;
|
2024-01-31 21:14:42 +00:00
|
|
|
Iterator* NewIterator(const ReadOptions& _read_options,
|
|
|
|
ColumnFamilyHandle* column_family) override;
|
|
|
|
Status NewIterators(const ReadOptions& _read_options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_families,
|
|
|
|
std::vector<Iterator*>* iterators) override;
|
|
|
|
|
|
|
|
const Snapshot* GetSnapshot() override;
|
|
|
|
void ReleaseSnapshot(const Snapshot* snapshot) override;
|
Introduce MultiCfIterator (#12153)
Summary:
This PR introduces a new implementation of `Iterator` via a new public API called `NewMultiCfIterator()`. The new API takes a vector of column family handles to build a cross-column-family iterator, which internally maintains multiple `DBIter`s as child iterators from a consistent database state. When a key exists in multiple column families, the iterator selects the value (and wide columns) from the first column family containing the key, following the order provided in the `column_families` parameter. Similar to the merging iterator, a min heap is used to iterate across the child iterators. Backward iteration and direction change functionalities will be implemented in future PRs.
The comparator used to compare keys across different column families will be derived from the iterator of the first column family specified in `column_families`. This comparator will be checked against the comparators from all other column families that the iterator will traverse. If there's a mismatch with any of the comparators, the initialization of the iterator will fail.
Please note that this PR is not enough for users to start using `MultiCfIterator`. The `MultiCfIterator` and related APIs are still marked as "**DO NOT USE - UNDER CONSTRUCTION**". This PR is just the first of many PRs that will follow soon.
This PR includes the following:
- Introduction and partial implementation of the `MultiCfIterator`, which implements the generic `Iterator` interface. The implementation includes the construction of the iterator, `SeekToFirst()`, `Next()`, `Valid()`, `key()`, `value()`, and `columns()`.
- Unit tests to verify iteration across multiple column families in two distinct scenarios: (1) keys are unique across all column families, and (2) the same keys exist in multiple column families.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12153
Reviewed By: pdillinger
Differential Revision: D52308697
Pulled By: jaykorean
fbshipit-source-id: b03e69f13b40af5a8f0598d0f43a0bec01ef8294
2024-03-05 18:22:43 +00:00
|
|
|
|
|
|
|
// UNDER CONSTRUCTION - DO NOT USE
|
|
|
|
// Return a cross-column-family iterator from a consistent database state.
|
|
|
|
std::unique_ptr<Iterator> NewMultiCfIterator(
|
|
|
|
const ReadOptions& options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_families) override;
|
|
|
|
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
// Create a timestamped snapshot. This snapshot can be shared by multiple
|
|
|
|
// readers. If any of them uses it for write conflict checking, then
|
|
|
|
// is_write_conflict_boundary is true. For simplicity, set it to true by
|
|
|
|
// default.
|
|
|
|
std::pair<Status, std::shared_ptr<const Snapshot>> CreateTimestampedSnapshot(
|
|
|
|
SequenceNumber snapshot_seq, uint64_t ts);
|
|
|
|
std::shared_ptr<const SnapshotImpl> GetTimestampedSnapshot(uint64_t ts) const;
|
|
|
|
void ReleaseTimestampedSnapshotsOlderThan(
|
|
|
|
uint64_t ts, size_t* remaining_total_ss = nullptr);
|
|
|
|
Status GetTimestampedSnapshots(uint64_t ts_lb, uint64_t ts_ub,
|
|
|
|
std::vector<std::shared_ptr<const Snapshot>>&
|
|
|
|
timestamped_snapshots) const;
|
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::GetProperty;
|
2024-01-31 21:14:42 +00:00
|
|
|
bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property,
|
|
|
|
std::string* value) override;
|
2016-11-12 04:45:47 +00:00
|
|
|
using DB::GetMapProperty;
|
2024-01-31 21:14:42 +00:00
|
|
|
bool GetMapProperty(ColumnFamilyHandle* column_family, const Slice& property,
|
|
|
|
std::map<std::string, std::string>* value) override;
|
2014-07-28 22:28:53 +00:00
|
|
|
using DB::GetIntProperty;
|
2024-01-31 21:14:42 +00:00
|
|
|
bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property,
|
|
|
|
uint64_t* value) override;
|
2015-11-03 23:54:18 +00:00
|
|
|
using DB::GetAggregatedIntProperty;
|
2024-01-31 21:14:42 +00:00
|
|
|
bool GetAggregatedIntProperty(const Slice& property,
|
|
|
|
uint64_t* aggregated_value) override;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::GetApproximateSizes;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status GetApproximateSizes(const SizeApproximationOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Range* range, int n,
|
|
|
|
uint64_t* sizes) override;
|
2017-02-06 22:42:38 +00:00
|
|
|
using DB::GetApproximateMemTableStats;
|
2024-01-31 21:14:42 +00:00
|
|
|
void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
|
|
|
|
const Range& range, uint64_t* const count,
|
|
|
|
uint64_t* const size) override;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::CompactRange;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status CompactRange(const CompactRangeOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice* begin,
|
|
|
|
const Slice* end) override;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 22:45:18 +00:00
|
|
|
using DB::CompactFiles;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status CompactFiles(
|
2018-12-13 22:12:02 +00:00
|
|
|
const CompactionOptions& compact_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const std::vector<std::string>& input_file_names, const int output_level,
|
|
|
|
const int output_path_id = -1,
|
|
|
|
std::vector<std::string>* const output_file_names = nullptr,
|
|
|
|
CompactionJobInfo* compaction_job_info = nullptr) override;
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 22:45:18 +00:00
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status PauseBackgroundWork() override;
|
|
|
|
Status ContinueBackgroundWork() override;
|
2015-10-02 20:17:34 +00:00
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status EnableAutoCompaction(
|
2015-11-20 09:07:24 +00:00
|
|
|
const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
|
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
void EnableManualCompaction() override;
|
|
|
|
void DisableManualCompaction() override;
|
2019-09-17 04:00:13 +00:00
|
|
|
|
2014-09-17 19:49:13 +00:00
|
|
|
using DB::SetOptions;
|
2015-02-26 19:28:41 +00:00
|
|
|
Status SetOptions(
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const std::unordered_map<std::string, std::string>& options_map) override;
|
2014-09-17 19:49:13 +00:00
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status SetDBOptions(
|
2016-10-14 19:25:39 +00:00
|
|
|
const std::unordered_map<std::string, std::string>& options_map) override;
|
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::NumberLevels;
|
2024-01-31 21:14:42 +00:00
|
|
|
int NumberLevels(ColumnFamilyHandle* column_family) override;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::MaxMemCompactionLevel;
|
2024-01-31 21:14:42 +00:00
|
|
|
int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Level0StopWriteTrigger;
|
2024-01-31 21:14:42 +00:00
|
|
|
int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) override;
|
|
|
|
const std::string& GetName() const override;
|
|
|
|
Env* GetEnv() const override;
|
|
|
|
FileSystem* GetFileSystem() const override;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::GetOptions;
|
2024-01-31 21:14:42 +00:00
|
|
|
Options GetOptions(ColumnFamilyHandle* column_family) const override;
|
2015-05-11 21:51:51 +00:00
|
|
|
using DB::GetDBOptions;
|
2024-01-31 21:14:42 +00:00
|
|
|
DBOptions GetDBOptions() const override;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Flush;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status Flush(const FlushOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family) override;
|
|
|
|
Status Flush(
|
2018-10-26 22:06:44 +00:00
|
|
|
const FlushOptions& options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_families) override;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status FlushWAL(bool sync) override {
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
|
|
return FlushWAL(WriteOptions(), sync);
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual Status FlushWAL(const WriteOptions& write_options, bool sync);
|
Cleanup, improve, stress test LockWAL() (#11143)
Summary:
The previous API comments for LockWAL didn't provide much about why you might want to use it, and didn't really meet what one would infer its contract was. Also, LockWAL was not in db_stress / crash test. In this change:
* Implement a counting semantics for LockWAL()+UnlockWAL(), so that they can safely be used concurrently across threads or recursively within a thread. This should make the API much less bug-prone and easier to use.
* Make sure no UnlockWAL() is needed after non-OK LockWAL() (to match RocksDB conventions)
* Make UnlockWAL() reliably return non-OK when there's no matching LockWAL() (for debug-ability)
* Clarify API comments on LockWAL(), UnlockWAL(), FlushWAL(), and SyncWAL(). Their exact meanings are not obvious, and I don't think it's appropriate to talk about implementation mutexes in the API comments, but about what operations might block each other.
* Add LockWAL()/UnlockWAL() to db_stress and crash test, mostly to check for assertion failures, but also checks that latest seqno doesn't change while WAL is locked. This is simpler to add when LockWAL() is allowed in multiple threads.
* Remove unnecessary use of sync points in test DBWALTest::LockWal. There was a bug during development of above changes that caused this test to fail sporadically, with and without this sync point change.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11143
Test Plan: unit tests added / updated, added to stress/crash test
Reviewed By: ajkr
Differential Revision: D42848627
Pulled By: pdillinger
fbshipit-source-id: 6d976c51791941a31fd8fbf28b0f82e888d9f4b4
2023-01-31 06:52:30 +00:00
|
|
|
bool WALBufferIsEmpty();
|
2024-01-31 21:14:42 +00:00
|
|
|
Status SyncWAL() override;
|
|
|
|
Status LockWAL() override;
|
|
|
|
Status UnlockWAL() override;
|
2014-04-15 20:39:26 +00:00
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
SequenceNumber GetLatestSequenceNumber() const override;
|
2014-04-15 20:39:26 +00:00
|
|
|
|
2021-12-23 19:02:43 +00:00
|
|
|
// IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire
|
|
|
|
// and release db_mutex
|
|
|
|
Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
|
|
|
|
std::string ts_low) override;
|
|
|
|
|
|
|
|
// GetFullHistoryTsLow(ColumnFamilyHandle*, std::string*) will acquire and
|
|
|
|
// release db_mutex
|
|
|
|
Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
|
|
|
|
std::string* ts_low) override;
|
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status GetDbIdentity(std::string& identity) const override;
|
2019-05-24 20:05:58 +00:00
|
|
|
|
2019-09-03 15:50:47 +00:00
|
|
|
virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
|
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status GetDbSessionId(std::string& session_id) const override;
|
2020-06-15 17:45:03 +00:00
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
ColumnFamilyHandle* DefaultColumnFamily() const override;
|
|
|
|
|
2019-06-17 22:17:43 +00:00
|
|
|
ColumnFamilyHandle* PersistentStatsColumnFamily() const;
|
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status Close() override;
|
2019-05-24 20:05:58 +00:00
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status DisableFileDeletions() override;
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
|
2024-02-14 02:36:25 +00:00
|
|
|
Status EnableFileDeletions() override;
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
|
|
|
|
virtual bool IsFileDeletionsEnabled() const;
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
Status GetStatsHistory(
|
|
|
|
uint64_t start_time, uint64_t end_time,
|
|
|
|
std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
|
|
|
|
|
2017-04-18 23:30:51 +00:00
|
|
|
using DB::ResetStats;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status ResetStats() override;
|
2013-11-08 23:23:46 +00:00
|
|
|
// All the returned filenames start with "/"
|
2024-01-31 21:14:42 +00:00
|
|
|
Status GetLiveFiles(std::vector<std::string>&, uint64_t* manifest_file_size,
|
|
|
|
bool flush_memtable = true) override;
|
|
|
|
Status GetSortedWalFiles(VectorLogPtr& files) override;
|
|
|
|
Status GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) override;
|
|
|
|
Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override;
|
|
|
|
|
|
|
|
Status GetUpdatesSince(
|
2018-11-09 19:17:34 +00:00
|
|
|
SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
|
|
|
|
const TransactionLogIterator::ReadOptions& read_options =
|
|
|
|
TransactionLogIterator::ReadOptions()) override;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status DeleteFile(std::string name) override;
|
2018-01-30 21:46:08 +00:00
|
|
|
Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
|
|
|
|
const RangePtr* ranges, size_t n,
|
|
|
|
bool include_end = true);
|
2013-08-22 21:32:53 +00:00
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) override;
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 22:45:18 +00:00
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) override;
|
2020-07-03 01:13:31 +00:00
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Status GetLiveFilesStorageInfo(
|
2021-10-16 17:03:19 +00:00
|
|
|
const LiveFilesStorageInfoOptions& opts,
|
|
|
|
std::vector<LiveFileStorageInfo>* files) override;
|
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 22:45:18 +00:00
|
|
|
// Obtains the meta data of the specified column family of the DB.
|
|
|
|
// TODO(yhchiang): output parameter is placed in the end in this codebase.
|
2024-01-31 21:14:42 +00:00
|
|
|
void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
|
|
|
|
ColumnFamilyMetaData* metadata) override;
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 22:45:18 +00:00
|
|
|
|
2021-06-28 15:12:32 +00:00
|
|
|
void GetAllColumnFamilyMetaData(
|
|
|
|
std::vector<ColumnFamilyMetaData>* metadata) override;
|
|
|
|
|
Add experimental API MarkForCompaction()
Summary:
Some Mongo+Rocks datasets in Parse's environment are not doing compactions very frequently. During the quiet period (with no IO), we'd like to schedule compactions so that our reads become faster. Also, aggressively compacting during quiet periods helps when write bursts happen. In addition, we also want to compact files that are containing deleted key ranges (like old oplog keys).
All of this is currently not possible with CompactRange() because it's single-threaded and blocks all other compactions from happening. Running CompactRange() risks an issue of blocking writes because we generate too much Level 0 files before the compaction is over. Stopping writes is very dangerous because they hold transaction locks. We tried running manual compaction once on Mongo+Rocks and everything fell apart.
MarkForCompaction() solves all of those problems. This is very light-weight manual compaction. It is lower priority than automatic compactions, which means it shouldn't interfere with background process keeping the LSM tree clean. However, if no automatic compactions need to be run (or we have extra background threads available), we will start compacting files that are marked for compaction.
Test Plan: added a new unit test
Reviewers: yhchiang, rven, MarkCallaghan, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37083
2015-04-17 23:44:45 +00:00
|
|
|
Status SuggestCompactRange(ColumnFamilyHandle* column_family,
|
2017-07-28 23:23:50 +00:00
|
|
|
const Slice* begin, const Slice* end) override;
|
Add experimental API MarkForCompaction()
Summary:
Some Mongo+Rocks datasets in Parse's environment are not doing compactions very frequently. During the quiet period (with no IO), we'd like to schedule compactions so that our reads become faster. Also, aggressively compacting during quiet periods helps when write bursts happen. In addition, we also want to compact files that are containing deleted key ranges (like old oplog keys).
All of this is currently not possible with CompactRange() because it's single-threaded and blocks all other compactions from happening. Running CompactRange() risks an issue of blocking writes because we generate too much Level 0 files before the compaction is over. Stopping writes is very dangerous because they hold transaction locks. We tried running manual compaction once on Mongo+Rocks and everything fell apart.
MarkForCompaction() solves all of those problems. This is very light-weight manual compaction. It is lower priority than automatic compactions, which means it shouldn't interfere with background process keeping the LSM tree clean. However, if no automatic compactions need to be run (or we have extra background threads available), we will start compacting files that are marked for compaction.
Test Plan: added a new unit test
Reviewers: yhchiang, rven, MarkCallaghan, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37083
2015-04-17 23:44:45 +00:00
|
|
|
|
2017-07-28 23:23:50 +00:00
|
|
|
Status PromoteL0(ColumnFamilyHandle* column_family,
|
|
|
|
int target_level) override;
|
Implement DB::PromoteL0 method
Summary:
This diff implements a new `DB` method `PromoteL0` which moves all files in L0
to a given level skipping compaction, provided that the files have disjoint
ranges and all levels up to the target level are empty.
This method provides finer-grain control for trivial compactions, and it is
useful for bulk-loading pre-sorted keys. Compared to D34797, it does not change
the semantics of an existing operation, which can impact existing code.
PromoteL0 is designed to work well in combination with the proposed
`GetSstFileWriter`/`AddFile` interface, enabling to "design" the level structure
by populating one level at a time. Such fine-grained control can be very useful
for static or mostly-static databases.
Test Plan: `make check`
Reviewers: IslamAbdelRahman, philipp, MarkCallaghan, yhchiang, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D37107
2015-04-23 19:10:36 +00:00
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
using DB::IngestExternalFile;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status IngestExternalFile(
|
2019-05-24 20:05:58 +00:00
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const std::vector<std::string>& external_files,
|
|
|
|
const IngestExternalFileOptions& ingestion_options) override;
|
|
|
|
|
|
|
|
using DB::IngestExternalFiles;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status IngestExternalFiles(
|
2019-05-24 20:05:58 +00:00
|
|
|
const std::vector<IngestExternalFileArg>& args) override;
|
|
|
|
|
Export Import sst files (#5495)
Summary:
Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135
This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469
"Add support for taking snapshot of a column family and creating column family from a given CF snapshot"
We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality.
(1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below.
// Exports all live SST files of a specified Column Family onto export_dir,
// returning SST files information in metadata.
// - SST files will be created as hard links when the directory specified
// is in the same partition as the db directory, copied otherwise.
// - export_dir should not already exist and will be created by this API.
// - Always triggers a flush.
virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
const std::string& export_dir,
ExportImportFilesMetaData** metadata);
Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through
metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by
returning the list of file metadata.
(2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below.
// CreateColumnFamilyWithImport() will create a new column family with
// column_family_name and import external SST files specified in metadata into
// this column family.
// (1) External SST files can be created using SstFileWriter.
// (2) External SST files can be exported from a particular column family in
// an existing DB.
// Option in import_options specifies whether the external files are copied or
// moved (default is copy). When option specifies copy, managing files at
// external_file_path is caller's responsibility. When option specifies a
// move, the call ensures that the specified files at external_file_path are
// deleted on successful return and files are not modified on any error
// return.
// On error return, column family handle returned will be nullptr.
// ColumnFamily will be present on successful return and will not be present
// on error return. ColumnFamily may be present on any crash during this call.
virtual Status CreateColumnFamilyWithImport(
const ColumnFamilyOptions& options, const std::string& column_family_name,
const ImportColumnFamilyOptions& import_options,
const ExportImportFilesMetaData& metadata,
ColumnFamilyHandle** handle);
Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported.
If incoming sequence number is higher than current local sequence number, local sequence
number is updated to reflect this.
Note, as the sst files is are being moved across Column Families, Column Family name in sst file
will no longer match the actual column family on destination DB. The API does not modify Column
Family name or id in the sst files being imported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495
Differential Revision: D16018881
fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 19:22:21 +00:00
|
|
|
using DB::CreateColumnFamilyWithImport;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status CreateColumnFamilyWithImport(
|
Export Import sst files (#5495)
Summary:
Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135
This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469
"Add support for taking snapshot of a column family and creating column family from a given CF snapshot"
We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality.
(1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below.
// Exports all live SST files of a specified Column Family onto export_dir,
// returning SST files information in metadata.
// - SST files will be created as hard links when the directory specified
// is in the same partition as the db directory, copied otherwise.
// - export_dir should not already exist and will be created by this API.
// - Always triggers a flush.
virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
const std::string& export_dir,
ExportImportFilesMetaData** metadata);
Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through
metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by
returning the list of file metadata.
(2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below.
// CreateColumnFamilyWithImport() will create a new column family with
// column_family_name and import external SST files specified in metadata into
// this column family.
// (1) External SST files can be created using SstFileWriter.
// (2) External SST files can be exported from a particular column family in
// an existing DB.
// Option in import_options specifies whether the external files are copied or
// moved (default is copy). When option specifies copy, managing files at
// external_file_path is caller's responsibility. When option specifies a
// move, the call ensures that the specified files at external_file_path are
// deleted on successful return and files are not modified on any error
// return.
// On error return, column family handle returned will be nullptr.
// ColumnFamily will be present on successful return and will not be present
// on error return. ColumnFamily may be present on any crash during this call.
virtual Status CreateColumnFamilyWithImport(
const ColumnFamilyOptions& options, const std::string& column_family_name,
const ImportColumnFamilyOptions& import_options,
const ExportImportFilesMetaData& metadata,
ColumnFamilyHandle** handle);
Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported.
If incoming sequence number is higher than current local sequence number, local sequence
number is updated to reflect this.
Note, as the sst files is are being moved across Column Families, Column Family name in sst file
will no longer match the actual column family on destination DB. The API does not modify Column
Family name or id in the sst files being imported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495
Differential Revision: D16018881
fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 19:22:21 +00:00
|
|
|
const ColumnFamilyOptions& options, const std::string& column_family_name,
|
|
|
|
const ImportColumnFamilyOptions& import_options,
|
2023-06-15 19:25:04 +00:00
|
|
|
const std::vector<const ExportImportFilesMetaData*>& metadatas,
|
Export Import sst files (#5495)
Summary:
Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135
This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469
"Add support for taking snapshot of a column family and creating column family from a given CF snapshot"
We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality.
(1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below.
// Exports all live SST files of a specified Column Family onto export_dir,
// returning SST files information in metadata.
// - SST files will be created as hard links when the directory specified
// is in the same partition as the db directory, copied otherwise.
// - export_dir should not already exist and will be created by this API.
// - Always triggers a flush.
virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
const std::string& export_dir,
ExportImportFilesMetaData** metadata);
Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through
metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by
returning the list of file metadata.
(2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below.
// CreateColumnFamilyWithImport() will create a new column family with
// column_family_name and import external SST files specified in metadata into
// this column family.
// (1) External SST files can be created using SstFileWriter.
// (2) External SST files can be exported from a particular column family in
// an existing DB.
// Option in import_options specifies whether the external files are copied or
// moved (default is copy). When option specifies copy, managing files at
// external_file_path is caller's responsibility. When option specifies a
// move, the call ensures that the specified files at external_file_path are
// deleted on successful return and files are not modified on any error
// return.
// On error return, column family handle returned will be nullptr.
// ColumnFamily will be present on successful return and will not be present
// on error return. ColumnFamily may be present on any crash during this call.
virtual Status CreateColumnFamilyWithImport(
const ColumnFamilyOptions& options, const std::string& column_family_name,
const ImportColumnFamilyOptions& import_options,
const ExportImportFilesMetaData& metadata,
ColumnFamilyHandle** handle);
Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported.
If incoming sequence number is higher than current local sequence number, local sequence
number is updated to reflect this.
Note, as the sst files is are being moved across Column Families, Column Family name in sst file
will no longer match the actual column family on destination DB. The API does not modify Column
Family name or id in the sst files being imported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495
Differential Revision: D16018881
fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 19:22:21 +00:00
|
|
|
ColumnFamilyHandle** handle) override;
|
|
|
|
|
2023-05-18 20:25:01 +00:00
|
|
|
using DB::ClipColumnFamily;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status ClipColumnFamily(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& begin_key,
|
|
|
|
const Slice& end_key) override;
|
2023-05-18 20:25:01 +00:00
|
|
|
|
2020-11-04 04:33:45 +00:00
|
|
|
using DB::VerifyFileChecksums;
|
|
|
|
Status VerifyFileChecksums(const ReadOptions& read_options) override;
|
|
|
|
|
2019-08-16 23:40:09 +00:00
|
|
|
using DB::VerifyChecksum;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
|
2020-11-04 04:33:45 +00:00
|
|
|
// Verify the checksums of files in db. Currently only tables are checked.
|
|
|
|
//
|
|
|
|
// read_options: controls file I/O behavior, e.g. read ahead size while
|
|
|
|
// reading all the live table files.
|
|
|
|
//
|
|
|
|
// use_file_checksum: if false, verify the block checksums of all live table
|
|
|
|
// in db. Otherwise, obtain the file checksums and compare
|
|
|
|
// with the MANIFEST. Currently, file checksums are
|
|
|
|
// recomputed by reading all table files.
|
|
|
|
//
|
|
|
|
// Returns: OK if there is no file whose file or block checksum mismatches.
|
|
|
|
Status VerifyChecksumInternal(const ReadOptions& read_options,
|
|
|
|
bool use_file_checksum);
|
|
|
|
|
2021-02-23 06:07:59 +00:00
|
|
|
Status VerifyFullFileChecksum(const std::string& file_checksum_expected,
|
|
|
|
const std::string& func_name_expected,
|
|
|
|
const std::string& fpath,
|
|
|
|
const ReadOptions& read_options);
|
2019-05-24 20:05:58 +00:00
|
|
|
|
|
|
|
using DB::StartTrace;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status StartTrace(const TraceOptions& options,
|
|
|
|
std::unique_ptr<TraceWriter>&& trace_writer) override;
|
2019-05-24 20:05:58 +00:00
|
|
|
|
|
|
|
using DB::EndTrace;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status EndTrace() override;
|
2019-05-24 20:05:58 +00:00
|
|
|
|
2021-08-12 02:31:44 +00:00
|
|
|
using DB::NewDefaultReplayer;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status NewDefaultReplayer(const std::vector<ColumnFamilyHandle*>& handles,
|
|
|
|
std::unique_ptr<TraceReader>&& reader,
|
|
|
|
std::unique_ptr<Replayer>* replayer) override;
|
2021-08-12 02:31:44 +00:00
|
|
|
|
2019-06-13 22:39:52 +00:00
|
|
|
using DB::StartBlockCacheTrace;
|
|
|
|
Status StartBlockCacheTrace(
|
2022-10-21 19:15:35 +00:00
|
|
|
const TraceOptions& trace_options,
|
2019-06-13 22:39:52 +00:00
|
|
|
std::unique_ptr<TraceWriter>&& trace_writer) override;
|
|
|
|
|
2022-10-21 19:15:35 +00:00
|
|
|
Status StartBlockCacheTrace(
|
|
|
|
const BlockCacheTraceOptions& options,
|
|
|
|
std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override;
|
|
|
|
|
2019-06-13 22:39:52 +00:00
|
|
|
using DB::EndBlockCacheTrace;
|
|
|
|
Status EndBlockCacheTrace() override;
|
|
|
|
|
2020-08-05 01:40:19 +00:00
|
|
|
using DB::StartIOTrace;
|
2021-04-01 20:14:01 +00:00
|
|
|
Status StartIOTrace(const TraceOptions& options,
|
2020-08-05 01:40:19 +00:00
|
|
|
std::unique_ptr<TraceWriter>&& trace_writer) override;
|
|
|
|
|
|
|
|
using DB::EndIOTrace;
|
|
|
|
Status EndIOTrace() override;
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
using DB::GetPropertiesOfAllTables;
|
2024-01-31 21:14:42 +00:00
|
|
|
Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
|
|
|
|
TablePropertiesCollection* props) override;
|
|
|
|
Status GetPropertiesOfTablesInRange(
|
2019-05-24 20:05:58 +00:00
|
|
|
ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
|
|
|
|
TablePropertiesCollection* props) override;
|
|
|
|
|
|
|
|
// ---- End of implementations of the DB interface ----
|
2021-03-15 11:32:24 +00:00
|
|
|
SystemClock* GetSystemClock() const;
|
2019-05-24 20:05:58 +00:00
|
|
|
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 21:22:34 +00:00
|
|
|
struct GetImplOptions {
|
|
|
|
ColumnFamilyHandle* column_family = nullptr;
|
|
|
|
PinnableSlice* value = nullptr;
|
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
2022-08-19 18:51:12 +00:00
|
|
|
PinnableWideColumns* columns = nullptr;
|
2020-03-02 23:58:32 +00:00
|
|
|
std::string* timestamp = nullptr;
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 21:22:34 +00:00
|
|
|
bool* value_found = nullptr;
|
|
|
|
ReadCallback* callback = nullptr;
|
|
|
|
bool* is_blob_index = nullptr;
|
|
|
|
// If true return value associated with key via value pointer else return
|
|
|
|
// all merge operands for key via merge_operands pointer
|
|
|
|
bool get_value = true;
|
|
|
|
// Pointer to an array of size
|
|
|
|
// get_merge_operands_options.expected_max_number_of_operands allocated by
|
|
|
|
// user
|
|
|
|
PinnableSlice* merge_operands = nullptr;
|
|
|
|
GetMergeOperandsOptions* get_merge_operands_options = nullptr;
|
|
|
|
int* number_of_operands = nullptr;
|
|
|
|
};
|
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
Status GetImpl(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
PinnableSlice* value);
|
|
|
|
|
|
|
|
Status GetImpl(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
PinnableSlice* value, std::string* timestamp);
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
// Function that Get and KeyMayExist call with no_io true or false
|
|
|
|
// Note: 'value_found' from KeyMayExist propagates here
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 21:22:34 +00:00
|
|
|
// This function is also called by GetMergeOperands
|
|
|
|
// If get_impl_options.get_value = true get value associated with
|
|
|
|
// get_impl_options.key via get_impl_options.value
|
|
|
|
// If get_impl_options.get_value = false get merge operands associated with
|
|
|
|
// get_impl_options.key via get_impl_options.merge_operands
|
2023-09-15 15:30:44 +00:00
|
|
|
virtual Status GetImpl(const ReadOptions& options, const Slice& key,
|
|
|
|
GetImplOptions& get_impl_options);
|
2019-05-24 20:05:58 +00:00
|
|
|
|
2020-06-18 17:15:16 +00:00
|
|
|
// If `snapshot` == kMaxSequenceNumber, set a recent one inside the file.
|
2019-05-24 20:05:58 +00:00
|
|
|
ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
|
Access DBImpl* and CFD* by CFHImpl* in Iterators (#12395)
Summary:
In the current implementation of iterators, `DBImpl*` and `ColumnFamilyData*` are held in `DBIter` and `ArenaWrappedDBIter` for two purposes: tracing and Refresh() API. With the introduction of a new iterator called MultiCfIterator in PR https://github.com/facebook/rocksdb/issues/12153 , which is a cross-column-family iterator that maintains multiple DBIters as child iterators from a consistent database state, we need to make some changes to the existing implementation. The new iterator will still be exposed through the generic Iterator interface with an additional capability to return AttributeGroups (via `attribute_groups()`) which is a list of wide columns grouped by column family. For more information about AttributeGroup, please refer to previous PRs: https://github.com/facebook/rocksdb/issues/11925 #11943, and https://github.com/facebook/rocksdb/issues/11977.
To be able to return AttributeGroup in the default single CF iterator created, access to `ColumnFamilyHandle*` within `DBIter` is necessary. However, this is not currently available in `DBIter`. Since `DBImpl*` and `ColumnFamilyData*` can be easily accessed via `ColumnFamilyHandleImpl*`, we have decided to replace the pointers to `ColumnFamilyData` and `DBImpl` in `DBIter` with a pointer to `ColumnFamilyHandleImpl`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12395
Test Plan:
# Summary
In the current implementation of iterators, `DBImpl*` and `ColumnFamilyData*` are held in `DBIter` and `ArenaWrappedDBIter` for two purposes: tracing and Refresh() API. With the introduction of a new iterator called MultiCfIterator in PR #12153 , which is a cross-column-family iterator that maintains multiple DBIters as child iterators from a consistent database state, we need to make some changes to the existing implementation. The new iterator will still be exposed through the generic Iterator interface with an additional capability to return AttributeGroups (via `attribute_groups()`) which is a list of wide columns grouped by column family. For more information about AttributeGroup, please refer to previous PRs: #11925 #11943, and #11977.
To be able to return AttributeGroup in the default single CF iterator created, access to `ColumnFamilyHandle*` within `DBIter` is necessary. However, this is not currently available in `DBIter`. Since `DBImpl*` and `ColumnFamilyData*` can be easily accessed via `ColumnFamilyHandleImpl*`, we have decided to replace the pointers to `ColumnFamilyData` and `DBImpl` in `DBIter` with a pointer to `ColumnFamilyHandleImpl`.
# Test Plan
There should be no behavior changes. Existing tests and CI for the correctness tests.
**Test for Perf Regression**
Build
```
$> make -j64 release
```
Setup
```
$> TEST_TMPDIR=/dev/shm/db_bench ./db_bench -benchmarks="filluniquerandom" -key_size=32 -value_size=512 -num=1000000 -compression_type=none
```
Run
```
TEST_TMPDIR=/dev/shm/db_bench ./db_bench -use_existing_db=1 -benchmarks="newiterator,seekrandom" -cache_size=10485760000
```
Before the change
```
DB path: [/dev/shm/db_bench/dbbench]
newiterator : 0.552 micros/op 1810157 ops/sec 0.552 seconds 1000000 operations;
DB path: [/dev/shm/db_bench/dbbench]
seekrandom : 4.502 micros/op 222143 ops/sec 4.502 seconds 1000000 operations; (0 of 1000000 found)
```
After the change
```
DB path: [/dev/shm/db_bench/dbbench]
newiterator : 0.520 micros/op 1924401 ops/sec 0.520 seconds 1000000 operations;
DB path: [/dev/shm/db_bench/dbbench]
seekrandom : 4.532 micros/op 220657 ops/sec 4.532 seconds 1000000 operations; (0 of 1000000 found)
```
Reviewed By: pdillinger
Differential Revision: D54332713
Pulled By: jaykorean
fbshipit-source-id: b28d897ad519e58b1ca82eb068a6319544a4fae5
2024-03-01 18:28:20 +00:00
|
|
|
ColumnFamilyHandleImpl* cfh,
|
|
|
|
SuperVersion* sv, SequenceNumber snapshot,
|
2019-05-24 20:05:58 +00:00
|
|
|
ReadCallback* read_callback,
|
2020-12-05 05:28:26 +00:00
|
|
|
bool expose_blob_index = false,
|
2019-05-24 20:05:58 +00:00
|
|
|
bool allow_refresh = true);
|
|
|
|
|
|
|
|
virtual SequenceNumber GetLastPublishedSequence() const {
|
|
|
|
if (last_seq_same_as_publish_seq_) {
|
|
|
|
return versions_->LastSequence();
|
|
|
|
} else {
|
|
|
|
return versions_->LastPublishedSequence();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// REQUIRES: joined the main write queue if two_write_queues is disabled, and
|
|
|
|
// the second write queue otherwise.
|
|
|
|
virtual void SetLastPublishedSequence(SequenceNumber seq);
|
|
|
|
// Returns LastSequence in last_seq_same_as_publish_seq_
|
|
|
|
// mode and LastAllocatedSequence otherwise. This is useful when visiblility
|
|
|
|
// depends also on data written to the WAL but not to the memtable.
|
|
|
|
SequenceNumber TEST_GetLastVisibleSequence() const;
|
|
|
|
|
2015-05-29 21:36:35 +00:00
|
|
|
// Similar to Write() but will call the callback once on the single write
|
|
|
|
// thread to determine whether it is safe to perform the write.
|
|
|
|
virtual Status WriteWithCallback(const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch,
|
|
|
|
WriteCallback* callback);
|
|
|
|
|
|
|
|
// Returns the sequence number that is guaranteed to be smaller than or equal
|
|
|
|
// to the sequence number of any key that could be inserted into the current
|
|
|
|
// memtables. It can then be assumed that any write with a larger(or equal)
|
|
|
|
// sequence number will be present in this memtable or a later memtable.
|
|
|
|
//
|
|
|
|
// If the earliest sequence number could not be determined,
|
|
|
|
// kMaxSequenceNumber will be returned.
|
|
|
|
//
|
|
|
|
// If include_history=true, will also search Memtables in MemTableList
|
|
|
|
// History.
|
|
|
|
SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
|
|
|
|
bool include_history);
|
|
|
|
|
|
|
|
// For a given key, check to see if there are any records for this key
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-15 23:37:15 +00:00
|
|
|
// in the memtables, including memtable history. If cache_only is false,
|
|
|
|
// SST files will also be checked.
|
|
|
|
//
|
2021-11-15 20:50:42 +00:00
|
|
|
// `key` should NOT have user-defined timestamp appended to user key even if
|
|
|
|
// timestamp is enabled.
|
|
|
|
//
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-15 23:37:15 +00:00
|
|
|
// If a key is found, *found_record_for_key will be set to true and
|
2016-04-28 09:30:44 +00:00
|
|
|
// *seq will be set to the stored sequence number for the latest
|
2021-11-15 20:50:42 +00:00
|
|
|
// operation on this key or kMaxSequenceNumber if unknown. If user-defined
|
|
|
|
// timestamp is enabled for this column family and timestamp is not nullptr,
|
|
|
|
// then *timestamp will be set to the stored timestamp for the latest
|
|
|
|
// operation on this key.
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-15 23:37:15 +00:00
|
|
|
// If no key is found, *found_record_for_key will be set to false.
|
|
|
|
//
|
|
|
|
// Note: If cache_only=false, it is possible for *seq to be set to 0 if
|
|
|
|
// the sequence number has been cleared from the record. If the caller is
|
|
|
|
// holding an active db snapshot, we know the missing sequence must be less
|
|
|
|
// than the snapshot's sequence number (sequence numbers are only cleared
|
|
|
|
// when there are no earlier active snapshots).
|
|
|
|
//
|
|
|
|
// If NotFound is returned and found_record_for_key is set to false, then no
|
|
|
|
// record for this key was found. If the caller is holding an active db
|
|
|
|
// snapshot, we know that no key could have existing after this snapshot
|
|
|
|
// (since we do not compact keys that have an earlier snapshot).
|
|
|
|
//
|
2019-06-11 18:42:19 +00:00
|
|
|
// Only records newer than or at `lower_bound_seq` are guaranteed to be
|
|
|
|
// returned. Memtables and files may not be checked if it only contains data
|
|
|
|
// older than `lower_bound_seq`.
|
|
|
|
//
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-15 23:37:15 +00:00
|
|
|
// Returns OK or NotFound on success,
|
|
|
|
// other status on unexpected error.
|
2016-08-19 22:10:31 +00:00
|
|
|
// TODO(andrewkr): this API need to be aware of range deletion operations
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-15 23:37:15 +00:00
|
|
|
Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
2019-06-11 18:42:19 +00:00
|
|
|
bool cache_only,
|
|
|
|
SequenceNumber lower_bound_seq,
|
2021-11-15 20:50:42 +00:00
|
|
|
SequenceNumber* seq, std::string* timestamp,
|
2017-10-18 00:24:25 +00:00
|
|
|
bool* found_record_for_key,
|
2021-11-15 20:50:42 +00:00
|
|
|
bool* is_blob_index);
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2021-02-19 07:03:49 +00:00
|
|
|
Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key,
|
|
|
|
const Slice& lower_bound, const Slice upper_bound);
|
|
|
|
Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
|
|
|
|
const Slice& lower_bound,
|
|
|
|
const Slice upper_bound);
|
2012-11-26 21:56:45 +00:00
|
|
|
|
2015-12-08 20:25:48 +00:00
|
|
|
// Similar to GetSnapshot(), but also lets the db know that this snapshot
|
|
|
|
// will be used for transaction write-conflict checking. The DB can then
|
|
|
|
// make sure not to compact any keys that would prevent a write-conflict from
|
|
|
|
// being detected.
|
|
|
|
const Snapshot* GetSnapshotForWriteConflictBoundary();
|
|
|
|
|
2014-03-20 21:18:29 +00:00
|
|
|
// checks if all live files exist on file system and that their file sizes
|
|
|
|
// match to our in-memory records
|
|
|
|
virtual Status CheckConsistency();
|
|
|
|
|
2019-04-17 06:29:32 +00:00
|
|
|
// max_file_num_to_ignore allows bottom level compaction to filter out newly
|
|
|
|
// compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
|
|
|
|
// disable the filtering
|
Always allow L0->L1 trivial move during manual compaction (#11375)
Summary:
during manual compaction (CompactRange()), L0->L1 trivial move is disabled when only L0 overlaps with compacting key range (introduced in https://github.com/facebook/rocksdb/issues/7368 to enforce kForce* contract). This can cause large memory usage due to compaction readahead when number of L0 files is large. This PR allows L0->L1 trivial move in this case, and will do a L1 -> L1 intra-level compaction when needed (`bottommost_level_compaction` is kForce*). In brief, consider a DB with only L0 file, and user calls CompactRange(kForce, nullptr, nullptr),
- before this PR, RocksDB does a L0 -> L1 compaction (disallow trivial move),
- after this PR, RocksDB does a L0 -> L1 compaction (allow trivial move), and a L1 -> L1 compaction.
Users can use kForceOptimized to avoid this extra L1->L1 compaction overhead when L0s are overlapping and cannot be trivial moved.
This PR also fixed a bug (see previous discussion in https://github.com/facebook/rocksdb/issues/11041) where `final_output_level` of a manual compaction can be miscalculated when `level_compaction_dynamic_level_bytes=true`. This bug could cause incorrect level being moved when CompactRangeOptions::change_level is specified.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11375
Test Plan: - Added new unit tests to test that L0 -> L1 compaction allows trivial move and L1 -> L1 compaction is done when needed.
Reviewed By: ajkr
Differential Revision: D44943518
Pulled By: cbi42
fbshipit-source-id: e9fb770d17b163c18a623e1d1bd6b81159192708
2023-04-20 18:10:48 +00:00
|
|
|
// If `final_output_level` is not nullptr, it is set to manual compaction's
|
|
|
|
// output level if returned status is OK, and it may or may not be set to
|
|
|
|
// manual compaction's output level if returned status is not OK.
|
2014-02-01 00:45:20 +00:00
|
|
|
Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
|
2019-04-17 06:29:32 +00:00
|
|
|
int output_level,
|
|
|
|
const CompactRangeOptions& compact_range_options,
|
|
|
|
const Slice* begin, const Slice* end,
|
|
|
|
bool exclusive, bool disallow_trivial_move,
|
2022-03-12 00:13:23 +00:00
|
|
|
uint64_t max_file_num_to_ignore,
|
Always allow L0->L1 trivial move during manual compaction (#11375)
Summary:
during manual compaction (CompactRange()), L0->L1 trivial move is disabled when only L0 overlaps with compacting key range (introduced in https://github.com/facebook/rocksdb/issues/7368 to enforce kForce* contract). This can cause large memory usage due to compaction readahead when number of L0 files is large. This PR allows L0->L1 trivial move in this case, and will do a L1 -> L1 intra-level compaction when needed (`bottommost_level_compaction` is kForce*). In brief, consider a DB with only L0 file, and user calls CompactRange(kForce, nullptr, nullptr),
- before this PR, RocksDB does a L0 -> L1 compaction (disallow trivial move),
- after this PR, RocksDB does a L0 -> L1 compaction (allow trivial move), and a L1 -> L1 compaction.
Users can use kForceOptimized to avoid this extra L1->L1 compaction overhead when L0s are overlapping and cannot be trivial moved.
This PR also fixed a bug (see previous discussion in https://github.com/facebook/rocksdb/issues/11041) where `final_output_level` of a manual compaction can be miscalculated when `level_compaction_dynamic_level_bytes=true`. This bug could cause incorrect level being moved when CompactRangeOptions::change_level is specified.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11375
Test Plan: - Added new unit tests to test that L0 -> L1 compaction allows trivial move and L1 -> L1 compaction is done when needed.
Reviewed By: ajkr
Differential Revision: D44943518
Pulled By: cbi42
fbshipit-source-id: e9fb770d17b163c18a623e1d1bd6b81159192708
2023-04-20 18:10:48 +00:00
|
|
|
const std::string& trim_ts,
|
|
|
|
int* final_output_level = nullptr);
|
2014-01-15 00:19:09 +00:00
|
|
|
|
2015-10-13 00:22:37 +00:00
|
|
|
// Return an internal iterator over the current state of the database.
|
|
|
|
// The keys of this iterator are internal keys (see format.h).
|
|
|
|
// The returned iterator should be deleted when no longer needed.
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
// If allow_unprepared_value is true, the returned iterator may defer reading
|
|
|
|
// the value and so will require PrepareValue() to be called before value();
|
|
|
|
// allow_unprepared_value = false is convenient when this optimization is not
|
|
|
|
// useful, e.g. when reading the whole column family.
|
Skip swaths of range tombstone covered keys in merging iterator (2022 edition) (#10449)
Summary:
Delete range logic is moved from `DBIter` to `MergingIterator`, and `MergingIterator` will seek to the end of a range deletion if possible instead of scanning through each key and check with `RangeDelAggregator`.
With the invariant that a key in level L (consider memtable as the first level, each immutable and L0 as a separate level) has a larger sequence number than all keys in any level >L, a range tombstone `[start, end)` from level L covers all keys in its range in any level >L. This property motivates optimizations in iterator:
- in `Seek(target)`, if level L has a range tombstone `[start, end)` that covers `target.UserKey`, then for all levels > L, we can do Seek() on `end` instead of `target` to skip some range tombstone covered keys.
- in `Next()/Prev()`, if the current key is covered by a range tombstone `[start, end)` from level L, we can do `Seek` to `end` for all levels > L.
This PR implements the above optimizations in `MergingIterator`. As all range tombstone covered keys are now skipped in `MergingIterator`, the range tombstone logic is removed from `DBIter`. The idea in this PR is similar to https://github.com/facebook/rocksdb/issues/7317, but this PR leaves `InternalIterator` interface mostly unchanged. **Credit**: the cascading seek optimization and the sentinel key (discussed below) are inspired by [Pebble](https://github.com/cockroachdb/pebble/blob/master/merging_iter.go) and suggested by ajkr in https://github.com/facebook/rocksdb/issues/7317. The two optimizations are mostly implemented in `SeekImpl()/SeekForPrevImpl()` and `IsNextDeleted()/IsPrevDeleted()` in `merging_iterator.cc`. See comments for each method for more detail.
One notable change is that the minHeap/maxHeap used by `MergingIterator` now contains range tombstone end keys besides point key iterators. This helps to reduce the number of key comparisons. For example, for a range tombstone `[start, end)`, a `start` and an `end` `HeapItem` are inserted into the heap. When a `HeapItem` for range tombstone start key is popped from the minHeap, we know this range tombstone becomes "active" in the sense that, before the range tombstone's end key is popped from the minHeap, all the keys popped from this heap is covered by the range tombstone's internal key range `[start, end)`.
Another major change, *delete range sentinel key*, is made to `LevelIterator`. Before this PR, when all point keys in an SST file are iterated through in `MergingIterator`, a level iterator would advance to the next SST file in its level. In the case when an SST file has a range tombstone that covers keys beyond the SST file's last point key, advancing to the next SST file would lose this range tombstone. Consequently, `MergingIterator` could return keys that should have been deleted by some range tombstone. We prevent this by pretending that file boundaries in each SST file are sentinel keys. A `LevelIterator` now only advance the file iterator once the sentinel key is processed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10449
Test Plan:
- Added many unit tests in db_range_del_test
- Stress test: `./db_stress --readpercent=5 --prefixpercent=19 --writepercent=20 -delpercent=10 --iterpercent=44 --delrangepercent=2`
- Additional iterator stress test is added to verify against iterators against expected state: https://github.com/facebook/rocksdb/issues/10538. This is based on ajkr's previous attempt https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913.
```
python3 ./tools/db_crashtest.py blackbox --simple --write_buffer_size=524288 --target_file_size_base=524288 --max_bytes_for_level_base=2097152 --compression_type=none --max_background_compactions=8 --value_size_mult=33 --max_key=5000000 --interval=10 --duration=7200 --delrangepercent=3 --delpercent=9 --iterpercent=25 --writepercent=60 --readpercent=3 --prefixpercent=0 --num_iterations=1000 --range_deletion_width=100 --verify_iterator_with_expected_state_one_in=1
```
- Performance benchmark: I used a similar setup as in the blog [post](http://rocksdb.org/blog/2018/11/21/delete-range.html) that introduced DeleteRange, "a database with 5 million data keys, and 10000 range tombstones (ignoring those dropped during compaction) that were written in regular intervals after 4.5 million data keys were written". As expected, the performance with this PR depends on the range tombstone width.
```
# Setup:
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=fillrandom --writes=4500000 --num=5000000
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=overwrite --writes=500000 --num=5000000 --use_existing_db=true --writes_per_range_tombstone=50
# Scan entire DB
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=readseq[-X5] --use_existing_db=true --num=5000000 --disable_auto_compactions=true
# Short range scan (10 Next())
TEST_TMPDIR=/dev/shm/width-100/ ./db_bench_main --benchmarks=seekrandom[-X5] --use_existing_db=true --num=500000 --reads=100000 --seek_nexts=10 --disable_auto_compactions=true
# Long range scan(1000 Next())
TEST_TMPDIR=/dev/shm/width-100/ ./db_bench_main --benchmarks=seekrandom[-X5] --use_existing_db=true --num=500000 --reads=2500 --seek_nexts=1000 --disable_auto_compactions=true
```
Avg over of 10 runs (some slower tests had fews runs):
For the first column (tombstone), 0 means no range tombstone, 100-10000 means width of the 10k range tombstones, and 1 means there is a single range tombstone in the entire DB (width is 1000). The 1 tombstone case is to test regression when there's very few range tombstones in the DB, as no range tombstone is likely to take a different code path than with range tombstones.
- Scan entire DB
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |2525600 (± 43564) |2486917 (± 33698) |-1.53% |
| 100 |1853835 (± 24736) |2073884 (± 32176) |+11.87% |
| 1000 |422415 (± 7466) |1115801 (± 22781) |+164.15% |
| 10000 |22384 (± 227) |227919 (± 6647) |+918.22% |
| 1 range tombstone |2176540 (± 39050) |2434954 (± 24563) |+11.87% |
- Short range scan
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |35398 (± 533) |35338 (± 569) |-0.17% |
| 100 |28276 (± 664) |31684 (± 331) |+12.05% |
| 1000 |7637 (± 77) |25422 (± 277) |+232.88% |
| 10000 |1367 |28667 |+1997.07% |
| 1 range tombstone |32618 (± 581) |32748 (± 506) |+0.4% |
- Long range scan
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |2262 (± 33) |2353 (± 20) |+4.02% |
| 100 |1696 (± 26) |1926 (± 18) |+13.56% |
| 1000 |410 (± 6) |1255 (± 29) |+206.1% |
| 10000 |25 |414 |+1556.0% |
| 1 range tombstone |1957 (± 30) |2185 (± 44) |+11.65% |
- Microbench does not show significant regression: https://gist.github.com/cbi42/59f280f85a59b678e7e5d8561e693b61
Reviewed By: ajkr
Differential Revision: D38450331
Pulled By: cbi42
fbshipit-source-id: b5ef12e8d8c289ed2e163ccdf277f5039b511fca
2022-09-02 16:51:19 +00:00
|
|
|
//
|
|
|
|
// read_options.ignore_range_deletions determines whether range tombstones are
|
|
|
|
// processed in the returned interator internally, i.e., whether range
|
|
|
|
// tombstone covered keys are in this iterator's output.
|
2020-08-03 22:21:56 +00:00
|
|
|
// @param read_options Must outlive the returned iterator.
|
2015-10-12 22:06:38 +00:00
|
|
|
InternalIterator* NewInternalIterator(
|
Skip swaths of range tombstone covered keys in merging iterator (2022 edition) (#10449)
Summary:
Delete range logic is moved from `DBIter` to `MergingIterator`, and `MergingIterator` will seek to the end of a range deletion if possible instead of scanning through each key and check with `RangeDelAggregator`.
With the invariant that a key in level L (consider memtable as the first level, each immutable and L0 as a separate level) has a larger sequence number than all keys in any level >L, a range tombstone `[start, end)` from level L covers all keys in its range in any level >L. This property motivates optimizations in iterator:
- in `Seek(target)`, if level L has a range tombstone `[start, end)` that covers `target.UserKey`, then for all levels > L, we can do Seek() on `end` instead of `target` to skip some range tombstone covered keys.
- in `Next()/Prev()`, if the current key is covered by a range tombstone `[start, end)` from level L, we can do `Seek` to `end` for all levels > L.
This PR implements the above optimizations in `MergingIterator`. As all range tombstone covered keys are now skipped in `MergingIterator`, the range tombstone logic is removed from `DBIter`. The idea in this PR is similar to https://github.com/facebook/rocksdb/issues/7317, but this PR leaves `InternalIterator` interface mostly unchanged. **Credit**: the cascading seek optimization and the sentinel key (discussed below) are inspired by [Pebble](https://github.com/cockroachdb/pebble/blob/master/merging_iter.go) and suggested by ajkr in https://github.com/facebook/rocksdb/issues/7317. The two optimizations are mostly implemented in `SeekImpl()/SeekForPrevImpl()` and `IsNextDeleted()/IsPrevDeleted()` in `merging_iterator.cc`. See comments for each method for more detail.
One notable change is that the minHeap/maxHeap used by `MergingIterator` now contains range tombstone end keys besides point key iterators. This helps to reduce the number of key comparisons. For example, for a range tombstone `[start, end)`, a `start` and an `end` `HeapItem` are inserted into the heap. When a `HeapItem` for range tombstone start key is popped from the minHeap, we know this range tombstone becomes "active" in the sense that, before the range tombstone's end key is popped from the minHeap, all the keys popped from this heap is covered by the range tombstone's internal key range `[start, end)`.
Another major change, *delete range sentinel key*, is made to `LevelIterator`. Before this PR, when all point keys in an SST file are iterated through in `MergingIterator`, a level iterator would advance to the next SST file in its level. In the case when an SST file has a range tombstone that covers keys beyond the SST file's last point key, advancing to the next SST file would lose this range tombstone. Consequently, `MergingIterator` could return keys that should have been deleted by some range tombstone. We prevent this by pretending that file boundaries in each SST file are sentinel keys. A `LevelIterator` now only advance the file iterator once the sentinel key is processed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10449
Test Plan:
- Added many unit tests in db_range_del_test
- Stress test: `./db_stress --readpercent=5 --prefixpercent=19 --writepercent=20 -delpercent=10 --iterpercent=44 --delrangepercent=2`
- Additional iterator stress test is added to verify against iterators against expected state: https://github.com/facebook/rocksdb/issues/10538. This is based on ajkr's previous attempt https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913.
```
python3 ./tools/db_crashtest.py blackbox --simple --write_buffer_size=524288 --target_file_size_base=524288 --max_bytes_for_level_base=2097152 --compression_type=none --max_background_compactions=8 --value_size_mult=33 --max_key=5000000 --interval=10 --duration=7200 --delrangepercent=3 --delpercent=9 --iterpercent=25 --writepercent=60 --readpercent=3 --prefixpercent=0 --num_iterations=1000 --range_deletion_width=100 --verify_iterator_with_expected_state_one_in=1
```
- Performance benchmark: I used a similar setup as in the blog [post](http://rocksdb.org/blog/2018/11/21/delete-range.html) that introduced DeleteRange, "a database with 5 million data keys, and 10000 range tombstones (ignoring those dropped during compaction) that were written in regular intervals after 4.5 million data keys were written". As expected, the performance with this PR depends on the range tombstone width.
```
# Setup:
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=fillrandom --writes=4500000 --num=5000000
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=overwrite --writes=500000 --num=5000000 --use_existing_db=true --writes_per_range_tombstone=50
# Scan entire DB
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=readseq[-X5] --use_existing_db=true --num=5000000 --disable_auto_compactions=true
# Short range scan (10 Next())
TEST_TMPDIR=/dev/shm/width-100/ ./db_bench_main --benchmarks=seekrandom[-X5] --use_existing_db=true --num=500000 --reads=100000 --seek_nexts=10 --disable_auto_compactions=true
# Long range scan(1000 Next())
TEST_TMPDIR=/dev/shm/width-100/ ./db_bench_main --benchmarks=seekrandom[-X5] --use_existing_db=true --num=500000 --reads=2500 --seek_nexts=1000 --disable_auto_compactions=true
```
Avg over of 10 runs (some slower tests had fews runs):
For the first column (tombstone), 0 means no range tombstone, 100-10000 means width of the 10k range tombstones, and 1 means there is a single range tombstone in the entire DB (width is 1000). The 1 tombstone case is to test regression when there's very few range tombstones in the DB, as no range tombstone is likely to take a different code path than with range tombstones.
- Scan entire DB
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |2525600 (± 43564) |2486917 (± 33698) |-1.53% |
| 100 |1853835 (± 24736) |2073884 (± 32176) |+11.87% |
| 1000 |422415 (± 7466) |1115801 (± 22781) |+164.15% |
| 10000 |22384 (± 227) |227919 (± 6647) |+918.22% |
| 1 range tombstone |2176540 (± 39050) |2434954 (± 24563) |+11.87% |
- Short range scan
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |35398 (± 533) |35338 (± 569) |-0.17% |
| 100 |28276 (± 664) |31684 (± 331) |+12.05% |
| 1000 |7637 (± 77) |25422 (± 277) |+232.88% |
| 10000 |1367 |28667 |+1997.07% |
| 1 range tombstone |32618 (± 581) |32748 (± 506) |+0.4% |
- Long range scan
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |2262 (± 33) |2353 (± 20) |+4.02% |
| 100 |1696 (± 26) |1926 (± 18) |+13.56% |
| 1000 |410 (± 6) |1255 (± 29) |+206.1% |
| 10000 |25 |414 |+1556.0% |
| 1 range tombstone |1957 (± 30) |2185 (± 44) |+11.65% |
- Microbench does not show significant regression: https://gist.github.com/cbi42/59f280f85a59b678e7e5d8561e693b61
Reviewed By: ajkr
Differential Revision: D38450331
Pulled By: cbi42
fbshipit-source-id: b5ef12e8d8c289ed2e163ccdf277f5039b511fca
2022-09-02 16:51:19 +00:00
|
|
|
const ReadOptions& read_options, Arena* arena, SequenceNumber sequence,
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
ColumnFamilyHandle* column_family = nullptr,
|
|
|
|
bool allow_unprepared_value = false);
|
2015-10-13 00:22:37 +00:00
|
|
|
|
Skip swaths of range tombstone covered keys in merging iterator (2022 edition) (#10449)
Summary:
Delete range logic is moved from `DBIter` to `MergingIterator`, and `MergingIterator` will seek to the end of a range deletion if possible instead of scanning through each key and check with `RangeDelAggregator`.
With the invariant that a key in level L (consider memtable as the first level, each immutable and L0 as a separate level) has a larger sequence number than all keys in any level >L, a range tombstone `[start, end)` from level L covers all keys in its range in any level >L. This property motivates optimizations in iterator:
- in `Seek(target)`, if level L has a range tombstone `[start, end)` that covers `target.UserKey`, then for all levels > L, we can do Seek() on `end` instead of `target` to skip some range tombstone covered keys.
- in `Next()/Prev()`, if the current key is covered by a range tombstone `[start, end)` from level L, we can do `Seek` to `end` for all levels > L.
This PR implements the above optimizations in `MergingIterator`. As all range tombstone covered keys are now skipped in `MergingIterator`, the range tombstone logic is removed from `DBIter`. The idea in this PR is similar to https://github.com/facebook/rocksdb/issues/7317, but this PR leaves `InternalIterator` interface mostly unchanged. **Credit**: the cascading seek optimization and the sentinel key (discussed below) are inspired by [Pebble](https://github.com/cockroachdb/pebble/blob/master/merging_iter.go) and suggested by ajkr in https://github.com/facebook/rocksdb/issues/7317. The two optimizations are mostly implemented in `SeekImpl()/SeekForPrevImpl()` and `IsNextDeleted()/IsPrevDeleted()` in `merging_iterator.cc`. See comments for each method for more detail.
One notable change is that the minHeap/maxHeap used by `MergingIterator` now contains range tombstone end keys besides point key iterators. This helps to reduce the number of key comparisons. For example, for a range tombstone `[start, end)`, a `start` and an `end` `HeapItem` are inserted into the heap. When a `HeapItem` for range tombstone start key is popped from the minHeap, we know this range tombstone becomes "active" in the sense that, before the range tombstone's end key is popped from the minHeap, all the keys popped from this heap is covered by the range tombstone's internal key range `[start, end)`.
Another major change, *delete range sentinel key*, is made to `LevelIterator`. Before this PR, when all point keys in an SST file are iterated through in `MergingIterator`, a level iterator would advance to the next SST file in its level. In the case when an SST file has a range tombstone that covers keys beyond the SST file's last point key, advancing to the next SST file would lose this range tombstone. Consequently, `MergingIterator` could return keys that should have been deleted by some range tombstone. We prevent this by pretending that file boundaries in each SST file are sentinel keys. A `LevelIterator` now only advance the file iterator once the sentinel key is processed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10449
Test Plan:
- Added many unit tests in db_range_del_test
- Stress test: `./db_stress --readpercent=5 --prefixpercent=19 --writepercent=20 -delpercent=10 --iterpercent=44 --delrangepercent=2`
- Additional iterator stress test is added to verify against iterators against expected state: https://github.com/facebook/rocksdb/issues/10538. This is based on ajkr's previous attempt https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913.
```
python3 ./tools/db_crashtest.py blackbox --simple --write_buffer_size=524288 --target_file_size_base=524288 --max_bytes_for_level_base=2097152 --compression_type=none --max_background_compactions=8 --value_size_mult=33 --max_key=5000000 --interval=10 --duration=7200 --delrangepercent=3 --delpercent=9 --iterpercent=25 --writepercent=60 --readpercent=3 --prefixpercent=0 --num_iterations=1000 --range_deletion_width=100 --verify_iterator_with_expected_state_one_in=1
```
- Performance benchmark: I used a similar setup as in the blog [post](http://rocksdb.org/blog/2018/11/21/delete-range.html) that introduced DeleteRange, "a database with 5 million data keys, and 10000 range tombstones (ignoring those dropped during compaction) that were written in regular intervals after 4.5 million data keys were written". As expected, the performance with this PR depends on the range tombstone width.
```
# Setup:
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=fillrandom --writes=4500000 --num=5000000
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=overwrite --writes=500000 --num=5000000 --use_existing_db=true --writes_per_range_tombstone=50
# Scan entire DB
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=readseq[-X5] --use_existing_db=true --num=5000000 --disable_auto_compactions=true
# Short range scan (10 Next())
TEST_TMPDIR=/dev/shm/width-100/ ./db_bench_main --benchmarks=seekrandom[-X5] --use_existing_db=true --num=500000 --reads=100000 --seek_nexts=10 --disable_auto_compactions=true
# Long range scan(1000 Next())
TEST_TMPDIR=/dev/shm/width-100/ ./db_bench_main --benchmarks=seekrandom[-X5] --use_existing_db=true --num=500000 --reads=2500 --seek_nexts=1000 --disable_auto_compactions=true
```
Avg over of 10 runs (some slower tests had fews runs):
For the first column (tombstone), 0 means no range tombstone, 100-10000 means width of the 10k range tombstones, and 1 means there is a single range tombstone in the entire DB (width is 1000). The 1 tombstone case is to test regression when there's very few range tombstones in the DB, as no range tombstone is likely to take a different code path than with range tombstones.
- Scan entire DB
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |2525600 (± 43564) |2486917 (± 33698) |-1.53% |
| 100 |1853835 (± 24736) |2073884 (± 32176) |+11.87% |
| 1000 |422415 (± 7466) |1115801 (± 22781) |+164.15% |
| 10000 |22384 (± 227) |227919 (± 6647) |+918.22% |
| 1 range tombstone |2176540 (± 39050) |2434954 (± 24563) |+11.87% |
- Short range scan
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |35398 (± 533) |35338 (± 569) |-0.17% |
| 100 |28276 (± 664) |31684 (± 331) |+12.05% |
| 1000 |7637 (± 77) |25422 (± 277) |+232.88% |
| 10000 |1367 |28667 |+1997.07% |
| 1 range tombstone |32618 (± 581) |32748 (± 506) |+0.4% |
- Long range scan
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |2262 (± 33) |2353 (± 20) |+4.02% |
| 100 |1696 (± 26) |1926 (± 18) |+13.56% |
| 1000 |410 (± 6) |1255 (± 29) |+206.1% |
| 10000 |25 |414 |+1556.0% |
| 1 range tombstone |1957 (± 30) |2185 (± 44) |+11.65% |
- Microbench does not show significant regression: https://gist.github.com/cbi42/59f280f85a59b678e7e5d8561e693b61
Reviewed By: ajkr
Differential Revision: D38450331
Pulled By: cbi42
fbshipit-source-id: b5ef12e8d8c289ed2e163ccdf277f5039b511fca
2022-09-02 16:51:19 +00:00
|
|
|
// Note: to support DB iterator refresh, memtable range tombstones in the
|
|
|
|
// underlying merging iterator needs to be refreshed. If db_iter is not
|
|
|
|
// nullptr, db_iter->SetMemtableRangetombstoneIter() is called with the
|
|
|
|
// memtable range tombstone iterator used by the underlying merging iterator.
|
|
|
|
// This range tombstone iterator can be refreshed later by db_iter.
|
|
|
|
// @param read_options Must outlive the returned iterator.
|
|
|
|
InternalIterator* NewInternalIterator(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyData* cfd,
|
|
|
|
SuperVersion* super_version,
|
|
|
|
Arena* arena, SequenceNumber sequence,
|
|
|
|
bool allow_unprepared_value,
|
|
|
|
ArenaWrappedDBIter* db_iter = nullptr);
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
LogsWithPrepTracker* logs_with_prep_tracker() {
|
|
|
|
return &logs_with_prep_tracker_;
|
|
|
|
}
|
|
|
|
|
2017-05-24 18:25:38 +00:00
|
|
|
struct BGJobLimits {
|
|
|
|
int max_flushes;
|
|
|
|
int max_compactions;
|
|
|
|
};
|
|
|
|
// Returns maximum background flushes and compactions allowed to be scheduled
|
|
|
|
BGJobLimits GetBGJobLimits() const;
|
|
|
|
// Need a static version that can be called during SanitizeOptions().
|
|
|
|
static BGJobLimits GetBGJobLimits(int max_background_flushes,
|
|
|
|
int max_background_compactions,
|
|
|
|
int max_background_jobs,
|
|
|
|
bool parallelize_compactions);
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
2016-01-28 19:56:16 +00:00
|
|
|
|
2016-08-04 00:42:06 +00:00
|
|
|
// move logs pending closing from job_context to the DB queue and
|
|
|
|
// schedule a purge
|
|
|
|
void ScheduleBgLogWriterClose(JobContext* job_context);
|
|
|
|
|
2016-12-28 19:53:29 +00:00
|
|
|
uint64_t MinLogNumberToKeep();
|
|
|
|
|
2018-11-06 03:28:21 +00:00
|
|
|
// Returns the lower bound file number for SSTs that won't be deleted, even if
|
|
|
|
// they're obsolete. This lower bound is used internally to prevent newly
|
|
|
|
// created flush/compaction output files from being deleted before they're
|
|
|
|
// installed. This technique avoids the need for tracking the exact numbers of
|
|
|
|
// files pending creation, although it prevents more files than necessary from
|
|
|
|
// being deleted.
|
|
|
|
uint64_t MinObsoleteSstNumberToKeep();
|
|
|
|
|
2023-06-13 22:52:45 +00:00
|
|
|
uint64_t GetObsoleteSstFilesSize();
|
|
|
|
|
2013-11-15 02:03:57 +00:00
|
|
|
// Returns the list of live files in 'live' and the list
|
2013-12-31 02:33:57 +00:00
|
|
|
// of all files in the filesystem in 'candidate_files'.
|
2013-11-15 02:03:57 +00:00
|
|
|
// If force == false and the last call was less than
|
2014-09-05 18:48:17 +00:00
|
|
|
// db_options_.delete_obsolete_files_period_micros microseconds ago,
|
2014-10-28 18:54:33 +00:00
|
|
|
// it will not fill up the job_context
|
|
|
|
void FindObsoleteFiles(JobContext* job_context, bool force,
|
2013-11-15 02:03:57 +00:00
|
|
|
bool no_full_scan = false);
|
|
|
|
|
|
|
|
// Diffs the files listed in filenames and those that do not
|
2018-03-08 18:18:34 +00:00
|
|
|
// belong to live files are possibly removed. Also, removes all the
|
2013-11-15 02:03:57 +00:00
|
|
|
// files in sst_delete_files and log_delete_files.
|
|
|
|
// It is not necessary to hold the mutex when invoking this method.
|
2018-01-18 01:37:10 +00:00
|
|
|
// If FindObsoleteFiles() was run, we need to also run
|
|
|
|
// PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
|
2018-04-06 02:49:06 +00:00
|
|
|
void PurgeObsoleteFiles(JobContext& background_contet,
|
2016-06-22 01:41:23 +00:00
|
|
|
bool schedule_only = false);
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
// Schedule a background job to actually delete obsolete files.
|
2016-06-22 01:41:23 +00:00
|
|
|
void SchedulePurge();
|
2013-11-15 02:03:57 +00:00
|
|
|
|
2014-12-06 00:12:10 +00:00
|
|
|
const SnapshotList& snapshots() const { return snapshots_; }
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
// load list of snapshots to `snap_vector` that is no newer than `max_seq`
|
|
|
|
// in ascending order.
|
|
|
|
// `oldest_write_conflict_snapshot` is filled with the oldest snapshot
|
|
|
|
// which satisfies SnapshotImpl.is_write_conflict_boundary_ = true.
|
2019-05-04 00:26:20 +00:00
|
|
|
void LoadSnapshots(std::vector<SequenceNumber>* snap_vector,
|
|
|
|
SequenceNumber* oldest_write_conflict_snapshot,
|
|
|
|
const SequenceNumber& max_seq) const {
|
|
|
|
InstrumentedMutexLock l(mutex());
|
|
|
|
snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq);
|
|
|
|
}
|
|
|
|
|
2016-10-18 20:19:26 +00:00
|
|
|
const ImmutableDBOptions& immutable_db_options() const {
|
|
|
|
return immutable_db_options_;
|
|
|
|
}
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
// Cancel all background jobs, including flush, compaction, background
|
|
|
|
// purging, stats dumping threads, etc. If `wait` = true, wait for the
|
|
|
|
// running jobs to abort or finish before returning. Otherwise, only
|
|
|
|
// sends the signals.
|
2015-05-05 23:54:47 +00:00
|
|
|
void CancelAllBackgroundWork(bool wait);
|
2015-03-11 17:31:02 +00:00
|
|
|
|
2015-05-29 21:36:35 +00:00
|
|
|
// Find Super version and reference it. Based on options, it might return
|
|
|
|
// the thread local cached one.
|
|
|
|
// Call ReturnAndCleanupSuperVersion() when it is no longer needed.
|
|
|
|
SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
|
|
|
|
|
|
|
|
// Similar to the previous function but looks up based on a column family id.
|
|
|
|
// nullptr will be returned if this column family no longer exists.
|
|
|
|
// REQUIRED: this function should only be called on the write thread or if the
|
|
|
|
// mutex is held.
|
|
|
|
SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
|
|
|
|
|
2017-10-11 21:48:28 +00:00
|
|
|
// Un-reference the super version and clean it up if it is the last reference.
|
|
|
|
void CleanupSuperVersion(SuperVersion* sv);
|
|
|
|
|
2015-05-29 21:36:35 +00:00
|
|
|
// Un-reference the super version and return it to thread local cache if
|
|
|
|
// needed. If it is the last reference of the super version. Clean it up
|
|
|
|
// after un-referencing it.
|
|
|
|
void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
|
|
|
|
|
|
|
|
// Similar to the previous function but looks up based on a column family id.
|
|
|
|
// nullptr will be returned if this column family no longer exists.
|
|
|
|
// REQUIRED: this function should only be called on the write thread.
|
|
|
|
void ReturnAndCleanupSuperVersion(uint32_t colun_family_id, SuperVersion* sv);
|
|
|
|
|
|
|
|
// REQUIRED: this function should only be called on the write thread or if the
|
|
|
|
// mutex is held. Return value only valid until next call to this function or
|
|
|
|
// mutex is released.
|
|
|
|
ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
|
Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions. This diff contains the api changes discussed in D38913. This diff is pretty large, so let me know if people would prefer to meet up to discuss it.
MyRocks folks: please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.
Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint(). After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex. We can then decide which route is preferable.
Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.
Test Plan: Unit tests, db_bench parallel testing.
Reviewers: igor, rven, sdong, yhchiang, yoshinorim
Reviewed By: sdong
Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D40869
2015-05-26 00:37:33 +00:00
|
|
|
|
2017-05-10 21:54:35 +00:00
|
|
|
// Same as above, should called without mutex held and not on write thread.
|
2018-10-08 21:22:06 +00:00
|
|
|
std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
|
|
|
|
uint32_t column_family_id);
|
2017-05-10 21:54:35 +00:00
|
|
|
|
2015-10-17 07:16:36 +00:00
|
|
|
// Returns the number of currently running flushes.
|
|
|
|
// REQUIREMENT: mutex_ must be held when calling this function.
|
|
|
|
int num_running_flushes() {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
return num_running_flushes_;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the number of currently running compactions.
|
|
|
|
// REQUIREMENT: mutex_ must be held when calling this function.
|
|
|
|
int num_running_compactions() {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
return num_running_compactions_;
|
|
|
|
}
|
|
|
|
|
2017-03-29 18:42:56 +00:00
|
|
|
const WriteController& write_controller() { return write_controller_; }
|
|
|
|
|
2016-04-18 18:11:51 +00:00
|
|
|
// hollow transactions shell used for recovery.
|
|
|
|
// these will then be passed to TransactionDB so that
|
|
|
|
// locks can be reacquired before writing can resume.
|
|
|
|
struct RecoveredTransaction {
|
|
|
|
std::string name_;
|
2018-07-07 00:17:36 +00:00
|
|
|
bool unprepared_;
|
|
|
|
|
|
|
|
struct BatchInfo {
|
|
|
|
uint64_t log_number_;
|
|
|
|
// TODO(lth): For unprepared, the memory usage here can be big for
|
|
|
|
// unprepared transactions. This is only useful for rollbacks, and we
|
|
|
|
// can in theory just keep keyset for that.
|
|
|
|
WriteBatch* batch_;
|
|
|
|
// Number of sub-batches. A new sub-batch is created if txn attempts to
|
|
|
|
// insert a duplicate key,seq to memtable. This is currently used in
|
|
|
|
// WritePreparedTxn/WriteUnpreparedTxn.
|
|
|
|
size_t batch_cnt_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// This maps the seq of the first key in the batch to BatchInfo, which
|
|
|
|
// contains WriteBatch and other information relevant to the batch.
|
|
|
|
//
|
|
|
|
// For WriteUnprepared, batches_ can have size greater than 1, but for
|
|
|
|
// other write policies, it must be of size 1.
|
|
|
|
std::map<SequenceNumber, BatchInfo> batches_;
|
|
|
|
|
2016-04-18 18:11:51 +00:00
|
|
|
explicit RecoveredTransaction(const uint64_t log, const std::string& name,
|
2018-03-05 18:48:29 +00:00
|
|
|
WriteBatch* batch, SequenceNumber seq,
|
2018-07-07 00:17:36 +00:00
|
|
|
size_t batch_cnt, bool unprepared)
|
|
|
|
: name_(name), unprepared_(unprepared) {
|
|
|
|
batches_[seq] = {log, batch, batch_cnt};
|
|
|
|
}
|
|
|
|
|
|
|
|
~RecoveredTransaction() {
|
|
|
|
for (auto& it : batches_) {
|
|
|
|
delete it.second.batch_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
|
|
|
|
size_t batch_cnt, bool unprepared) {
|
|
|
|
assert(batches_.count(seq) == 0);
|
|
|
|
batches_[seq] = {log_number, batch, batch_cnt};
|
|
|
|
// Prior state must be unprepared, since the prepare batch must be the
|
|
|
|
// last batch.
|
|
|
|
assert(unprepared_);
|
|
|
|
unprepared_ = unprepared;
|
|
|
|
}
|
2016-04-18 18:11:51 +00:00
|
|
|
};
|
|
|
|
|
2016-09-23 23:34:04 +00:00
|
|
|
bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
|
2016-04-18 18:11:51 +00:00
|
|
|
|
2016-04-18 18:15:50 +00:00
|
|
|
std::unordered_map<std::string, RecoveredTransaction*>
|
|
|
|
recovered_transactions() {
|
|
|
|
return recovered_transactions_;
|
|
|
|
}
|
|
|
|
|
2016-04-18 18:11:51 +00:00
|
|
|
RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
|
|
|
|
auto it = recovered_transactions_.find(name);
|
|
|
|
if (it == recovered_transactions_.end()) {
|
|
|
|
return nullptr;
|
|
|
|
} else {
|
|
|
|
return it->second;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
|
2018-03-05 18:48:29 +00:00
|
|
|
WriteBatch* batch, SequenceNumber seq,
|
2018-07-07 00:17:36 +00:00
|
|
|
size_t batch_cnt, bool unprepared_batch) {
|
|
|
|
// For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
|
|
|
|
// times for every unprepared batch encountered during recovery.
|
|
|
|
//
|
|
|
|
// If the transaction is prepared, then the last call to
|
|
|
|
// InsertRecoveredTransaction will have unprepared_batch = false.
|
|
|
|
auto rtxn = recovered_transactions_.find(name);
|
|
|
|
if (rtxn == recovered_transactions_.end()) {
|
|
|
|
recovered_transactions_[name] = new RecoveredTransaction(
|
|
|
|
log, name, batch, seq, batch_cnt, unprepared_batch);
|
|
|
|
} else {
|
|
|
|
rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
|
|
|
|
}
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
|
2016-04-18 18:11:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void DeleteRecoveredTransaction(const std::string& name) {
|
|
|
|
auto it = recovered_transactions_.find(name);
|
|
|
|
assert(it != recovered_transactions_.end());
|
|
|
|
auto* trx = it->second;
|
|
|
|
recovered_transactions_.erase(it);
|
2018-07-07 00:17:36 +00:00
|
|
|
for (const auto& info : trx->batches_) {
|
|
|
|
logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
|
|
|
|
info.second.log_number_);
|
|
|
|
}
|
2016-04-18 18:11:51 +00:00
|
|
|
delete trx;
|
|
|
|
}
|
|
|
|
|
2016-05-13 17:37:46 +00:00
|
|
|
void DeleteAllRecoveredTransactions() {
|
|
|
|
for (auto it = recovered_transactions_.begin();
|
2019-05-15 20:14:18 +00:00
|
|
|
it != recovered_transactions_.end(); ++it) {
|
2016-05-13 17:37:46 +00:00
|
|
|
delete it->second;
|
|
|
|
}
|
|
|
|
recovered_transactions_.clear();
|
|
|
|
}
|
|
|
|
|
2016-08-04 00:42:06 +00:00
|
|
|
void AddToLogsToFreeQueue(log::Writer* log_writer) {
|
2022-07-21 20:35:36 +00:00
|
|
|
mutex_.AssertHeld();
|
2016-08-04 00:42:06 +00:00
|
|
|
logs_to_free_queue_.push_back(log_writer);
|
|
|
|
}
|
2017-10-06 17:26:38 +00:00
|
|
|
|
2019-12-17 21:20:42 +00:00
|
|
|
void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
|
|
|
|
superversions_to_free_queue_.push_back(sv);
|
|
|
|
}
|
|
|
|
|
2017-10-06 17:26:38 +00:00
|
|
|
void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
|
|
|
|
|
2019-01-16 05:32:15 +00:00
|
|
|
// Fill JobContext with snapshot information needed by flush and compaction.
|
|
|
|
void GetSnapshotContext(JobContext* job_context,
|
|
|
|
std::vector<SequenceNumber>* snapshot_seqs,
|
|
|
|
SequenceNumber* earliest_write_conflict_snapshot,
|
|
|
|
SnapshotChecker** snapshot_checker);
|
|
|
|
|
2018-03-28 19:01:09 +00:00
|
|
|
// Not thread-safe.
|
|
|
|
void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
|
|
|
|
|
2019-05-04 00:26:20 +00:00
|
|
|
InstrumentedMutex* mutex() const { return &mutex_; }
|
2016-04-18 18:11:51 +00:00
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
// Initialize a brand new DB. The DB directory is expected to be empty before
|
2020-07-10 20:39:47 +00:00
|
|
|
// calling it. Push new manifest file name into `new_filenames`.
|
|
|
|
Status NewDB(std::vector<std::string>* new_filenames);
|
2016-06-24 18:19:40 +00:00
|
|
|
|
2017-11-11 01:18:01 +00:00
|
|
|
// This is to be used only by internal rocksdb classes.
|
|
|
|
static Status Open(const DBOptions& db_options, const std::string& name,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
|
|
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
|
2018-06-29 01:46:39 +00:00
|
|
|
const bool seq_per_batch, const bool batch_per_txn);
|
2017-11-11 01:18:01 +00:00
|
|
|
|
2020-03-03 00:14:00 +00:00
|
|
|
static IOStatus CreateAndNewDirectory(
|
|
|
|
FileSystem* fs, const std::string& dirname,
|
|
|
|
std::unique_ptr<FSDirectory>* directory);
|
2018-04-06 02:49:06 +00:00
|
|
|
|
2019-02-20 23:46:59 +00:00
|
|
|
// find stats map from stats_history_ with smallest timestamp in
|
|
|
|
// the range of [start_time, end_time)
|
|
|
|
bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
|
|
|
|
uint64_t* new_time,
|
|
|
|
std::map<std::string, uint64_t>* stats_map);
|
|
|
|
|
2019-08-15 23:59:42 +00:00
|
|
|
// Print information of all tombstones of all iterators to the std::string
|
|
|
|
// This is only used by ldb. The output might be capped. Tombstones
|
|
|
|
// printed out are not guaranteed to be in any order.
|
|
|
|
Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
|
|
|
|
int max_entries_to_print,
|
|
|
|
std::string* out_str);
|
|
|
|
|
2021-12-10 19:03:39 +00:00
|
|
|
VersionSet* GetVersionSet() const { return versions_.get(); }
|
|
|
|
|
2023-05-26 00:25:51 +00:00
|
|
|
Status WaitForCompact(
|
|
|
|
const WaitForCompactOptions& wait_for_compact_options) override;
|
2022-02-01 17:00:46 +00:00
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
#ifndef NDEBUG
|
|
|
|
// Compact any files in the named level that overlap [*begin, *end]
|
|
|
|
Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
|
|
|
|
ColumnFamilyHandle* column_family = nullptr,
|
|
|
|
bool disallow_trivial_move = false);
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
Status TEST_SwitchWAL();
|
2019-05-24 20:05:58 +00:00
|
|
|
|
|
|
|
bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
|
|
|
|
|
|
|
|
bool TEST_IsLogGettingFlushed() {
|
|
|
|
return alive_log_files_.begin()->getting_flushed;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
|
|
|
|
|
|
|
|
// Force current memtable contents to be flushed.
|
|
|
|
Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
|
|
|
|
ColumnFamilyHandle* cfh = nullptr);
|
|
|
|
|
2019-07-01 21:04:10 +00:00
|
|
|
Status TEST_FlushMemTable(ColumnFamilyData* cfd,
|
|
|
|
const FlushOptions& flush_opts);
|
|
|
|
|
|
|
|
// Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This
|
|
|
|
// is because in certain cases, we can flush column families, wait for the
|
|
|
|
// flush to complete, but delete the column family handle before the wait
|
|
|
|
// finishes. For example in CompactRange.
|
Fix bug of prematurely excluded CF in atomic flush contains unflushed data that should've been included in the atomic flush (#11148)
Summary:
**Context:**
Atomic flush should guarantee recoverability of all data of seqno up to the max seqno of the flush. It achieves this by ensuring all such data are flushed by the time this atomic flush finishes through `SelectColumnFamiliesForAtomicFlush()`. However, our crash test exposed the following case where an excluded CF from an atomic flush contains unflushed data of seqno less than the max seqno of that atomic flush and loses its data with `WriteOptions::DisableWAL=true` in face of a crash right after the atomic flush finishes .
```
./db_stress --preserve_unverified_changes=1 --reopen=0 --acquire_snapshot_one_in=0 --adaptive_readahead=1 --allow_data_in_errors=True --async_io=1 --atomic_flush=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=15 --bottommost_compression_type=none --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --checkpoint_one_in=0 --checksum_type=kXXH3 --clear_column_family_one_in=0 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=1 --compaction_ttl=100 --compression_max_dict_buffer_bytes=134217727 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4hc --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --expected_values_dir=$exp --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=none --flush_one_in=0 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=100 --get_property_one_in=0 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=2 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --long_running_snapshots=1 --manual_wal_flush_one_in=100 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=3 --pause_background_one_in=0 --periodic_compaction_seconds=100 --prefix_size=8 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=3600 --progress_reports=0 --read_fault_one_in=32 --readahead_size=16384 --readpercent=50 --recycle_log_file_num=0 --ribbon_starting_level=6 --secondary_cache_fault_one_in=0 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --stats_dump_period_sec=10 --subcompactions=1 --sync=0 --sync_fault_injection=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=0 --unpartitioned_pinning=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 --use_multiget=1 --use_put_entity_one_in=0 --user_timestamp_size=0 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=0 --verify_db_one_in=1000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --wal_compression=none --write_buffer_size=524288 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=30 &
pid=$!
sleep 0.2
sleep 10
kill $pid
sleep 0.2
./db_stress --ops_per_thread=1 --preserve_unverified_changes=1 --reopen=0 --acquire_snapshot_one_in=0 --adaptive_readahead=1 --allow_data_in_errors=True --async_io=1 --atomic_flush=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=15 --bottommost_compression_type=none --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --checkpoint_one_in=0 --checksum_type=kXXH3 --clear_column_family_one_in=0 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=1 --compaction_ttl=100 --compression_max_dict_buffer_bytes=134217727 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4hc --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --expected_values_dir=$exp --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=none --flush_one_in=0 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=100 --get_property_one_in=0 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=2 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --long_running_snapshots=1 --manual_wal_flush_one_in=100 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=3 --pause_background_one_in=0 --periodic_compaction_seconds=100 --prefix_size=8 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=3600 --progress_reports=0 --read_fault_one_in=32 --readahead_size=16384 --readpercent=50 --recycle_log_file_num=0 --ribbon_starting_level=6 --secondary_cache_fault_one_in=0 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --stats_dump_period_sec=10 --subcompactions=1 --sync=0 --sync_fault_injection=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=0 --unpartitioned_pinning=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 --use_multiget=1 --use_put_entity_one_in=0 --user_timestamp_size=0 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=0 --verify_db_one_in=1000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --wal_compression=none --write_buffer_size=524288 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=30 &
pid=$!
sleep 0.2
sleep 40
kill $pid
sleep 0.2
Verification failed for column family 6 key 0000000000000239000000000000012B0000000000000138 (56622): value_from_db: , value_from_expected: 4A6331754E4F4C4D42434041464744455A5B58595E5F5C5D5253505156575455, msg: Value not found: NotFound:
Crash-recovery verification failed :(
No writes or ops?
Verification failed :(
```
The bug is due to the following:
- When atomic flush is used, an empty CF is legally [excluded](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_filesnapshot.cc#L39) in `SelectColumnFamiliesForAtomicFlush` as the first step of `DBImpl::FlushForGetLiveFiles` before [passing](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_filesnapshot.cc#L42) the included CFDs to `AtomicFlushMemTables`.
- But [later](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_impl/db_impl_compaction_flush.cc#L2133) in `AtomicFlushMemTables`, `WaitUntilFlushWouldNotStallWrites` will [release the db mutex](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_impl/db_impl_compaction_flush.cc#L2403), during which data@seqno N can be inserted into the excluded CF and data@seqno M can be inserted into one of the included CFs, where M > N.
- However, data@seqno N in an already-excluded CF is thus excluded from this atomic flush while we seqno N is less than seqno M.
**Summary:**
- Replace `SelectColumnFamiliesForAtomicFlush()`-before-`AtomicFlushMemTables()` with `SelectColumnFamiliesForAtomicFlush()`-after-wait-within-`AtomicFlushMemTables()` so we ensure no write affecting the recoverability of this atomic job (i.e, change to max seqno of this atomic flush or insertion of data with less seqno than the max seqno of the atomic flush to excluded CF) can happen after calling `SelectColumnFamiliesForAtomicFlush()`.
- For above, refactored and clarified comments on `SelectColumnFamiliesForAtomicFlush()` and `AtomicFlushMemTables()` for clearer semantics of passed-in CFDs to atomic-flush
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11148
Test Plan:
- New unit test failed before the fix and passes after
- Make check
- Rehearsal stress test
Reviewed By: ajkr
Differential Revision: D42799871
Pulled By: hx235
fbshipit-source-id: 13636b63e9c25c5895857afc36ea580d57f6d644
2023-03-14 23:53:20 +00:00
|
|
|
Status TEST_AtomicFlushMemTables(
|
|
|
|
const autovector<ColumnFamilyData*>& provided_candidate_cfds,
|
|
|
|
const FlushOptions& flush_opts);
|
2019-07-01 21:04:10 +00:00
|
|
|
|
2021-12-23 05:58:11 +00:00
|
|
|
// Wait for background threads to complete scheduled work.
|
|
|
|
Status TEST_WaitForBackgroundWork();
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
// Wait for memtable compaction
|
|
|
|
Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
|
|
|
|
|
2023-05-26 00:25:51 +00:00
|
|
|
Status TEST_WaitForCompact();
|
|
|
|
Status TEST_WaitForCompact(
|
|
|
|
const WaitForCompactOptions& wait_for_compact_options);
|
2019-05-24 20:05:58 +00:00
|
|
|
|
2021-11-03 19:20:19 +00:00
|
|
|
// Wait for any background purge
|
|
|
|
Status TEST_WaitForPurge();
|
|
|
|
|
2021-01-28 01:56:17 +00:00
|
|
|
// Get the background error status
|
|
|
|
Status TEST_GetBGError();
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
// Return the maximum overlapping data (in bytes) at next level for any
|
|
|
|
// file at a level >= 1.
|
2021-07-08 17:07:37 +00:00
|
|
|
uint64_t TEST_MaxNextLevelOverlappingBytes(
|
2019-05-24 20:05:58 +00:00
|
|
|
ColumnFamilyHandle* column_family = nullptr);
|
|
|
|
|
|
|
|
// Return the current manifest file no.
|
|
|
|
uint64_t TEST_Current_Manifest_FileNo();
|
|
|
|
|
|
|
|
// Returns the number that'll be assigned to the next file that's created.
|
|
|
|
uint64_t TEST_Current_Next_FileNo();
|
|
|
|
|
|
|
|
// get total level0 file size. Only for testing.
|
|
|
|
uint64_t TEST_GetLevel0TotalSize();
|
|
|
|
|
2021-06-28 15:12:32 +00:00
|
|
|
void TEST_GetFilesMetaData(
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
std::vector<std::vector<FileMetaData>>* metadata,
|
|
|
|
std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata = nullptr);
|
2019-05-24 20:05:58 +00:00
|
|
|
|
|
|
|
void TEST_LockMutex();
|
|
|
|
|
|
|
|
void TEST_UnlockMutex();
|
|
|
|
|
2024-01-25 22:40:18 +00:00
|
|
|
InstrumentedMutex* TEST_Mutex() { return &mutex_; }
|
|
|
|
|
2023-02-03 20:08:37 +00:00
|
|
|
void TEST_SignalAllBgCv();
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
// REQUIRES: mutex locked
|
|
|
|
void* TEST_BeginWrite();
|
|
|
|
|
|
|
|
// REQUIRES: mutex locked
|
|
|
|
// pass the pointer that you got from TEST_BeginWrite()
|
|
|
|
void TEST_EndWrite(void* w);
|
|
|
|
|
|
|
|
uint64_t TEST_MaxTotalInMemoryState() const {
|
|
|
|
return max_total_in_memory_state_;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t TEST_LogsToFreeSize();
|
|
|
|
|
|
|
|
uint64_t TEST_LogfileNumber();
|
|
|
|
|
|
|
|
uint64_t TEST_total_log_size() const { return total_log_size_; }
|
|
|
|
|
|
|
|
// Returns column family name to ImmutableCFOptions map.
|
|
|
|
Status TEST_GetAllImmutableCFOptions(
|
|
|
|
std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
|
|
|
|
|
|
|
|
// Return the lastest MutableCFOptions of a column family
|
|
|
|
Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
|
|
|
|
MutableCFOptions* mutable_cf_options);
|
|
|
|
|
|
|
|
Cache* TEST_table_cache() { return table_cache_.get(); }
|
|
|
|
|
|
|
|
WriteController& TEST_write_controler() { return write_controller_; }
|
|
|
|
|
|
|
|
uint64_t TEST_FindMinLogContainingOutstandingPrep();
|
|
|
|
uint64_t TEST_FindMinPrepLogReferencedByMemTable();
|
|
|
|
size_t TEST_PreparedSectionCompletedSize();
|
|
|
|
size_t TEST_LogsWithPrepSize();
|
|
|
|
|
|
|
|
int TEST_BGCompactionsAllowed() const;
|
|
|
|
int TEST_BGFlushesAllowed() const;
|
|
|
|
size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
|
2022-12-12 18:37:55 +00:00
|
|
|
void TEST_WaitForPeriodicTaskRun(std::function<void()> callback) const;
|
2022-07-19 02:08:39 +00:00
|
|
|
SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const;
|
2023-11-11 16:11:11 +00:00
|
|
|
const autovector<uint64_t>& TEST_GetFilesToQuarantine() const;
|
2019-06-17 22:17:43 +00:00
|
|
|
size_t TEST_EstimateInMemoryStatsHistorySize() const;
|
2020-05-04 22:05:34 +00:00
|
|
|
|
2020-11-17 23:54:49 +00:00
|
|
|
uint64_t TEST_GetCurrentLogNumber() const {
|
|
|
|
InstrumentedMutexLock l(mutex());
|
|
|
|
assert(!logs_.empty());
|
|
|
|
return logs_.back().number;
|
|
|
|
}
|
|
|
|
|
2020-05-07 16:29:21 +00:00
|
|
|
const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
|
|
|
|
return files_grabbed_for_purge_;
|
|
|
|
}
|
2020-08-15 03:11:35 +00:00
|
|
|
|
2022-08-26 01:52:37 +00:00
|
|
|
const PeriodicTaskScheduler& TEST_GetPeriodicTaskScheduler() const;
|
2020-08-15 03:11:35 +00:00
|
|
|
|
Offpeak in db option (#11893)
Summary:
RocksDB's primary function is to facilitate read and write operations. Compactions, while essential for minimizing read amplifications and optimizing storage, can sometimes compete with these primary tasks. Especially during periods of high read/write traffic, it's vital to ensure that primary operations receive priority, avoiding any potential disruptions or slowdowns. Conversely, during off-peak times when traffic is minimal, it's an opportune moment to tackle low-priority tasks like TTL based compactions, optimizing resource usage.
In this PR, we are incorporating the concept of off-peak time into RocksDB by introducing `daily_offpeak_time_utc` within the DBOptions. This setting is formatted as "HH:mm-HH:mm" where the first one before "-" is the start time and the second one is the end time, inclusive. It will be later used for resource optimization in subsequent PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11893
Test Plan:
- New Unit Test Added - `DBOptionsTest::OffPeakTimes`
- Existing Unit Test Updated - `OptionsTest`, `OptionsSettableTest`
Reviewed By: pdillinger
Differential Revision: D49714553
Pulled By: jaykorean
fbshipit-source-id: fef51ea7c0fede6431c715bff116ddbb567c8752
2023-09-29 20:03:39 +00:00
|
|
|
static Status TEST_ValidateOptions(const DBOptions& db_options) {
|
|
|
|
return ValidateOptions(db_options);
|
|
|
|
}
|
|
|
|
|
2019-05-24 20:05:58 +00:00
|
|
|
#endif // NDEBUG
|
2019-05-31 04:29:44 +00:00
|
|
|
|
2020-08-15 03:11:35 +00:00
|
|
|
// persist stats to column family "_persistent_stats"
|
|
|
|
void PersistStats();
|
|
|
|
|
|
|
|
// dump rocksdb.stats to LOG
|
|
|
|
void DumpStats();
|
|
|
|
|
2020-10-02 02:12:26 +00:00
|
|
|
// flush LOG out of application buffer
|
|
|
|
void FlushInfoLog();
|
|
|
|
|
Bootstrap, pre-populate seqno_to_time_mapping (#11922)
Summary:
This change has two primary goals (follow-up to https://github.com/facebook/rocksdb/issues/11917, https://github.com/facebook/rocksdb/issues/11920):
* Ensure the DB seqno_to_time_mapping has entries that allow us to put a good time lower bound on any writes that happen after setting up preserve/preclude options (either in a new DB, new CF, SetOptions, etc.) and haven't yet aged out of that time window. This allows us to remove a bunch of work-arounds in tests.
* For new DBs using preserve/preclude options, automatically reserve some sequence numbers and pre-map them to cover the time span back to the preserve/preclude cut-off time. In the future, this will allow us to import data from another DB by key, value, and write time by assigning an appropriate seqno in this DB for that write time.
Note that the pre-population (historical mappings) does not happen if the original options at DB Open time do not have preserve/preclude, so it is recommended to create initial column families at that time with create_missing_column_families, to take advantage of this (future) feature. (Adding these historical mappings after DB Open would risk non-monotonic seqno_to_time_mapping, which is dubious if not dangerous.)
Recommended follow-up:
* Solve existing race conditions (not memory safety) where parallel operations like CreateColumnFamily or SetDBOptions could leave the wrong setting in effect.
* Make SeqnoToTimeMapping more gracefully handle a possible case in which too many mappings are added for the time range of concern. It seems like there could be cases where data is massively excluded from the cold tier because of entries falling off the front of the mapping list (causing GetProximalSeqnoBeforeTime() to return 0). (More investigation needed.)
No release note for the minor bug fix because this is still an experimental feature with limited usage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11922
Test Plan: tests added / updated
Reviewed By: jowlyzhang
Differential Revision: D49956563
Pulled By: pdillinger
fbshipit-source-id: 92beb918c3a298fae9ca8e509717b1067caa1519
2023-10-06 15:21:21 +00:00
|
|
|
// record current sequence number to time mapping. If
|
|
|
|
// populate_historical_seconds > 0 then pre-populate all the
|
|
|
|
// sequence numbers from [1, last] to map to [now minus
|
|
|
|
// populate_historical_seconds, now].
|
|
|
|
void RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds);
|
2022-07-15 04:49:34 +00:00
|
|
|
|
Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned.
2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.
Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.
2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.
3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428
Test Plan: Added unit test
Reviewed By: pdillinger
Differential Revision: D54967067
Pulled By: jowlyzhang
fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
2024-03-15 22:37:37 +00:00
|
|
|
// Everytime DB's seqno to time mapping changed (which already hold the db
|
|
|
|
// mutex), we install a new SuperVersion in each column family with a shared
|
|
|
|
// copy of the new mapping while holding the db mutex.
|
|
|
|
// This is done for all column families even though the column family does not
|
|
|
|
// explicitly enabled the
|
|
|
|
// `preclude_last_level_data_seconds` or `preserve_internal_time_seconds`
|
|
|
|
// features.
|
|
|
|
// This mapping supports iterators to fulfill the
|
|
|
|
// "rocksdb.iterator.write-time" iterator property for entries in memtables.
|
|
|
|
//
|
|
|
|
// Since this new SuperVersion doesn't involve an LSM tree shape change, we
|
|
|
|
// don't schedule work after installing this SuperVersion. It returns the used
|
|
|
|
// `SuperVersionContext` for clean up after release mutex.
|
|
|
|
void InstallSeqnoToTimeMappingInSV(
|
|
|
|
std::vector<SuperVersionContext>* sv_contexts);
|
|
|
|
|
2021-04-21 20:53:05 +00:00
|
|
|
// Interface to block and signal the DB in case of stalling writes by
|
|
|
|
// WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface.
|
|
|
|
// When DB needs to be blocked or signalled by WriteBufferManager,
|
|
|
|
// state_ is changed accordingly.
|
|
|
|
class WBMStallInterface : public StallInterface {
|
|
|
|
public:
|
|
|
|
enum State {
|
|
|
|
BLOCKED = 0,
|
|
|
|
RUNNING,
|
|
|
|
};
|
|
|
|
|
|
|
|
WBMStallInterface() : state_cv_(&state_mutex_) {
|
|
|
|
MutexLock lock(&state_mutex_);
|
|
|
|
state_ = State::RUNNING;
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetState(State state) {
|
|
|
|
MutexLock lock(&state_mutex_);
|
|
|
|
state_ = state;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Change the state_ to State::BLOCKED and wait until its state is
|
|
|
|
// changed by WriteBufferManager. When stall is cleared, Signal() is
|
|
|
|
// called to change the state and unblock the DB.
|
|
|
|
void Block() override {
|
|
|
|
MutexLock lock(&state_mutex_);
|
|
|
|
while (state_ == State::BLOCKED) {
|
|
|
|
TEST_SYNC_POINT("WBMStallInterface::BlockDB");
|
|
|
|
state_cv_.Wait();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Called from WriteBufferManager. This function changes the state_
|
|
|
|
// to State::RUNNING indicating the stall is cleared and DB can proceed.
|
|
|
|
void Signal() override {
|
Fix race in WriteBufferManager (#9009)
Summary:
EndWriteStall has a data race: `queue_.empty()` is checked outside of the
mutex, so once we enter the critical section another thread may already have
cleared the list, and accessing the `front()` is undefined behavior (and causes
interesting crashes under high concurrency).
This PR fixes the bug, and also rewrites the logic to make it easier to reason
about it. It also fixes another subtle bug: if some writers are stalled and
`SetBufferSize(0)` is called, which disables the WBM, the writer are not
unblocked because of an early `enabled()` check in `EndWriteStall()`.
It doesn't significantly change the locking behavior, as before writers won't
lock unless entering a stall condition, and `FreeMem` almost always locks if
stalling is allowed, but that is inevitable with the current design. Liveness is
guaranteed by the fact that if some writes are blocked, eventually all writes
will be blocked due to `stall_active_`, and eventually all memory is freed.
While at it, do a couple of optimizations:
- In `WBMStallInterface::Signal()` signal the CV only after releasing the
lock. Signaling under the lock is a common pitfall, as it causes the woken-up
thread to immediately go back to sleep because the mutex is still locked by
the awaker.
- Move all allocations and deallocations outside of the lock.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9009
Test Plan:
```
USE_CLANG=1 make -j64 all check
```
Reviewed By: akankshamahajan15
Differential Revision: D31550668
Pulled By: ot
fbshipit-source-id: 5125387c3dc7ecaaa2b8bbc736e58c4156698580
2021-10-12 07:14:41 +00:00
|
|
|
{
|
|
|
|
MutexLock lock(&state_mutex_);
|
|
|
|
state_ = State::RUNNING;
|
|
|
|
}
|
2021-04-21 20:53:05 +00:00
|
|
|
state_cv_.Signal();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
// Conditional variable and mutex to block and
|
|
|
|
// signal the DB during stalling process.
|
|
|
|
port::Mutex state_mutex_;
|
|
|
|
port::CondVar state_cv_;
|
|
|
|
// state represting whether DB is running or blocked because of stall by
|
|
|
|
// WriteBufferManager.
|
|
|
|
State state_;
|
|
|
|
};
|
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-17 01:13:55 +00:00
|
|
|
static void TEST_ResetDbSessionIdGen();
|
2021-08-18 18:32:00 +00:00
|
|
|
static std::string GenerateDbSessionId(Env* env);
|
|
|
|
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
bool seq_per_batch() const { return seq_per_batch_; }
|
|
|
|
|
2012-12-18 21:05:39 +00:00
|
|
|
protected:
|
2012-11-06 03:18:49 +00:00
|
|
|
const std::string dbname_;
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-17 01:13:55 +00:00
|
|
|
// TODO(peterd): unify with VersionSet::db_id_
|
2019-09-03 15:50:47 +00:00
|
|
|
std::string db_id_;
|
2020-06-15 17:45:03 +00:00
|
|
|
// db_session_id_ is an identifier that gets reset
|
|
|
|
// every time the DB is opened
|
|
|
|
std::string db_session_id_;
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<VersionSet> versions_;
|
2018-01-16 18:57:56 +00:00
|
|
|
// Flag to check whether we allocated and own the info log file
|
|
|
|
bool own_info_log_;
|
2022-06-22 15:26:38 +00:00
|
|
|
Status init_logger_creation_s_;
|
2016-10-14 19:25:39 +00:00
|
|
|
const DBOptions initial_db_options_;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
Env* const env_;
|
2020-08-05 01:40:19 +00:00
|
|
|
std::shared_ptr<IOTracer> io_tracer_;
|
2016-09-23 23:34:04 +00:00
|
|
|
const ImmutableDBOptions immutable_db_options_;
|
2020-08-13 00:28:10 +00:00
|
|
|
FileSystemPtr fs_;
|
2016-09-23 23:34:04 +00:00
|
|
|
MutableDBOptions mutable_db_options_;
|
2014-07-28 19:05:36 +00:00
|
|
|
Statistics* stats_;
|
2016-04-18 18:11:51 +00:00
|
|
|
std::unordered_map<std::string, RecoveredTransaction*>
|
|
|
|
recovered_transactions_;
|
2018-08-01 07:14:43 +00:00
|
|
|
std::unique_ptr<Tracer> tracer_;
|
|
|
|
InstrumentedMutex trace_mutex_;
|
2019-06-13 22:39:52 +00:00
|
|
|
BlockCacheTracer block_cache_tracer_;
|
2012-11-06 03:18:49 +00:00
|
|
|
|
2022-06-07 01:32:26 +00:00
|
|
|
// constant false canceled flag, used when the compaction is not manual
|
|
|
|
const std::atomic<bool> kManualCompactionCanceledFalse_{false};
|
|
|
|
|
2019-03-26 23:41:31 +00:00
|
|
|
// State below is protected by mutex_
|
|
|
|
// With two_write_queues enabled, some of the variables that accessed during
|
|
|
|
// WriteToWAL need different synchronization: log_empty_, alive_log_files_,
|
|
|
|
// logs_, logfile_number_. Refer to the definition of each variable below for
|
|
|
|
// more description.
|
2022-02-27 19:36:54 +00:00
|
|
|
//
|
|
|
|
// `mutex_` can be a hot lock in some workloads, so it deserves dedicated
|
|
|
|
// cachelines.
|
|
|
|
mutable CacheAlignedInstrumentedMutex mutex_;
|
2019-03-26 23:41:31 +00:00
|
|
|
|
|
|
|
ColumnFamilyHandleImpl* default_cf_handle_;
|
|
|
|
InternalStats* default_cf_internal_stats_;
|
|
|
|
|
2021-04-22 20:01:00 +00:00
|
|
|
// table_cache_ provides its own synchronization
|
|
|
|
std::shared_ptr<Cache> table_cache_;
|
|
|
|
|
|
|
|
ErrorHandler error_handler_;
|
|
|
|
|
|
|
|
// Unified interface for logging events
|
|
|
|
EventLogger event_logger_;
|
|
|
|
|
2019-03-26 23:41:31 +00:00
|
|
|
// only used for dynamically adjusting max_total_wal_size. it is a sum of
|
|
|
|
// [write_buffer_size * max_write_buffer_number] over all column families
|
2022-07-21 20:35:36 +00:00
|
|
|
std::atomic<uint64_t> max_total_in_memory_state_;
|
2019-03-26 23:41:31 +00:00
|
|
|
|
|
|
|
// The options to access storage files
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
const FileOptions file_options_;
|
2019-03-26 23:41:31 +00:00
|
|
|
|
|
|
|
// Additonal options for compaction and flush
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
FileOptions file_options_for_compaction_;
|
2019-03-26 23:41:31 +00:00
|
|
|
|
2019-04-24 19:05:29 +00:00
|
|
|
std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
|
|
|
|
|
|
|
|
// Increase the sequence number after writing each batch, whether memtable is
|
|
|
|
// disabled for that or not. Otherwise the sequence number is increased after
|
|
|
|
// writing each key into memtable. This implies that when disable_memtable is
|
|
|
|
// set, the seq is not increased at all.
|
|
|
|
//
|
|
|
|
// Default: false
|
|
|
|
const bool seq_per_batch_;
|
|
|
|
// This determines during recovery whether we expect one writebatch per
|
|
|
|
// recovered transaction, or potentially multiple writebatches per
|
|
|
|
// transaction. For WriteUnprepared, this is set to false, since multiple
|
|
|
|
// batches can exist per transaction.
|
|
|
|
//
|
|
|
|
// Default: true
|
|
|
|
const bool batch_per_txn_;
|
|
|
|
|
2021-04-22 20:01:00 +00:00
|
|
|
// Each flush or compaction gets its own job id. this counter makes sure
|
|
|
|
// they're unique
|
|
|
|
std::atomic<int> next_job_id_;
|
|
|
|
|
|
|
|
std::atomic<bool> shutting_down_;
|
|
|
|
|
2023-08-11 19:30:48 +00:00
|
|
|
// No new background jobs can be queued if true. This is used to prevent new
|
|
|
|
// background jobs from being queued after WaitForCompact() completes waiting
|
|
|
|
// all background jobs then attempts to close when close_db_ option is true.
|
|
|
|
bool reject_new_background_jobs_;
|
|
|
|
|
Persist the new MANIFEST after successfully syncing the new WAL during recovery (#9922)
Summary:
In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't
flush the data from WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail.
As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL.
If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
If future recovery starts from the old MANIFEST, it means the writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9922
Test Plan:
1. Update unit tests to fail without this change
2. make crast_test -j
Branch with unit test and no fix https://github.com/facebook/rocksdb/pull/9942 to keep track of unit test (without fix)
Reviewed By: riversand963
Differential Revision: D36043701
Pulled By: akankshamahajan15
fbshipit-source-id: 5760970db0a0920fb73d3c054a4155733500acd9
2022-06-01 17:52:26 +00:00
|
|
|
// RecoveryContext struct stores the context about version edits along
|
|
|
|
// with corresponding column_family_data and column_family_options.
|
|
|
|
class RecoveryContext {
|
|
|
|
public:
|
|
|
|
~RecoveryContext() {
|
|
|
|
for (auto& edit_list : edit_lists_) {
|
|
|
|
for (auto* edit : edit_list) {
|
|
|
|
delete edit;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) {
|
|
|
|
assert(cfd != nullptr);
|
|
|
|
if (map_.find(cfd->GetID()) == map_.end()) {
|
|
|
|
uint32_t size = static_cast<uint32_t>(map_.size());
|
|
|
|
map_.emplace(cfd->GetID(), size);
|
|
|
|
cfds_.emplace_back(cfd);
|
|
|
|
mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions());
|
|
|
|
edit_lists_.emplace_back(autovector<VersionEdit*>());
|
|
|
|
}
|
|
|
|
uint32_t i = map_[cfd->GetID()];
|
|
|
|
edit_lists_[i].emplace_back(new VersionEdit(edit));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unordered_map<uint32_t, uint32_t> map_; // cf_id to index;
|
|
|
|
autovector<ColumnFamilyData*> cfds_;
|
|
|
|
autovector<const MutableCFOptions*> mutable_cf_opts_;
|
|
|
|
autovector<autovector<VersionEdit*>> edit_lists_;
|
2023-10-28 16:50:52 +00:00
|
|
|
// Stale SST files to delete found upon recovery. This stores a mapping from
|
|
|
|
// such a file's absolute path to its parent directory.
|
|
|
|
std::unordered_map<std::string, std::string> files_to_delete_;
|
Use manifest to persist pre-allocated seqnos (#11995)
Summary:
... and other fixes for crash test after https://github.com/facebook/rocksdb/issues/11922.
* When pre-allocating sequence numbers for establishing a time history, record that last sequence number in the manifest so that it is (most likely) restored on recovery even if no user writes were made or were recovered (e.g. no WAL).
* When pre-allocating sequence numbers for establishing a time history, only do this for actually new DBs.
* Remove the feature that ensures non-zero sequence number on creating the first column family with preserve/preclude option after initial DB::Open. Until fixed in a way compatible with the crash test, this creates a gap where some data written with active preserve/preclude option won't have a known associated time.
Together, these ensure we don't upset the crash test by manipulating sequence numbers after initial DB creation (esp when re-opening with different options). (The crash test expects that the seqno after re-open corresponds to a known point in time from previous crash test operation, matching an expected DB state.)
Follow-up work:
* Re-fill the gap to ensure all data written under preserve/preclude settings have a known time estimate.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11995
Test Plan:
Added to unit test SeqnoTimeTablePropTest.PrePopulateInDB
Verified fixes two crash test scenarios:
## 1st reproducer
First apply
```
diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc
index b483e154c..ef63b8d6c 100644
--- a/db_stress_tool/expected_state.cc
+++ b/db_stress_tool/expected_state.cc
@@ -333,6 +333,7 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) {
s = NewFileTraceWriter(Env::Default(), soptions, trace_file_path,
&trace_writer);
}
+ if (getenv("CRASH")) assert(false);
if (s.ok()) {
TraceOptions trace_opts;
trace_opts.filter |= kTraceFilterGet;
```
Then
```
mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_expected
mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_whitebox
rm -rf /dev/shm/rocksdb_test/rocksdb_crashtest_*/*
CRASH=1 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=1 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --preserve_internal_time_seconds=36000
./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=0 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --preserve_internal_time_seconds=0
```
Without the fix you get
```
...
DB path: [/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox]
(Re-)verified 34 unique IDs
Error restoring historical expected values: Corruption: DB is older than any restorable expected state
```
## 2nd reproducer
First apply
```
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 62ddead7b..f2654980f 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1126,6 +1126,7 @@ void StressTest::OperateDb(ThreadState* thread) {
// OPERATION write
TestPut(thread, write_opts, read_opts, rand_column_families, rand_keys,
value);
+ if (getenv("CRASH")) assert(false);
} else if (prob_op < del_bound) {
assert(write_bound <= prob_op);
// OPERATION delete
```
Then
```
rm -rf /dev/shm/rocksdb_test/rocksdb_crashtest_*/*
CRASH=1 ./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=1 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --disable_wal=1 --reopen=0 --preserve_internal_time_seconds=0
./db_stress --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --destroy_db_initially=0 --manual_wal_flush_one_in=1000000 --clear_column_family_one_in=0 --disable_wal=1 --reopen=0 --preserve_internal_time_seconds=3600
```
Without the fix you get
```
DB path: [/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox]
(Re-)verified 34 unique IDs
db_stress: db_stress_tool/expected_state.cc:380: virtual rocksdb::{anonymous}::ExpectedStateTraceRecordHandler::~
ExpectedStateTraceRecordHandler(): Assertion `IsDone()' failed.
```
Reviewed By: jowlyzhang
Differential Revision: D50533346
Pulled By: pdillinger
fbshipit-source-id: 1056be45c5b9e537c8c601b28c4b27431a782477
2023-10-23 16:20:59 +00:00
|
|
|
bool is_new_db_ = false;
|
Persist the new MANIFEST after successfully syncing the new WAL during recovery (#9922)
Summary:
In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't
flush the data from WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail.
As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL.
If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
If future recovery starts from the old MANIFEST, it means the writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9922
Test Plan:
1. Update unit tests to fail without this change
2. make crast_test -j
Branch with unit test and no fix https://github.com/facebook/rocksdb/pull/9942 to keep track of unit test (without fix)
Reviewed By: riversand963
Differential Revision: D36043701
Pulled By: akankshamahajan15
fbshipit-source-id: 5760970db0a0920fb73d3c054a4155733500acd9
2022-06-01 17:52:26 +00:00
|
|
|
};
|
|
|
|
|
2023-10-16 15:58:47 +00:00
|
|
|
// Persist options to options file. Must be holding options_mutex_.
|
|
|
|
// Will lock DB mutex if !db_mutex_already_held.
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
Status WriteOptionsFile(const WriteOptions& write_options,
|
|
|
|
bool db_mutex_already_held);
|
2015-12-09 01:01:02 +00:00
|
|
|
|
2020-12-02 20:59:23 +00:00
|
|
|
Status CompactRangeInternal(const CompactRangeOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
2022-03-12 00:13:23 +00:00
|
|
|
const Slice* begin, const Slice* end,
|
|
|
|
const std::string& trim_ts);
|
2020-12-02 20:59:23 +00:00
|
|
|
|
2015-12-09 01:01:02 +00:00
|
|
|
// The following two functions can only be called when:
|
|
|
|
// 1. WriteThread::Writer::EnterUnbatched() is used.
|
|
|
|
// 2. db_mutex is NOT held
|
2015-11-11 06:58:01 +00:00
|
|
|
Status RenameTempFileToOptionsFile(const std::string& file_name);
|
|
|
|
Status DeleteObsoleteOptionsFiles();
|
|
|
|
|
2017-04-18 19:00:36 +00:00
|
|
|
void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
|
|
|
|
const MutableCFOptions& mutable_cf_options,
|
2023-01-24 17:54:04 +00:00
|
|
|
int job_id, FlushReason flush_reason);
|
2017-04-18 19:00:36 +00:00
|
|
|
|
2019-10-16 17:39:00 +00:00
|
|
|
void NotifyOnFlushCompleted(
|
|
|
|
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
|
|
|
|
std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 22:45:18 +00:00
|
|
|
|
2019-03-27 23:13:08 +00:00
|
|
|
void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
|
|
|
|
const Status& st,
|
|
|
|
const CompactionJobStats& job_stats, int job_id);
|
2018-10-11 00:30:22 +00:00
|
|
|
|
2019-03-27 23:13:08 +00:00
|
|
|
void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
|
|
|
|
const Status& st,
|
2015-06-03 00:07:16 +00:00
|
|
|
const CompactionJobStats& job_stats,
|
2015-06-03 00:35:39 +00:00
|
|
|
int job_id);
|
2016-06-10 02:03:10 +00:00
|
|
|
void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
|
2016-06-02 18:57:31 +00:00
|
|
|
const MemTableInfo& mem_table_info);
|
2015-01-27 22:44:02 +00:00
|
|
|
|
2016-12-06 21:56:17 +00:00
|
|
|
void NotifyOnExternalFileIngested(
|
|
|
|
ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
|
2021-10-16 17:03:19 +00:00
|
|
|
|
2023-05-31 19:53:51 +00:00
|
|
|
Status FlushAllColumnFamilies(const FlushOptions& flush_options,
|
|
|
|
FlushReason flush_reason);
|
|
|
|
|
2022-08-29 20:36:23 +00:00
|
|
|
virtual Status FlushForGetLiveFiles();
|
2016-12-06 21:56:17 +00:00
|
|
|
|
2014-11-20 18:49:32 +00:00
|
|
|
void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
|
|
|
|
|
|
|
|
void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
|
|
|
|
|
|
|
|
void EraseThreadStatusDbInfo() const;
|
|
|
|
|
2017-11-15 16:22:54 +00:00
|
|
|
// If disable_memtable is set the application logic must guarantee that the
|
2018-04-08 04:55:42 +00:00
|
|
|
// batch will still be skipped from memtable during the recovery. An excption
|
|
|
|
// to this is seq_per_batch_ mode, in which since each batch already takes one
|
|
|
|
// seq, it is ok for the batch to write to memtable during recovery as long as
|
|
|
|
// it only takes one sequence number: i.e., no duplicate keys.
|
|
|
|
// In WriteCommitted it is guarnateed since disable_memtable is used for
|
|
|
|
// prepare batch which will be written to memtable later during the commit,
|
|
|
|
// and in WritePrepared it is guaranteed since it will be used only for WAL
|
|
|
|
// markers which will never be written to memtable. If the commit marker is
|
|
|
|
// accompanied with CommitTimeWriteBatch that is not written to memtable as
|
|
|
|
// long as it has no duplicate keys, it does not violate the one-seq-per-batch
|
|
|
|
// policy.
|
|
|
|
// batch_cnt is expected to be non-zero in seq_per_batch mode and
|
|
|
|
// indicates the number of sub-patches. A sub-patch is a subset of the write
|
|
|
|
// batch that does not have duplicate keys.
|
2015-05-29 21:36:35 +00:00
|
|
|
Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
|
2016-04-18 18:11:51 +00:00
|
|
|
WriteCallback* callback = nullptr,
|
|
|
|
uint64_t* log_used = nullptr, uint64_t log_ref = 0,
|
2017-12-01 07:39:56 +00:00
|
|
|
bool disable_memtable = false, uint64_t* seq_used = nullptr,
|
2018-02-06 02:32:54 +00:00
|
|
|
size_t batch_cnt = 0,
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
PreReleaseCallback* pre_release_callback = nullptr,
|
|
|
|
PostMemTableCallback* post_memtable_callback = nullptr);
|
2016-04-18 18:11:51 +00:00
|
|
|
|
2017-05-19 21:24:23 +00:00
|
|
|
Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
|
|
|
|
WriteCallback* callback = nullptr,
|
|
|
|
uint64_t* log_used = nullptr, uint64_t log_ref = 0,
|
2017-08-16 23:49:11 +00:00
|
|
|
bool disable_memtable = false,
|
|
|
|
uint64_t* seq_used = nullptr);
|
2017-05-19 21:24:23 +00:00
|
|
|
|
2019-05-14 00:43:47 +00:00
|
|
|
// Write only to memtables without joining any write queue
|
|
|
|
Status UnorderedWriteMemtable(const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch, WriteCallback* callback,
|
|
|
|
uint64_t log_ref, SequenceNumber seq,
|
|
|
|
const size_t sub_batch_cnt);
|
|
|
|
|
|
|
|
// Whether the batch requires to be assigned with an order
|
|
|
|
enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
|
|
|
|
// Whether it requires publishing last sequence or not
|
|
|
|
enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
|
|
|
|
|
|
|
|
// Join the write_thread to write the batch only to the WAL. It is the
|
|
|
|
// responsibility of the caller to also write the write batch to the memtable
|
|
|
|
// if it required.
|
|
|
|
//
|
|
|
|
// sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder
|
|
|
|
// indicating the number of sub-batches in my_batch. A sub-patch is a subset
|
|
|
|
// of the write batch that does not have duplicate keys. When seq_per_batch is
|
|
|
|
// not set, each key is a separate sub_batch. Otherwise each duplicate key
|
|
|
|
// marks start of a new sub-batch.
|
|
|
|
Status WriteImplWALOnly(
|
|
|
|
WriteThread* write_thread, const WriteOptions& options,
|
|
|
|
WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
|
|
|
|
const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
|
|
|
|
PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
|
|
|
|
const PublishLastSeq publish_last_seq, const bool disable_memtable);
|
2017-06-24 21:06:43 +00:00
|
|
|
|
2017-11-02 00:23:52 +00:00
|
|
|
// write cached_recoverable_state_ to memtable if it is not empty
|
|
|
|
// The writer must be the leader in write_thread_ and holding mutex_
|
|
|
|
Status WriteRecoverableState();
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2018-02-23 21:50:02 +00:00
|
|
|
// Actual implementation of Close()
|
|
|
|
Status CloseImpl();
|
|
|
|
|
2019-03-26 23:41:31 +00:00
|
|
|
// Recover the descriptor from persistent storage. May do a significant
|
|
|
|
// amount of work to recover recently logged updates. Any changes to
|
|
|
|
// be made to the descriptor are added to *edit.
|
2020-01-29 19:39:11 +00:00
|
|
|
// recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
|
|
|
|
// skipped.
|
Persist the new MANIFEST after successfully syncing the new WAL during recovery (#9922)
Summary:
In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't
flush the data from WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail.
As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL.
If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
If future recovery starts from the old MANIFEST, it means the writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9922
Test Plan:
1. Update unit tests to fail without this change
2. make crast_test -j
Branch with unit test and no fix https://github.com/facebook/rocksdb/pull/9942 to keep track of unit test (without fix)
Reviewed By: riversand963
Differential Revision: D36043701
Pulled By: akankshamahajan15
fbshipit-source-id: 5760970db0a0920fb73d3c054a4155733500acd9
2022-06-01 17:52:26 +00:00
|
|
|
// recovery_ctx stores the context about version edits and all those
|
|
|
|
// edits are persisted to new Manifest after successfully syncing the new WAL.
|
2019-03-26 23:41:31 +00:00
|
|
|
virtual Status Recover(
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
2020-09-17 22:39:25 +00:00
|
|
|
bool read_only = false, bool error_if_wal_file_exists = false,
|
|
|
|
bool error_if_data_exists_in_wals = false,
|
Persist the new MANIFEST after successfully syncing the new WAL during recovery (#9922)
Summary:
In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't
flush the data from WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail.
As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL.
If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
If future recovery starts from the old MANIFEST, it means the writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9922
Test Plan:
1. Update unit tests to fail without this change
2. make crast_test -j
Branch with unit test and no fix https://github.com/facebook/rocksdb/pull/9942 to keep track of unit test (without fix)
Reviewed By: riversand963
Differential Revision: D36043701
Pulled By: akankshamahajan15
fbshipit-source-id: 5760970db0a0920fb73d3c054a4155733500acd9
2022-06-01 17:52:26 +00:00
|
|
|
uint64_t* recovered_seq = nullptr,
|
|
|
|
RecoveryContext* recovery_ctx = nullptr);
|
2019-03-26 23:41:31 +00:00
|
|
|
|
2019-12-03 01:43:37 +00:00
|
|
|
virtual bool OwnTablesAndLogs() const { return true; }
|
|
|
|
|
2022-06-15 22:39:49 +00:00
|
|
|
// Setup DB identity file, and write DB ID to manifest if necessary.
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
Status SetupDBId(const WriteOptions& write_options, bool read_only,
|
|
|
|
RecoveryContext* recovery_ctx);
|
2022-06-15 22:39:49 +00:00
|
|
|
// Assign db_id_ and write DB ID to manifest if necessary.
|
|
|
|
void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx);
|
Fix a recovery corner case (#7621)
Summary:
Consider the following sequence of events:
1. Db flushed an SST with file number N, appended to MANIFEST, and tried to sync the MANIFEST.
2. Syncing MANIFEST failed and db crashed.
3. Db tried to recover with this MANIFEST. In the meantime, no entry about the newly-flushed SST was found in the MANIFEST. Therefore, RocksDB replayed WAL and tried to flush to an SST file reusing the same file number N. This failed because file system does not support overwrite. Then Db deleted this file.
4. Db crashed again.
5. Db tried to recover. When db read the MANIFEST, there was an entry referencing N.sst. This could happen probably because the append in step 1 finally reached the MANIFEST and became visible. Since N.sst had been deleted in step 3, recovery failed.
It is possible that N.sst created in step 1 is valid. Although step 3 would still fail since the MANIFEST was not synced properly in step 1 and 2, deleting N.sst would make it impossible for the db to recover even if the remaining part of MANIFEST was appended and visible after step 5.
After this PR, in step 3, immediately after recovering from MANIFEST, a new MANIFEST is created, then we find that N.sst is not referenced in the MANIFEST, so we delete it, and we'll not reuse N as file number. Then in step 5, since the new MANIFEST does not contain N.sst, the recovery failure situation in step 5 won't happen.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7621
Test Plan:
1. some tests are updated, because these tests assume that new MANIFEST is created after WAL recovery.
2. a new unit test is added in db_basic_test to simulate step 3.
Reviewed By: riversand963
Differential Revision: D24668144
Pulled By: cheng-chang
fbshipit-source-id: 90d7487fbad2bc3714f5ede46ea949895b15ae3b
2020-11-08 05:54:55 +00:00
|
|
|
|
2020-03-21 02:17:54 +00:00
|
|
|
// REQUIRES: db mutex held when calling this function, but the db mutex can
|
|
|
|
// be released and re-acquired. Db mutex will be held when the function
|
|
|
|
// returns.
|
Fix a recovery corner case (#7621)
Summary:
Consider the following sequence of events:
1. Db flushed an SST with file number N, appended to MANIFEST, and tried to sync the MANIFEST.
2. Syncing MANIFEST failed and db crashed.
3. Db tried to recover with this MANIFEST. In the meantime, no entry about the newly-flushed SST was found in the MANIFEST. Therefore, RocksDB replayed WAL and tried to flush to an SST file reusing the same file number N. This failed because file system does not support overwrite. Then Db deleted this file.
4. Db crashed again.
5. Db tried to recover. When db read the MANIFEST, there was an entry referencing N.sst. This could happen probably because the append in step 1 finally reached the MANIFEST and became visible. Since N.sst had been deleted in step 3, recovery failed.
It is possible that N.sst created in step 1 is valid. Although step 3 would still fail since the MANIFEST was not synced properly in step 1 and 2, deleting N.sst would make it impossible for the db to recover even if the remaining part of MANIFEST was appended and visible after step 5.
After this PR, in step 3, immediately after recovering from MANIFEST, a new MANIFEST is created, then we find that N.sst is not referenced in the MANIFEST, so we delete it, and we'll not reuse N as file number. Then in step 5, since the new MANIFEST does not contain N.sst, the recovery failure situation in step 5 won't happen.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7621
Test Plan:
1. some tests are updated, because these tests assume that new MANIFEST is created after WAL recovery.
2. a new unit test is added in db_basic_test to simulate step 3.
Reviewed By: riversand963
Differential Revision: D24668144
Pulled By: cheng-chang
fbshipit-source-id: 90d7487fbad2bc3714f5ede46ea949895b15ae3b
2020-11-08 05:54:55 +00:00
|
|
|
// After recovery, there may be SST files in db/cf paths that are
|
|
|
|
// not referenced in the MANIFEST (e.g.
|
|
|
|
// 1. It's best effort recovery;
|
|
|
|
// 2. The VersionEdits referencing the SST files are appended to
|
Persist the new MANIFEST after successfully syncing the new WAL during recovery (#9922)
Summary:
In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't
flush the data from WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail.
As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL.
If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
If future recovery starts from the old MANIFEST, it means the writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9922
Test Plan:
1. Update unit tests to fail without this change
2. make crast_test -j
Branch with unit test and no fix https://github.com/facebook/rocksdb/pull/9942 to keep track of unit test (without fix)
Reviewed By: riversand963
Differential Revision: D36043701
Pulled By: akankshamahajan15
fbshipit-source-id: 5760970db0a0920fb73d3c054a4155733500acd9
2022-06-01 17:52:26 +00:00
|
|
|
// RecoveryContext, DB crashes when syncing the MANIFEST, the VersionEdits are
|
Fix a recovery corner case (#7621)
Summary:
Consider the following sequence of events:
1. Db flushed an SST with file number N, appended to MANIFEST, and tried to sync the MANIFEST.
2. Syncing MANIFEST failed and db crashed.
3. Db tried to recover with this MANIFEST. In the meantime, no entry about the newly-flushed SST was found in the MANIFEST. Therefore, RocksDB replayed WAL and tried to flush to an SST file reusing the same file number N. This failed because file system does not support overwrite. Then Db deleted this file.
4. Db crashed again.
5. Db tried to recover. When db read the MANIFEST, there was an entry referencing N.sst. This could happen probably because the append in step 1 finally reached the MANIFEST and became visible. Since N.sst had been deleted in step 3, recovery failed.
It is possible that N.sst created in step 1 is valid. Although step 3 would still fail since the MANIFEST was not synced properly in step 1 and 2, deleting N.sst would make it impossible for the db to recover even if the remaining part of MANIFEST was appended and visible after step 5.
After this PR, in step 3, immediately after recovering from MANIFEST, a new MANIFEST is created, then we find that N.sst is not referenced in the MANIFEST, so we delete it, and we'll not reuse N as file number. Then in step 5, since the new MANIFEST does not contain N.sst, the recovery failure situation in step 5 won't happen.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7621
Test Plan:
1. some tests are updated, because these tests assume that new MANIFEST is created after WAL recovery.
2. a new unit test is added in db_basic_test to simulate step 3.
Reviewed By: riversand963
Differential Revision: D24668144
Pulled By: cheng-chang
fbshipit-source-id: 90d7487fbad2bc3714f5ede46ea949895b15ae3b
2020-11-08 05:54:55 +00:00
|
|
|
// still not synced to MANIFEST during recovery.)
|
Persist the new MANIFEST after successfully syncing the new WAL during recovery (#9922)
Summary:
In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't
flush the data from WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail.
As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL.
If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
If future recovery starts from the old MANIFEST, it means the writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9922
Test Plan:
1. Update unit tests to fail without this change
2. make crast_test -j
Branch with unit test and no fix https://github.com/facebook/rocksdb/pull/9942 to keep track of unit test (without fix)
Reviewed By: riversand963
Differential Revision: D36043701
Pulled By: akankshamahajan15
fbshipit-source-id: 5760970db0a0920fb73d3c054a4155733500acd9
2022-06-01 17:52:26 +00:00
|
|
|
// It stores the SST files to be deleted in RecoveryContext. In the
|
2020-03-21 02:17:54 +00:00
|
|
|
// meantime, we find out the largest file number present in the paths, and
|
|
|
|
// bump up the version set's next_file_number_ to be 1 + largest_file_number.
|
Persist the new MANIFEST after successfully syncing the new WAL during recovery (#9922)
Summary:
In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't
flush the data from WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail.
As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL.
If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
If future recovery starts from the old MANIFEST, it means the writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9922
Test Plan:
1. Update unit tests to fail without this change
2. make crast_test -j
Branch with unit test and no fix https://github.com/facebook/rocksdb/pull/9942 to keep track of unit test (without fix)
Reviewed By: riversand963
Differential Revision: D36043701
Pulled By: akankshamahajan15
fbshipit-source-id: 5760970db0a0920fb73d3c054a4155733500acd9
2022-06-01 17:52:26 +00:00
|
|
|
// recovery_ctx stores the context about version edits and files to be
|
|
|
|
// deleted. All those edits are persisted to new Manifest after successfully
|
|
|
|
// syncing the new WAL.
|
|
|
|
Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx);
|
2020-03-21 02:17:54 +00:00
|
|
|
|
2020-06-15 17:45:03 +00:00
|
|
|
// SetDbSessionId() should be called in the constuctor DBImpl()
|
|
|
|
// to ensure that db_session_id_ gets updated every time the DB is opened
|
|
|
|
void SetDbSessionId();
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status FailIfCfHasTs(const ColumnFamilyHandle* column_family) const;
|
2023-09-13 23:34:18 +00:00
|
|
|
Status FailIfTsMismatchCf(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& ts) const;
|
|
|
|
|
|
|
|
// Check that the read timestamp `ts` is at or above the `full_history_ts_low`
|
|
|
|
// timestamp in a `SuperVersion`. It's necessary to do this check after
|
|
|
|
// grabbing the SuperVersion. If the check passed, the referenced SuperVersion
|
|
|
|
// this read holds on to can ensure the read won't be affected if
|
|
|
|
// `full_history_ts_low` is increased concurrently, and this achieves that
|
|
|
|
// without explicitly locking by piggybacking the SuperVersion.
|
|
|
|
Status FailIfReadCollapsedHistory(const ColumnFamilyData* cfd,
|
|
|
|
const SuperVersion* sv,
|
|
|
|
const Slice& ts) const;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
|
Persist the new MANIFEST after successfully syncing the new WAL during recovery (#9922)
Summary:
In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't
flush the data from WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail.
As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL.
If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
If future recovery starts from the old MANIFEST, it means the writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9922
Test Plan:
1. Update unit tests to fail without this change
2. make crast_test -j
Branch with unit test and no fix https://github.com/facebook/rocksdb/pull/9942 to keep track of unit test (without fix)
Reviewed By: riversand963
Differential Revision: D36043701
Pulled By: akankshamahajan15
fbshipit-source-id: 5760970db0a0920fb73d3c054a4155733500acd9
2022-06-01 17:52:26 +00:00
|
|
|
// recovery_ctx stores the context about version edits and
|
|
|
|
// LogAndApplyForRecovery persist all those edits to new Manifest after
|
|
|
|
// successfully syncing new WAL.
|
|
|
|
// LogAndApplyForRecovery should be called only once during recovery and it
|
|
|
|
// should be called when RocksDB writes to a first new MANIFEST since this
|
|
|
|
// recovery.
|
|
|
|
Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx);
|
|
|
|
|
2022-06-21 21:51:56 +00:00
|
|
|
void InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
|
|
|
|
|
|
|
|
// Return true to proceed with current WAL record whose content is stored in
|
|
|
|
// `batch`. Return false to skip current WAL record.
|
|
|
|
bool InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
|
|
|
|
const std::string& wal_fname,
|
|
|
|
log::Reader::Reporter& reporter,
|
|
|
|
Status& status, bool& stop_replay,
|
|
|
|
WriteBatch& batch);
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
private:
|
|
|
|
friend class DB;
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
friend class ErrorHandler;
|
2014-03-27 18:59:37 +00:00
|
|
|
friend class InternalStats;
|
2017-08-07 23:07:40 +00:00
|
|
|
friend class PessimisticTransaction;
|
2018-04-03 03:19:21 +00:00
|
|
|
friend class TransactionBaseImpl;
|
2017-08-07 23:07:40 +00:00
|
|
|
friend class WriteCommittedTxn;
|
|
|
|
friend class WritePreparedTxn;
|
2017-09-28 23:43:04 +00:00
|
|
|
friend class WritePreparedTxnDB;
|
2017-09-11 15:58:52 +00:00
|
|
|
friend class WriteBatchWithIndex;
|
2018-07-07 00:17:36 +00:00
|
|
|
friend class WriteUnpreparedTxnDB;
|
2018-07-24 07:09:18 +00:00
|
|
|
friend class WriteUnpreparedTxn;
|
|
|
|
|
2014-05-30 21:31:55 +00:00
|
|
|
friend class ForwardIterator;
|
2014-03-09 05:12:13 +00:00
|
|
|
friend struct SuperVersion;
|
2014-09-25 18:14:01 +00:00
|
|
|
friend class CompactedDBImpl;
|
Delay bottommost level single file compactions (#11701)
Summary:
For leveled compaction, RocksDB has a special kind of compaction with reason "kBottommmostFiles" that compacts bottommost level files to clear data held by snapshots (more detail in https://github.com/facebook/rocksdb/issues/3009). Such compactions can happen soon after a relevant snapshot is released. For some use cases, a bottommost file may contain only a small amount of keys that can be cleared, so compacting such a file has a high write amp. In addition, these bottommost files may be compacted in compactions with reason other than "kBottommmostFiles" if we wait for some time (so that enough data is ingested to trigger such a compaction). This PR introduces an option `bottommost_file_compaction_delay` to specify the delay of these bottommost level single file compactions.
* The main change is in `VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction()` where we only add a file to `bottommost_files_marked_for_compaction_` if it oldest_snapshot is larger than its non-zero largest_seqno **and** the file is old enough. Note that if a file is not old enough but its largest_seqno is less than oldest_snapshot, we exclude it from the calculation of `bottommost_files_mark_threshold_`. This makes the change simpler, but such a file's eligibility for compaction will only be checked the next time `ComputeBottommostFilesMarkedForCompaction()` is called. This happens when a new Version is created (compaction, flush, SetOptions()...), a new enough snapshot is released (`VersionStorageInfo::UpdateOldestSnapshot()`) or when a compaction is picked and compaction score has to be re-calculated.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11701
Test Plan:
* Add two unit tests to test when bottommost_file_compaction_delay > 0.
* Ran crash test with the new option.
Reviewed By: jaykorean, ajkr
Differential Revision: D48331564
Pulled By: cbi42
fbshipit-source-id: c584f3dc5f6354fce3ed65f4c6366dc450b15ba8
2023-08-17 00:45:44 +00:00
|
|
|
#ifndef NDEBUG
|
2018-03-26 23:16:59 +00:00
|
|
|
friend class DBTest_ConcurrentFlushWAL_Test;
|
2018-10-10 05:50:59 +00:00
|
|
|
friend class DBTest_MixedSlowdownOptionsStop_Test;
|
2019-03-26 02:14:04 +00:00
|
|
|
friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
|
2019-06-04 05:37:40 +00:00
|
|
|
friend class DBCompactionTest_CompactionDuringShutdown_Test;
|
Delay bottommost level single file compactions (#11701)
Summary:
For leveled compaction, RocksDB has a special kind of compaction with reason "kBottommmostFiles" that compacts bottommost level files to clear data held by snapshots (more detail in https://github.com/facebook/rocksdb/issues/3009). Such compactions can happen soon after a relevant snapshot is released. For some use cases, a bottommost file may contain only a small amount of keys that can be cleared, so compacting such a file has a high write amp. In addition, these bottommost files may be compacted in compactions with reason other than "kBottommmostFiles" if we wait for some time (so that enough data is ingested to trigger such a compaction). This PR introduces an option `bottommost_file_compaction_delay` to specify the delay of these bottommost level single file compactions.
* The main change is in `VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction()` where we only add a file to `bottommost_files_marked_for_compaction_` if it oldest_snapshot is larger than its non-zero largest_seqno **and** the file is old enough. Note that if a file is not old enough but its largest_seqno is less than oldest_snapshot, we exclude it from the calculation of `bottommost_files_mark_threshold_`. This makes the change simpler, but such a file's eligibility for compaction will only be checked the next time `ComputeBottommostFilesMarkedForCompaction()` is called. This happens when a new Version is created (compaction, flush, SetOptions()...), a new enough snapshot is released (`VersionStorageInfo::UpdateOldestSnapshot()`) or when a compaction is picked and compaction score has to be re-calculated.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11701
Test Plan:
* Add two unit tests to test when bottommost_file_compaction_delay > 0.
* Ran crash test with the new option.
Reviewed By: jaykorean, ajkr
Differential Revision: D48331564
Pulled By: cbi42
fbshipit-source-id: c584f3dc5f6354fce3ed65f4c6366dc450b15ba8
2023-08-17 00:45:44 +00:00
|
|
|
friend class DBCompactionTest_DelayCompactBottomLevelFilesWithDeletions_Test;
|
|
|
|
friend class DBCompactionTest_DisableCompactBottomLevelFiles_Test;
|
2019-06-17 22:17:43 +00:00
|
|
|
friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test;
|
2017-10-03 16:08:07 +00:00
|
|
|
friend class DBTest2_ReadCallbackTest_Test;
|
2020-06-30 19:29:01 +00:00
|
|
|
friend class WriteCallbackPTest_WriteWithCallbackTest_Test;
|
2015-05-29 21:36:35 +00:00
|
|
|
friend class XFTransactionWriteHandler;
|
2017-10-03 16:08:07 +00:00
|
|
|
friend class DBBlobIndexTest;
|
2018-07-24 07:09:18 +00:00
|
|
|
friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
|
2015-05-29 21:36:35 +00:00
|
|
|
#endif
|
2019-05-31 04:29:44 +00:00
|
|
|
|
2012-03-09 00:23:21 +00:00
|
|
|
struct CompactionState;
|
2019-05-31 04:29:44 +00:00
|
|
|
struct PrepickedCompaction;
|
|
|
|
struct PurgeFileInfo;
|
2014-09-05 22:20:05 +00:00
|
|
|
|
2017-04-04 17:19:33 +00:00
|
|
|
struct WriteContext {
|
2017-10-06 01:00:38 +00:00
|
|
|
SuperVersionContext superversion_context;
|
2017-04-04 17:19:33 +00:00
|
|
|
autovector<MemTable*> memtables_to_free_;
|
|
|
|
|
2017-10-06 01:00:38 +00:00
|
|
|
explicit WriteContext(bool create_superversion = false)
|
|
|
|
: superversion_context(create_superversion) {}
|
|
|
|
|
2017-04-04 17:19:33 +00:00
|
|
|
~WriteContext() {
|
2017-10-06 01:00:38 +00:00
|
|
|
superversion_context.Clean();
|
2017-04-04 17:19:33 +00:00
|
|
|
for (auto& m : memtables_to_free_) {
|
|
|
|
delete m;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2019-05-31 04:29:44 +00:00
|
|
|
struct LogFileNumberSize {
|
|
|
|
explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
|
2021-03-28 16:58:42 +00:00
|
|
|
LogFileNumberSize() {}
|
2019-05-31 04:29:44 +00:00
|
|
|
void AddSize(uint64_t new_size) { size += new_size; }
|
|
|
|
uint64_t number;
|
|
|
|
uint64_t size = 0;
|
|
|
|
bool getting_flushed = false;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct LogWriterNumber {
|
|
|
|
// pass ownership of _writer
|
|
|
|
LogWriterNumber(uint64_t _number, log::Writer* _writer)
|
|
|
|
: number(_number), writer(_writer) {}
|
|
|
|
|
|
|
|
log::Writer* ReleaseWriter() {
|
|
|
|
auto* w = writer;
|
|
|
|
writer = nullptr;
|
|
|
|
return w;
|
|
|
|
}
|
|
|
|
Status ClearWriter() {
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
|
|
Status s = writer->WriteBuffer(WriteOptions());
|
2019-05-31 04:29:44 +00:00
|
|
|
delete writer;
|
|
|
|
writer = nullptr;
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2022-06-17 23:45:28 +00:00
|
|
|
bool IsSyncing() { return getting_synced; }
|
|
|
|
|
|
|
|
uint64_t GetPreSyncSize() {
|
|
|
|
assert(getting_synced);
|
|
|
|
return pre_sync_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
void PrepareForSync() {
|
|
|
|
assert(!getting_synced);
|
|
|
|
// Size is expected to be monotonically increasing.
|
|
|
|
assert(writer->file()->GetFlushedSize() >= pre_sync_size);
|
|
|
|
getting_synced = true;
|
|
|
|
pre_sync_size = writer->file()->GetFlushedSize();
|
|
|
|
}
|
|
|
|
|
|
|
|
void FinishSync() {
|
|
|
|
assert(getting_synced);
|
|
|
|
getting_synced = false;
|
|
|
|
}
|
|
|
|
|
2019-05-31 04:29:44 +00:00
|
|
|
uint64_t number;
|
|
|
|
// Visual Studio doesn't support deque's member to be noncopyable because
|
|
|
|
// of a std::unique_ptr as a member.
|
|
|
|
log::Writer* writer; // own
|
2022-06-17 23:45:28 +00:00
|
|
|
|
|
|
|
private:
|
2019-05-31 04:29:44 +00:00
|
|
|
// true for some prefix of logs_
|
|
|
|
bool getting_synced = false;
|
2022-06-17 23:45:28 +00:00
|
|
|
// The size of the file before the sync happens. This amount is guaranteed
|
|
|
|
// to be persisted even if appends happen during sync so it can be used for
|
|
|
|
// tracking the synced size in MANIFEST.
|
|
|
|
uint64_t pre_sync_size = 0;
|
2019-05-31 04:29:44 +00:00
|
|
|
};
|
|
|
|
|
2022-07-21 20:35:36 +00:00
|
|
|
struct LogContext {
|
|
|
|
explicit LogContext(bool need_sync = false)
|
|
|
|
: need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
|
|
|
|
bool need_log_sync = false;
|
|
|
|
bool need_log_dir_sync = false;
|
|
|
|
log::Writer* writer = nullptr;
|
|
|
|
LogFileNumberSize* log_file_number_size = nullptr;
|
|
|
|
};
|
|
|
|
|
2019-05-31 04:29:44 +00:00
|
|
|
// PurgeFileInfo is a structure to hold information of files to be deleted in
|
2019-12-18 04:07:21 +00:00
|
|
|
// purge_files_
|
2019-05-31 04:29:44 +00:00
|
|
|
struct PurgeFileInfo {
|
|
|
|
std::string fname;
|
|
|
|
std::string dir_to_sync;
|
|
|
|
FileType type;
|
|
|
|
uint64_t number;
|
|
|
|
int job_id;
|
|
|
|
PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
|
|
|
|
int jid)
|
|
|
|
: fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Argument required by background flush thread.
|
|
|
|
struct BGFlushArg {
|
|
|
|
BGFlushArg()
|
2023-01-24 17:54:04 +00:00
|
|
|
: cfd_(nullptr),
|
|
|
|
max_memtable_id_(0),
|
|
|
|
superversion_context_(nullptr),
|
|
|
|
flush_reason_(FlushReason::kOthers) {}
|
2019-05-31 04:29:44 +00:00
|
|
|
BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
|
2023-01-24 17:54:04 +00:00
|
|
|
SuperVersionContext* superversion_context,
|
|
|
|
FlushReason flush_reason)
|
2019-05-31 04:29:44 +00:00
|
|
|
: cfd_(cfd),
|
|
|
|
max_memtable_id_(max_memtable_id),
|
2023-01-24 17:54:04 +00:00
|
|
|
superversion_context_(superversion_context),
|
|
|
|
flush_reason_(flush_reason) {}
|
2019-05-31 04:29:44 +00:00
|
|
|
|
|
|
|
// Column family to flush.
|
|
|
|
ColumnFamilyData* cfd_;
|
|
|
|
// Maximum ID of memtable to flush. In this column family, memtables with
|
|
|
|
// IDs smaller than this value must be flushed before this flush completes.
|
|
|
|
uint64_t max_memtable_id_;
|
|
|
|
// Pointer to a SuperVersionContext object. After flush completes, RocksDB
|
|
|
|
// installs a new superversion for the column family. This operation
|
|
|
|
// requires a SuperVersionContext object (currently embedded in JobContext).
|
|
|
|
SuperVersionContext* superversion_context_;
|
2023-01-24 17:54:04 +00:00
|
|
|
FlushReason flush_reason_;
|
2019-05-31 04:29:44 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// Argument passed to flush thread.
|
|
|
|
struct FlushThreadArg {
|
|
|
|
DBImpl* db_;
|
|
|
|
|
|
|
|
Env::Priority thread_pri_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Information for a manual compaction
|
|
|
|
struct ManualCompactionState {
|
2022-03-13 04:07:04 +00:00
|
|
|
ManualCompactionState(ColumnFamilyData* _cfd, int _input_level,
|
|
|
|
int _output_level, uint32_t _output_path_id,
|
|
|
|
bool _exclusive, bool _disallow_trivial_move,
|
|
|
|
std::atomic<bool>* _canceled)
|
|
|
|
: cfd(_cfd),
|
|
|
|
input_level(_input_level),
|
|
|
|
output_level(_output_level),
|
|
|
|
output_path_id(_output_path_id),
|
|
|
|
exclusive(_exclusive),
|
|
|
|
disallow_trivial_move(_disallow_trivial_move),
|
2022-06-07 01:32:26 +00:00
|
|
|
canceled(_canceled ? *_canceled : canceled_internal_storage) {}
|
|
|
|
// When _canceled is not provided by ther user, we assign the reference of
|
|
|
|
// canceled_internal_storage to it to consolidate canceled and
|
|
|
|
// manual_compaction_paused since DisableManualCompaction() might be
|
|
|
|
// called
|
2022-03-13 04:07:04 +00:00
|
|
|
|
2019-05-31 04:29:44 +00:00
|
|
|
ColumnFamilyData* cfd;
|
|
|
|
int input_level;
|
|
|
|
int output_level;
|
|
|
|
uint32_t output_path_id;
|
|
|
|
Status status;
|
2022-03-13 04:07:04 +00:00
|
|
|
bool done = false;
|
|
|
|
bool in_progress = false; // compaction request being processed?
|
|
|
|
bool incomplete = false; // only part of requested range compacted
|
|
|
|
bool exclusive; // current behavior of only one manual
|
|
|
|
bool disallow_trivial_move; // Force actual compaction to run
|
|
|
|
const InternalKey* begin = nullptr; // nullptr means beginning of key range
|
|
|
|
const InternalKey* end = nullptr; // nullptr means end of key range
|
|
|
|
InternalKey* manual_end = nullptr; // how far we are compacting
|
2021-06-07 18:40:31 +00:00
|
|
|
InternalKey tmp_storage; // Used to keep track of compaction progress
|
|
|
|
InternalKey tmp_storage1; // Used to keep track of compaction progress
|
2022-06-07 01:32:26 +00:00
|
|
|
|
|
|
|
// When the user provides a canceled pointer in CompactRangeOptions, the
|
|
|
|
// above varaibe is the reference of the user-provided
|
|
|
|
// `canceled`, otherwise, it is the reference of canceled_internal_storage
|
|
|
|
std::atomic<bool> canceled_internal_storage = false;
|
|
|
|
std::atomic<bool>& canceled; // Compaction canceled pointer reference
|
2019-05-31 04:29:44 +00:00
|
|
|
};
|
|
|
|
struct PrepickedCompaction {
|
|
|
|
// background compaction takes ownership of `compaction`.
|
|
|
|
Compaction* compaction;
|
|
|
|
// caller retains ownership of `manual_compaction_state` as it is reused
|
|
|
|
// across background compactions.
|
2022-03-15 19:31:14 +00:00
|
|
|
ManualCompactionState* manual_compaction_state; // nullptr if non-manual
|
2019-05-31 04:29:44 +00:00
|
|
|
// task limiter token is requested during compaction picking.
|
|
|
|
std::unique_ptr<TaskLimiterToken> task_token;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct CompactionArg {
|
|
|
|
// caller retains ownership of `db`.
|
|
|
|
DBImpl* db;
|
|
|
|
// background compaction takes ownership of `prepicked_compaction`.
|
|
|
|
PrepickedCompaction* prepicked_compaction;
|
Fix possible hang issue in ~DBImpl() when flush is scheduled in LOW pool (#8125)
Summary:
In DBImpl::CloseHelper, we wait for bg_compaction_scheduled_
and bg_flush_scheduled_ to drop to 0. Unschedule is called prior
to cancel any unscheduled flushes/compactions. It is assumed that
anything in the high priority is a flush, and anything in the low
priority pool is a compaction. This assumption, however, is broken when
the high-pri pool is full.
As a result, bg_compaction_scheduled_ can go < 0 and bg_flush_scheduled_
will remain > 0 and DB can be in hang state.
The fix is, we decrement the `bg_{flush,compaction,bottom_compaction}_scheduled_`
inside the `Unschedule{Flush,Compaction,BottomCompaction}Callback()`s. DB
`mutex_` will make the counts atomic in `Unschedule`.
Related discussion: https://github.com/facebook/rocksdb/issues/7928
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8125
Test Plan: Added new test case which hangs without the fix.
Reviewed By: jay-zhuang
Differential Revision: D27390043
Pulled By: ajkr
fbshipit-source-id: 78a367fba9a59ac5607ad24bd1c46dc16d5ec110
2021-03-31 01:34:11 +00:00
|
|
|
Env::Priority compaction_pri_;
|
2019-05-31 04:29:44 +00:00
|
|
|
};
|
2016-06-22 01:41:23 +00:00
|
|
|
|
2019-06-17 22:17:43 +00:00
|
|
|
// Initialize the built-in column family for persistent stats. Depending on
|
|
|
|
// whether on-disk persistent stats have been enabled before, it may either
|
|
|
|
// create a new column family and column family handle or just a column family
|
|
|
|
// handle.
|
|
|
|
// Required: DB mutex held
|
|
|
|
Status InitPersistStatsColumnFamily();
|
|
|
|
|
|
|
|
// Persistent Stats column family has two format version key which are used
|
|
|
|
// for compatibility check. Write format version if it's created for the
|
|
|
|
// first time, read format version and check compatibility if recovering
|
|
|
|
// from disk. This function requires DB mutex held at entrance but may
|
|
|
|
// release and re-acquire DB mutex in the process.
|
|
|
|
// Required: DB mutex held
|
|
|
|
Status PersistentStatsProcessFormatVersion();
|
|
|
|
|
2020-09-18 03:22:35 +00:00
|
|
|
Status ResumeImpl(DBRecoverContext context);
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
void MaybeIgnoreError(Status* s) const;
|
|
|
|
|
2012-11-26 21:56:45 +00:00
|
|
|
const Status CreateArchivalDirectory();
|
|
|
|
|
2023-10-04 21:14:22 +00:00
|
|
|
// Create a column family, without some of the follow-up work yet
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
Status CreateColumnFamilyImpl(const ReadOptions& read_options,
|
|
|
|
const WriteOptions& write_options,
|
|
|
|
const ColumnFamilyOptions& cf_options,
|
2017-05-08 05:12:55 +00:00
|
|
|
const std::string& cf_name,
|
|
|
|
ColumnFamilyHandle** handle);
|
|
|
|
|
2023-10-04 21:14:22 +00:00
|
|
|
// Follow-up work to user creating a column family or (families)
|
|
|
|
Status WrapUpCreateColumnFamilies(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions& read_options, const WriteOptions& write_options,
|
2023-10-04 21:14:22 +00:00
|
|
|
const std::vector<const ColumnFamilyOptions*>& cf_options);
|
|
|
|
|
2017-05-08 05:12:55 +00:00
|
|
|
Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Delete any unneeded files and stale in-memory entries.
|
|
|
|
void DeleteObsoleteFiles();
|
2016-06-22 01:41:23 +00:00
|
|
|
// Delete obsolete files and log status and information of file deletion
|
2017-09-07 21:11:15 +00:00
|
|
|
void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
|
2018-04-26 20:51:39 +00:00
|
|
|
const std::string& path_to_sync, FileType type,
|
|
|
|
uint64_t number);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2014-11-07 19:50:34 +00:00
|
|
|
// Background process needs to call
|
|
|
|
// auto x = CaptureCurrentFileNumberInPendingOutputs()
|
2016-03-04 02:25:07 +00:00
|
|
|
// auto file_num = versions_->NewFileNumber();
|
2014-11-07 19:50:34 +00:00
|
|
|
// <do something>
|
|
|
|
// ReleaseFileNumberFromPendingOutputs(x)
|
2016-03-04 02:25:07 +00:00
|
|
|
// This will protect any file with number `file_num` or greater from being
|
|
|
|
// deleted while <do something> is running.
|
2014-11-07 19:50:34 +00:00
|
|
|
// -----------
|
|
|
|
// This function will capture current file number and append it to
|
|
|
|
// pending_outputs_. This will prevent any background process to delete any
|
|
|
|
// file created after this point.
|
|
|
|
std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
|
|
|
|
// This function should be called with the result of
|
|
|
|
// CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
|
|
|
|
// created between the calls CaptureCurrentFileNumberInPendingOutputs() and
|
|
|
|
// ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
|
|
|
|
// and blocked by any other pending_outputs_ calls)
|
2019-10-08 21:18:48 +00:00
|
|
|
void ReleaseFileNumberFromPendingOutputs(
|
|
|
|
std::unique_ptr<std::list<uint64_t>::iterator>& v);
|
2014-11-07 19:50:34 +00:00
|
|
|
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
IOStatus SyncClosedLogs(const WriteOptions& write_options,
|
|
|
|
JobContext* job_context, VersionEdit* synced_wals,
|
2023-10-12 23:55:25 +00:00
|
|
|
bool error_recovery_in_prog);
|
2016-07-19 22:12:46 +00:00
|
|
|
|
2013-10-14 22:12:15 +00:00
|
|
|
// Flush the in-memory write buffer to storage. Switches to a new
|
2018-08-24 20:17:29 +00:00
|
|
|
// log-file/memtable and writes a new descriptor iff successful. Then
|
|
|
|
// installs a new super version for the column family.
|
2019-01-31 19:53:29 +00:00
|
|
|
Status FlushMemTableToOutputFile(
|
|
|
|
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
|
2023-01-24 17:54:04 +00:00
|
|
|
bool* madeProgress, JobContext* job_context, FlushReason flush_reason,
|
2019-01-31 19:53:29 +00:00
|
|
|
SuperVersionContext* superversion_context,
|
|
|
|
std::vector<SequenceNumber>& snapshot_seqs,
|
|
|
|
SequenceNumber earliest_write_conflict_snapshot,
|
2019-03-20 00:24:09 +00:00
|
|
|
SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
|
|
|
|
Env::Priority thread_pri);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2018-08-24 20:17:29 +00:00
|
|
|
// Flush the memtables of (multiple) column families to multiple files on
|
|
|
|
// persistent storage.
|
|
|
|
Status FlushMemTablesToOutputFiles(
|
|
|
|
const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
|
2019-03-20 00:24:09 +00:00
|
|
|
JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
|
2018-08-24 20:17:29 +00:00
|
|
|
|
2018-10-16 02:59:20 +00:00
|
|
|
Status AtomicFlushMemTablesToOutputFiles(
|
|
|
|
const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
|
2019-03-20 00:24:09 +00:00
|
|
|
JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
|
2018-10-16 02:59:20 +00:00
|
|
|
|
2014-09-09 18:18:50 +00:00
|
|
|
// REQUIRES: log_numbers are sorted in ascending order
|
2020-01-29 19:39:11 +00:00
|
|
|
// corrupted_log_found is set to true if we recover from a corrupted log file.
|
2022-04-26 21:46:53 +00:00
|
|
|
Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
|
2020-01-29 19:39:11 +00:00
|
|
|
SequenceNumber* next_sequence, bool read_only,
|
Persist the new MANIFEST after successfully syncing the new WAL during recovery (#9922)
Summary:
In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't
flush the data from WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail.
As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL.
If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
If future recovery starts from the old MANIFEST, it means the writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9922
Test Plan:
1. Update unit tests to fail without this change
2. make crast_test -j
Branch with unit test and no fix https://github.com/facebook/rocksdb/pull/9942 to keep track of unit test (without fix)
Reviewed By: riversand963
Differential Revision: D36043701
Pulled By: akankshamahajan15
fbshipit-source-id: 5760970db0a0920fb73d3c054a4155733500acd9
2022-06-01 17:52:26 +00:00
|
|
|
bool* corrupted_log_found,
|
|
|
|
RecoveryContext* recovery_ctx);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2012-10-19 21:00:53 +00:00
|
|
|
// The following two methods are used to flush a memtable to
|
2016-03-25 02:39:13 +00:00
|
|
|
// storage. The first one is used at database RecoveryTime (when the
|
2012-10-19 21:00:53 +00:00
|
|
|
// database is opened) and is heavyweight because it holds the mutex
|
|
|
|
// for the entire period. The second method WriteLevel0Table supports
|
|
|
|
// concurrent flush memtables to storage.
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-27 22:20:02 +00:00
|
|
|
Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
|
|
|
|
MemTable* mem, VersionEdit* edit);
|
2015-05-15 22:52:51 +00:00
|
|
|
|
2021-03-28 16:58:42 +00:00
|
|
|
// Get the size of a log file and, if truncate is true, truncate the
|
|
|
|
// log file to its actual size, thereby freeing preallocated space.
|
|
|
|
// Return success even if truncate fails
|
|
|
|
Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
|
|
|
|
LogFileNumberSize* log);
|
|
|
|
|
2018-09-26 17:34:56 +00:00
|
|
|
// Restore alive_log_files_ and total_log_size_ after recovery.
|
|
|
|
// It needs to run only when there's no flush during recovery
|
|
|
|
// (e.g. avoid_flush_during_recovery=true). May also trigger flush
|
|
|
|
// in case total_log_size > max_total_wal_size.
|
|
|
|
Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
|
|
|
|
|
2015-05-15 22:52:51 +00:00
|
|
|
// num_bytes: for slowdown case, delay time is calculated based on
|
|
|
|
// `num_bytes` going through.
|
Fix DelayWrite() calls for two_write_queues (#11130)
Summary:
PR https://github.com/facebook/rocksdb/issues/11020 fixed a case where it was easy to deadlock the DB with LockWAL() but introduced a bug showing up as a rare assertion failure in the stress test. Specifically, `assert(w->state == STATE_INIT)` in `WriteThread::LinkOne()` called from `BeginWriteStall()`, `DelayWrite()`, `WriteImplWALOnly()`. I haven't been about to generate a unit test that reproduces this failure but I believe the root cause is that DelayWrite() was never meant to be re-entrant, only called from the DB's write_thread_ leader. https://github.com/facebook/rocksdb/issues/11020 introduced a call to DelayWrite() from the nonmem_write_thread_ group leader.
This fix is to make DelayWrite() apply to the specific write queue that it is being called from (inject a dummy write stall entry to the head of the appropriate write queue). WriteController is re-entrant, based on polling and state changes signalled with bg_cv_, so can manage stalling two queues. The only anticipated complication (called out by Andrew in previous PR) is that we don't want timed write delays being injected in parallel for the two queues, because that dimishes the intended throttling effect. Thus, we only allow timed delays for the primary write queue.
HISTORY not updated because this is intended for the same release where the bug was introduced.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11130
Test Plan:
Although I was not able to reproduce the assertion failure, I was able to reproduce a distinct flaw with what I believe is the same root cause: a kind of deadlock if both write queues need to wake up from stopped writes. Only one will be waiting on bg_cv_ (the other waiting in `LinkOne()` for the write queue to open up), so a single SignalAll() will only unblock one of the queues, with the other re-instating the stop until another signal on bg_cv_. A simple unit test is added for this case.
Will also run crash_test_with_multiops_wc_txn for a while looking for issues.
Reviewed By: ajkr
Differential Revision: D42749330
Pulled By: pdillinger
fbshipit-source-id: 4317dd899a93d57c26fd5af7143038f82d4d4d1b
2023-01-25 22:18:27 +00:00
|
|
|
Status DelayWrite(uint64_t num_bytes, WriteThread& write_thread,
|
|
|
|
const WriteOptions& write_options);
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 18:20:25 +00:00
|
|
|
|
2021-04-21 20:53:05 +00:00
|
|
|
// Begin stalling of writes when memory usage increases beyond a certain
|
|
|
|
// threshold.
|
|
|
|
void WriteBufferManagerStallWrites();
|
|
|
|
|
2017-06-05 21:42:34 +00:00
|
|
|
Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
|
|
|
|
WriteBatch* my_batch);
|
|
|
|
|
2020-02-18 21:52:09 +00:00
|
|
|
// REQUIRES: mutex locked and in write thread.
|
2014-09-11 01:46:09 +00:00
|
|
|
Status ScheduleFlushes(WriteContext* context);
|
2014-04-07 18:29:48 +00:00
|
|
|
|
2019-07-01 18:53:25 +00:00
|
|
|
void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);
|
|
|
|
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
Status TrimMemtableHistory(WriteContext* context);
|
|
|
|
|
2018-08-24 20:17:29 +00:00
|
|
|
Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
|
2014-08-12 05:10:32 +00:00
|
|
|
|
Fix bug of prematurely excluded CF in atomic flush contains unflushed data that should've been included in the atomic flush (#11148)
Summary:
**Context:**
Atomic flush should guarantee recoverability of all data of seqno up to the max seqno of the flush. It achieves this by ensuring all such data are flushed by the time this atomic flush finishes through `SelectColumnFamiliesForAtomicFlush()`. However, our crash test exposed the following case where an excluded CF from an atomic flush contains unflushed data of seqno less than the max seqno of that atomic flush and loses its data with `WriteOptions::DisableWAL=true` in face of a crash right after the atomic flush finishes .
```
./db_stress --preserve_unverified_changes=1 --reopen=0 --acquire_snapshot_one_in=0 --adaptive_readahead=1 --allow_data_in_errors=True --async_io=1 --atomic_flush=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=15 --bottommost_compression_type=none --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --checkpoint_one_in=0 --checksum_type=kXXH3 --clear_column_family_one_in=0 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=1 --compaction_ttl=100 --compression_max_dict_buffer_bytes=134217727 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4hc --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --expected_values_dir=$exp --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=none --flush_one_in=0 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=100 --get_property_one_in=0 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=2 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --long_running_snapshots=1 --manual_wal_flush_one_in=100 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=3 --pause_background_one_in=0 --periodic_compaction_seconds=100 --prefix_size=8 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=3600 --progress_reports=0 --read_fault_one_in=32 --readahead_size=16384 --readpercent=50 --recycle_log_file_num=0 --ribbon_starting_level=6 --secondary_cache_fault_one_in=0 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --stats_dump_period_sec=10 --subcompactions=1 --sync=0 --sync_fault_injection=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=0 --unpartitioned_pinning=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 --use_multiget=1 --use_put_entity_one_in=0 --user_timestamp_size=0 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=0 --verify_db_one_in=1000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --wal_compression=none --write_buffer_size=524288 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=30 &
pid=$!
sleep 0.2
sleep 10
kill $pid
sleep 0.2
./db_stress --ops_per_thread=1 --preserve_unverified_changes=1 --reopen=0 --acquire_snapshot_one_in=0 --adaptive_readahead=1 --allow_data_in_errors=True --async_io=1 --atomic_flush=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=15 --bottommost_compression_type=none --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --checkpoint_one_in=0 --checksum_type=kXXH3 --clear_column_family_one_in=0 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=1 --compaction_ttl=100 --compression_max_dict_buffer_bytes=134217727 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4hc --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --expected_values_dir=$exp --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=none --flush_one_in=0 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=100 --get_property_one_in=0 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=2 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --long_running_snapshots=1 --manual_wal_flush_one_in=100 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=3 --pause_background_one_in=0 --periodic_compaction_seconds=100 --prefix_size=8 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=3600 --progress_reports=0 --read_fault_one_in=32 --readahead_size=16384 --readpercent=50 --recycle_log_file_num=0 --ribbon_starting_level=6 --secondary_cache_fault_one_in=0 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --stats_dump_period_sec=10 --subcompactions=1 --sync=0 --sync_fault_injection=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=0 --unpartitioned_pinning=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 --use_multiget=1 --use_put_entity_one_in=0 --user_timestamp_size=0 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=0 --verify_db_one_in=1000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --wal_compression=none --write_buffer_size=524288 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=30 &
pid=$!
sleep 0.2
sleep 40
kill $pid
sleep 0.2
Verification failed for column family 6 key 0000000000000239000000000000012B0000000000000138 (56622): value_from_db: , value_from_expected: 4A6331754E4F4C4D42434041464744455A5B58595E5F5C5D5253505156575455, msg: Value not found: NotFound:
Crash-recovery verification failed :(
No writes or ops?
Verification failed :(
```
The bug is due to the following:
- When atomic flush is used, an empty CF is legally [excluded](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_filesnapshot.cc#L39) in `SelectColumnFamiliesForAtomicFlush` as the first step of `DBImpl::FlushForGetLiveFiles` before [passing](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_filesnapshot.cc#L42) the included CFDs to `AtomicFlushMemTables`.
- But [later](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_impl/db_impl_compaction_flush.cc#L2133) in `AtomicFlushMemTables`, `WaitUntilFlushWouldNotStallWrites` will [release the db mutex](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_impl/db_impl_compaction_flush.cc#L2403), during which data@seqno N can be inserted into the excluded CF and data@seqno M can be inserted into one of the included CFs, where M > N.
- However, data@seqno N in an already-excluded CF is thus excluded from this atomic flush while we seqno N is less than seqno M.
**Summary:**
- Replace `SelectColumnFamiliesForAtomicFlush()`-before-`AtomicFlushMemTables()` with `SelectColumnFamiliesForAtomicFlush()`-after-wait-within-`AtomicFlushMemTables()` so we ensure no write affecting the recoverability of this atomic job (i.e, change to max seqno of this atomic flush or insertion of data with less seqno than the max seqno of the atomic flush to excluded CF) can happen after calling `SelectColumnFamiliesForAtomicFlush()`.
- For above, refactored and clarified comments on `SelectColumnFamiliesForAtomicFlush()` and `AtomicFlushMemTables()` for clearer semantics of passed-in CFDs to atomic-flush
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11148
Test Plan:
- New unit test failed before the fix and passes after
- Make check
- Rehearsal stress test
Reviewed By: ajkr
Differential Revision: D42799871
Pulled By: hx235
fbshipit-source-id: 13636b63e9c25c5895857afc36ea580d57f6d644
2023-03-14 23:53:20 +00:00
|
|
|
// Select and output column families qualified for atomic flush in
|
|
|
|
// `selected_cfds`. If `provided_candidate_cfds` is non-empty, it will be used
|
|
|
|
// as candidate CFs to select qualified ones from. Otherwise, all column
|
|
|
|
// families are used as candidate to select from.
|
|
|
|
//
|
|
|
|
// REQUIRES: mutex held
|
|
|
|
void SelectColumnFamiliesForAtomicFlush(
|
|
|
|
autovector<ColumnFamilyData*>* selected_cfds,
|
|
|
|
const autovector<ColumnFamilyData*>& provided_candidate_cfds = {});
|
2018-10-26 22:06:44 +00:00
|
|
|
|
2012-07-06 18:42:09 +00:00
|
|
|
// Force current memtable contents to be flushed.
|
2016-10-21 00:05:32 +00:00
|
|
|
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
|
2022-10-04 23:43:01 +00:00
|
|
|
FlushReason flush_reason,
|
|
|
|
bool entered_write_thread = false);
|
2012-07-06 18:42:09 +00:00
|
|
|
|
Fix bug of prematurely excluded CF in atomic flush contains unflushed data that should've been included in the atomic flush (#11148)
Summary:
**Context:**
Atomic flush should guarantee recoverability of all data of seqno up to the max seqno of the flush. It achieves this by ensuring all such data are flushed by the time this atomic flush finishes through `SelectColumnFamiliesForAtomicFlush()`. However, our crash test exposed the following case where an excluded CF from an atomic flush contains unflushed data of seqno less than the max seqno of that atomic flush and loses its data with `WriteOptions::DisableWAL=true` in face of a crash right after the atomic flush finishes .
```
./db_stress --preserve_unverified_changes=1 --reopen=0 --acquire_snapshot_one_in=0 --adaptive_readahead=1 --allow_data_in_errors=True --async_io=1 --atomic_flush=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=15 --bottommost_compression_type=none --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --checkpoint_one_in=0 --checksum_type=kXXH3 --clear_column_family_one_in=0 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=1 --compaction_ttl=100 --compression_max_dict_buffer_bytes=134217727 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4hc --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --expected_values_dir=$exp --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=none --flush_one_in=0 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=100 --get_property_one_in=0 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=2 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --long_running_snapshots=1 --manual_wal_flush_one_in=100 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=3 --pause_background_one_in=0 --periodic_compaction_seconds=100 --prefix_size=8 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=3600 --progress_reports=0 --read_fault_one_in=32 --readahead_size=16384 --readpercent=50 --recycle_log_file_num=0 --ribbon_starting_level=6 --secondary_cache_fault_one_in=0 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --stats_dump_period_sec=10 --subcompactions=1 --sync=0 --sync_fault_injection=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=0 --unpartitioned_pinning=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 --use_multiget=1 --use_put_entity_one_in=0 --user_timestamp_size=0 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=0 --verify_db_one_in=1000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --wal_compression=none --write_buffer_size=524288 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=30 &
pid=$!
sleep 0.2
sleep 10
kill $pid
sleep 0.2
./db_stress --ops_per_thread=1 --preserve_unverified_changes=1 --reopen=0 --acquire_snapshot_one_in=0 --adaptive_readahead=1 --allow_data_in_errors=True --async_io=1 --atomic_flush=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=15 --bottommost_compression_type=none --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --checkpoint_one_in=0 --checksum_type=kXXH3 --clear_column_family_one_in=0 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=1 --compaction_ttl=100 --compression_max_dict_buffer_bytes=134217727 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4hc --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --expected_values_dir=$exp --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=none --flush_one_in=0 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=100 --get_property_one_in=0 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=2 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --long_running_snapshots=1 --manual_wal_flush_one_in=100 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=3 --pause_background_one_in=0 --periodic_compaction_seconds=100 --prefix_size=8 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=3600 --progress_reports=0 --read_fault_one_in=32 --readahead_size=16384 --readpercent=50 --recycle_log_file_num=0 --ribbon_starting_level=6 --secondary_cache_fault_one_in=0 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --stats_dump_period_sec=10 --subcompactions=1 --sync=0 --sync_fault_injection=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=0 --unpartitioned_pinning=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 --use_multiget=1 --use_put_entity_one_in=0 --user_timestamp_size=0 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=0 --verify_db_one_in=1000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --wal_compression=none --write_buffer_size=524288 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=30 &
pid=$!
sleep 0.2
sleep 40
kill $pid
sleep 0.2
Verification failed for column family 6 key 0000000000000239000000000000012B0000000000000138 (56622): value_from_db: , value_from_expected: 4A6331754E4F4C4D42434041464744455A5B58595E5F5C5D5253505156575455, msg: Value not found: NotFound:
Crash-recovery verification failed :(
No writes or ops?
Verification failed :(
```
The bug is due to the following:
- When atomic flush is used, an empty CF is legally [excluded](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_filesnapshot.cc#L39) in `SelectColumnFamiliesForAtomicFlush` as the first step of `DBImpl::FlushForGetLiveFiles` before [passing](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_filesnapshot.cc#L42) the included CFDs to `AtomicFlushMemTables`.
- But [later](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_impl/db_impl_compaction_flush.cc#L2133) in `AtomicFlushMemTables`, `WaitUntilFlushWouldNotStallWrites` will [release the db mutex](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_impl/db_impl_compaction_flush.cc#L2403), during which data@seqno N can be inserted into the excluded CF and data@seqno M can be inserted into one of the included CFs, where M > N.
- However, data@seqno N in an already-excluded CF is thus excluded from this atomic flush while we seqno N is less than seqno M.
**Summary:**
- Replace `SelectColumnFamiliesForAtomicFlush()`-before-`AtomicFlushMemTables()` with `SelectColumnFamiliesForAtomicFlush()`-after-wait-within-`AtomicFlushMemTables()` so we ensure no write affecting the recoverability of this atomic job (i.e, change to max seqno of this atomic flush or insertion of data with less seqno than the max seqno of the atomic flush to excluded CF) can happen after calling `SelectColumnFamiliesForAtomicFlush()`.
- For above, refactored and clarified comments on `SelectColumnFamiliesForAtomicFlush()` and `AtomicFlushMemTables()` for clearer semantics of passed-in CFDs to atomic-flush
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11148
Test Plan:
- New unit test failed before the fix and passes after
- Make check
- Rehearsal stress test
Reviewed By: ajkr
Differential Revision: D42799871
Pulled By: hx235
fbshipit-source-id: 13636b63e9c25c5895857afc36ea580d57f6d644
2023-03-14 23:53:20 +00:00
|
|
|
// Atomic-flush memtables from quanlified CFs among `provided_candidate_cfds`
|
|
|
|
// (if non-empty) or amomg all column families and atomically record the
|
|
|
|
// result to the MANIFEST.
|
2018-10-26 22:06:44 +00:00
|
|
|
Status AtomicFlushMemTables(
|
|
|
|
const FlushOptions& options, FlushReason flush_reason,
|
Fix bug of prematurely excluded CF in atomic flush contains unflushed data that should've been included in the atomic flush (#11148)
Summary:
**Context:**
Atomic flush should guarantee recoverability of all data of seqno up to the max seqno of the flush. It achieves this by ensuring all such data are flushed by the time this atomic flush finishes through `SelectColumnFamiliesForAtomicFlush()`. However, our crash test exposed the following case where an excluded CF from an atomic flush contains unflushed data of seqno less than the max seqno of that atomic flush and loses its data with `WriteOptions::DisableWAL=true` in face of a crash right after the atomic flush finishes .
```
./db_stress --preserve_unverified_changes=1 --reopen=0 --acquire_snapshot_one_in=0 --adaptive_readahead=1 --allow_data_in_errors=True --async_io=1 --atomic_flush=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=15 --bottommost_compression_type=none --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --checkpoint_one_in=0 --checksum_type=kXXH3 --clear_column_family_one_in=0 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=1 --compaction_ttl=100 --compression_max_dict_buffer_bytes=134217727 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4hc --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --expected_values_dir=$exp --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=none --flush_one_in=0 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=100 --get_property_one_in=0 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=2 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --long_running_snapshots=1 --manual_wal_flush_one_in=100 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=3 --pause_background_one_in=0 --periodic_compaction_seconds=100 --prefix_size=8 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=3600 --progress_reports=0 --read_fault_one_in=32 --readahead_size=16384 --readpercent=50 --recycle_log_file_num=0 --ribbon_starting_level=6 --secondary_cache_fault_one_in=0 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --stats_dump_period_sec=10 --subcompactions=1 --sync=0 --sync_fault_injection=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=0 --unpartitioned_pinning=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 --use_multiget=1 --use_put_entity_one_in=0 --user_timestamp_size=0 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=0 --verify_db_one_in=1000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --wal_compression=none --write_buffer_size=524288 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=30 &
pid=$!
sleep 0.2
sleep 10
kill $pid
sleep 0.2
./db_stress --ops_per_thread=1 --preserve_unverified_changes=1 --reopen=0 --acquire_snapshot_one_in=0 --adaptive_readahead=1 --allow_data_in_errors=True --async_io=1 --atomic_flush=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 --batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=15 --bottommost_compression_type=none --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --checkpoint_one_in=0 --checksum_type=kXXH3 --clear_column_family_one_in=0 --compact_files_one_in=0 --compact_range_one_in=0 --compaction_pri=1 --compaction_ttl=100 --compression_max_dict_buffer_bytes=134217727 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4hc --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=$db --db_write_buffer_size=1048576 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --expected_values_dir=$exp --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=none --flush_one_in=0 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=100 --get_property_one_in=0 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=2 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True --long_running_snapshots=1 --manual_wal_flush_one_in=100 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=10000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=3 --pause_background_one_in=0 --periodic_compaction_seconds=100 --prefix_size=8 --prefixpercent=5 --prepopulate_block_cache=0 --preserve_internal_time_seconds=3600 --progress_reports=0 --read_fault_one_in=32 --readahead_size=16384 --readpercent=50 --recycle_log_file_num=0 --ribbon_starting_level=6 --secondary_cache_fault_one_in=0 --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --stats_dump_period_sec=10 --subcompactions=1 --sync=0 --sync_fault_injection=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --top_level_index_pinning=0 --unpartitioned_pinning=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 --use_multiget=1 --use_put_entity_one_in=0 --user_timestamp_size=0 --value_size_mult=32 --verify_checksum=1 --verify_checksum_one_in=0 --verify_db_one_in=1000 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --wal_compression=none --write_buffer_size=524288 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=30 &
pid=$!
sleep 0.2
sleep 40
kill $pid
sleep 0.2
Verification failed for column family 6 key 0000000000000239000000000000012B0000000000000138 (56622): value_from_db: , value_from_expected: 4A6331754E4F4C4D42434041464744455A5B58595E5F5C5D5253505156575455, msg: Value not found: NotFound:
Crash-recovery verification failed :(
No writes or ops?
Verification failed :(
```
The bug is due to the following:
- When atomic flush is used, an empty CF is legally [excluded](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_filesnapshot.cc#L39) in `SelectColumnFamiliesForAtomicFlush` as the first step of `DBImpl::FlushForGetLiveFiles` before [passing](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_filesnapshot.cc#L42) the included CFDs to `AtomicFlushMemTables`.
- But [later](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_impl/db_impl_compaction_flush.cc#L2133) in `AtomicFlushMemTables`, `WaitUntilFlushWouldNotStallWrites` will [release the db mutex](https://github.com/facebook/rocksdb/blob/7.10.fb/db/db_impl/db_impl_compaction_flush.cc#L2403), during which data@seqno N can be inserted into the excluded CF and data@seqno M can be inserted into one of the included CFs, where M > N.
- However, data@seqno N in an already-excluded CF is thus excluded from this atomic flush while we seqno N is less than seqno M.
**Summary:**
- Replace `SelectColumnFamiliesForAtomicFlush()`-before-`AtomicFlushMemTables()` with `SelectColumnFamiliesForAtomicFlush()`-after-wait-within-`AtomicFlushMemTables()` so we ensure no write affecting the recoverability of this atomic job (i.e, change to max seqno of this atomic flush or insertion of data with less seqno than the max seqno of the atomic flush to excluded CF) can happen after calling `SelectColumnFamiliesForAtomicFlush()`.
- For above, refactored and clarified comments on `SelectColumnFamiliesForAtomicFlush()` and `AtomicFlushMemTables()` for clearer semantics of passed-in CFDs to atomic-flush
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11148
Test Plan:
- New unit test failed before the fix and passes after
- Make check
- Rehearsal stress test
Reviewed By: ajkr
Differential Revision: D42799871
Pulled By: hx235
fbshipit-source-id: 13636b63e9c25c5895857afc36ea580d57f6d644
2023-03-14 23:53:20 +00:00
|
|
|
const autovector<ColumnFamilyData*>& provided_candidate_cfds = {},
|
2022-10-04 23:43:01 +00:00
|
|
|
bool entered_write_thread = false);
|
2018-10-26 22:06:44 +00:00
|
|
|
|
2023-10-02 23:26:24 +00:00
|
|
|
Status RetryFlushesForErrorRecovery(FlushReason flush_reason, bool wait);
|
|
|
|
|
2018-08-29 18:58:13 +00:00
|
|
|
// Wait until flushing this column family won't stall writes
|
|
|
|
Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
|
|
|
|
bool* flush_needed);
|
|
|
|
|
2018-01-19 01:32:50 +00:00
|
|
|
// Wait for memtable flushed.
|
|
|
|
// If flush_memtable_id is non-null, wait until the memtable with the ID
|
|
|
|
// gets flush. Otherwise, wait until the column family don't have any
|
|
|
|
// memtable pending flush.
|
2018-10-26 22:06:44 +00:00
|
|
|
// resuming_from_bg_err indicates whether the caller is attempting to resume
|
|
|
|
// from background error.
|
2018-01-19 01:32:50 +00:00
|
|
|
Status WaitForFlushMemTable(ColumnFamilyData* cfd,
|
2018-10-26 22:06:44 +00:00
|
|
|
const uint64_t* flush_memtable_id = nullptr,
|
|
|
|
bool resuming_from_bg_err = false) {
|
|
|
|
return WaitForFlushMemTables({cfd}, {flush_memtable_id},
|
|
|
|
resuming_from_bg_err);
|
2018-08-24 20:17:29 +00:00
|
|
|
}
|
|
|
|
// Wait for memtables to be flushed for multiple column families.
|
|
|
|
Status WaitForFlushMemTables(
|
|
|
|
const autovector<ColumnFamilyData*>& cfds,
|
2018-10-26 22:06:44 +00:00
|
|
|
const autovector<const uint64_t*>& flush_memtable_ids,
|
|
|
|
bool resuming_from_bg_err);
|
|
|
|
|
2019-05-14 00:43:47 +00:00
|
|
|
inline void WaitForPendingWrites() {
|
2019-10-30 01:15:08 +00:00
|
|
|
mutex_.AssertHeld();
|
2019-12-12 22:05:48 +00:00
|
|
|
TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
|
2019-10-30 01:15:08 +00:00
|
|
|
// In case of pipelined write is enabled, wait for all pending memtable
|
|
|
|
// writers.
|
|
|
|
if (immutable_db_options_.enable_pipelined_write) {
|
|
|
|
// Memtable writers may call DB::Get in case max_successive_merges > 0,
|
|
|
|
// which may lock mutex. Unlocking mutex here to avoid deadlock.
|
|
|
|
mutex_.Unlock();
|
|
|
|
write_thread_.WaitForMemTableWriters();
|
|
|
|
mutex_.Lock();
|
|
|
|
}
|
|
|
|
|
2019-05-14 00:43:47 +00:00
|
|
|
if (!immutable_db_options_.unordered_write) {
|
|
|
|
// Then the writes are finished before the next write group starts
|
|
|
|
return;
|
|
|
|
}
|
2019-10-30 01:15:08 +00:00
|
|
|
|
2019-05-14 00:43:47 +00:00
|
|
|
// Wait for the ones who already wrote to the WAL to finish their
|
|
|
|
// memtable write.
|
|
|
|
if (pending_memtable_writes_.load() != 0) {
|
|
|
|
std::unique_lock<std::mutex> guard(switch_mutex_);
|
|
|
|
switch_cv_.wait(guard,
|
|
|
|
[&] { return pending_memtable_writes_.load() == 0; });
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-03-02 21:43:00 +00:00
|
|
|
// TaskType is used to identify tasks in thread-pool, currently only
|
|
|
|
// differentiate manual compaction, which could be unscheduled from the
|
|
|
|
// thread-pool.
|
|
|
|
enum class TaskType : uint8_t {
|
|
|
|
kDefault = 0,
|
|
|
|
kManualCompaction = 1,
|
|
|
|
kCount = 2,
|
|
|
|
};
|
|
|
|
|
|
|
|
// Task tag is used to identity tasks in thread-pool, which is
|
|
|
|
// dbImpl obj address + type
|
|
|
|
inline void* GetTaskTag(TaskType type) {
|
|
|
|
return GetTaskTag(static_cast<uint8_t>(type));
|
|
|
|
}
|
|
|
|
|
|
|
|
inline void* GetTaskTag(uint8_t type) {
|
|
|
|
return static_cast<uint8_t*>(static_cast<void*>(this)) + type;
|
|
|
|
}
|
|
|
|
|
2018-10-26 22:06:44 +00:00
|
|
|
// REQUIRES: mutex locked and in write thread.
|
|
|
|
void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
|
2012-07-06 18:42:09 +00:00
|
|
|
|
2020-02-18 21:52:09 +00:00
|
|
|
// REQUIRES: mutex locked and in write thread.
|
2017-09-28 00:37:08 +00:00
|
|
|
Status SwitchWAL(WriteContext* write_context);
|
2017-04-04 17:19:33 +00:00
|
|
|
|
2020-02-18 21:52:09 +00:00
|
|
|
// REQUIRES: mutex locked and in write thread.
|
2021-04-08 06:17:41 +00:00
|
|
|
Status HandleWriteBufferManagerFlush(WriteContext* write_context);
|
2017-04-04 17:19:33 +00:00
|
|
|
|
|
|
|
// REQUIRES: mutex locked
|
2022-07-21 20:35:36 +00:00
|
|
|
Status PreprocessWrite(const WriteOptions& write_options,
|
|
|
|
LogContext* log_context, WriteContext* write_context);
|
2017-04-04 17:19:33 +00:00
|
|
|
|
2022-06-15 20:43:58 +00:00
|
|
|
// Merge write batches in the write group into merged_batch.
|
|
|
|
// Returns OK if merge is successful.
|
|
|
|
// Returns Corruption if corruption in write batch is detected.
|
|
|
|
Status MergeBatch(const WriteThread::WriteGroup& write_group,
|
|
|
|
WriteBatch* tmp_batch, WriteBatch** merged_batch,
|
|
|
|
size_t* write_with_wal, WriteBatch** to_be_cached_state);
|
2017-06-24 21:06:43 +00:00
|
|
|
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
IOStatus WriteToWAL(const WriteBatch& merged_batch,
|
|
|
|
const WriteOptions& write_options,
|
|
|
|
log::Writer* log_writer, uint64_t* log_used,
|
|
|
|
uint64_t* log_size,
|
2022-06-01 22:33:22 +00:00
|
|
|
LogFileNumberSize& log_file_number_size);
|
2017-06-24 21:06:43 +00:00
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
|
|
|
|
log::Writer* log_writer, uint64_t* log_used,
|
|
|
|
bool need_log_sync, bool need_log_dir_sync,
|
2022-06-01 22:33:22 +00:00
|
|
|
SequenceNumber sequence,
|
|
|
|
LogFileNumberSize& log_file_number_size);
|
2017-06-24 21:06:43 +00:00
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
|
|
|
|
uint64_t* log_used,
|
|
|
|
SequenceNumber* last_sequence, size_t seq_inc);
|
2017-04-04 17:19:33 +00:00
|
|
|
|
2020-08-07 18:27:14 +00:00
|
|
|
// Used by WriteImpl to update bg_error_ if paranoid check is enabled.
|
|
|
|
// Caller must hold mutex_.
|
|
|
|
void WriteStatusCheckOnLocked(const Status& status);
|
|
|
|
|
2017-05-19 21:24:23 +00:00
|
|
|
// Used by WriteImpl to update bg_error_ if paranoid check is enabled.
|
2018-03-22 22:56:52 +00:00
|
|
|
void WriteStatusCheck(const Status& status);
|
2017-05-19 21:24:23 +00:00
|
|
|
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
// Used by WriteImpl to update bg_error_ when IO error happens, e.g., write
|
|
|
|
// WAL, sync WAL fails, if paranoid check is enabled.
|
|
|
|
void IOStatusCheck(const IOStatus& status);
|
|
|
|
|
2017-05-19 21:24:23 +00:00
|
|
|
// Used by WriteImpl to update bg_error_ in case of memtable insert error.
|
|
|
|
void MemTableInsertStatusCheck(const Status& memtable_insert_status);
|
2017-04-13 21:46:25 +00:00
|
|
|
|
2016-09-28 22:42:06 +00:00
|
|
|
Status CompactFilesImpl(const CompactionOptions& compact_options,
|
|
|
|
ColumnFamilyData* cfd, Version* version,
|
|
|
|
const std::vector<std::string>& input_file_names,
|
2018-03-15 18:46:16 +00:00
|
|
|
std::vector<std::string>* const output_file_names,
|
2016-09-28 22:42:06 +00:00
|
|
|
const int output_level, int output_path_id,
|
2018-12-13 22:12:02 +00:00
|
|
|
JobContext* job_context, LogBuffer* log_buffer,
|
|
|
|
CompactionJobInfo* compaction_job_info);
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 22:45:18 +00:00
|
|
|
|
|
|
|
ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
|
|
|
|
|
2013-10-14 22:12:15 +00:00
|
|
|
void MaybeScheduleFlushOrCompaction();
|
2018-08-24 20:17:29 +00:00
|
|
|
|
2023-01-24 17:54:04 +00:00
|
|
|
struct FlushRequest {
|
|
|
|
FlushReason flush_reason;
|
|
|
|
// A map from column family to flush to largest memtable id to persist for
|
|
|
|
// each column family. Once all the memtables whose IDs are smaller than or
|
|
|
|
// equal to this per-column-family specified value, this flush request is
|
|
|
|
// considered to have completed its work of flushing this column family.
|
|
|
|
// After completing the work for all column families in this request, this
|
|
|
|
// flush is considered complete.
|
|
|
|
std::unordered_map<ColumnFamilyData*, uint64_t>
|
|
|
|
cfd_to_max_mem_id_to_persist;
|
2023-07-26 23:25:06 +00:00
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
int reschedule_count = 1;
|
|
|
|
#endif /* !NDEBUG */
|
2023-01-24 17:54:04 +00:00
|
|
|
};
|
2018-08-24 20:17:29 +00:00
|
|
|
|
2023-10-02 23:26:24 +00:00
|
|
|
// In case of atomic flush, generates a `FlushRequest` for the latest atomic
|
|
|
|
// cuts for these `cfds`. Atomic cuts are recorded in
|
|
|
|
// `AssignAtomicFlushSeq()`. For each entry in `cfds`, all CFDs sharing the
|
|
|
|
// same latest atomic cut must also be present.
|
|
|
|
//
|
|
|
|
// REQUIRES: mutex held
|
2018-10-26 22:06:44 +00:00
|
|
|
void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
|
2023-01-24 17:54:04 +00:00
|
|
|
FlushReason flush_reason, FlushRequest* req);
|
2018-10-26 22:06:44 +00:00
|
|
|
|
2023-01-24 17:54:04 +00:00
|
|
|
void SchedulePendingFlush(const FlushRequest& req);
|
2018-08-24 20:17:29 +00:00
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 19:38:12 +00:00
|
|
|
void SchedulePendingCompaction(ColumnFamilyData* cfd);
|
2018-04-26 20:51:39 +00:00
|
|
|
void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
|
|
|
|
FileType type, uint64_t number, int job_id);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 19:20:34 +00:00
|
|
|
static void BGWorkCompaction(void* arg);
|
2017-08-03 22:36:28 +00:00
|
|
|
// Runs a pre-chosen universal compaction involving bottom level in a
|
|
|
|
// separate, bottom-pri thread pool.
|
|
|
|
static void BGWorkBottomCompaction(void* arg);
|
2019-03-20 00:24:09 +00:00
|
|
|
static void BGWorkFlush(void* arg);
|
2016-06-22 01:41:23 +00:00
|
|
|
static void BGWorkPurge(void* arg);
|
2019-03-20 00:24:09 +00:00
|
|
|
static void UnscheduleCompactionCallback(void* arg);
|
|
|
|
static void UnscheduleFlushCallback(void* arg);
|
2017-08-03 22:36:28 +00:00
|
|
|
void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
|
2019-03-20 00:24:09 +00:00
|
|
|
Env::Priority thread_pri);
|
|
|
|
void BackgroundCallFlush(Env::Priority thread_pri);
|
2016-06-22 01:41:23 +00:00
|
|
|
void BackgroundCallPurge();
|
2014-10-28 18:54:33 +00:00
|
|
|
Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
|
2017-08-03 22:36:28 +00:00
|
|
|
LogBuffer* log_buffer,
|
2019-03-20 00:24:09 +00:00
|
|
|
PrepickedCompaction* prepicked_compaction,
|
|
|
|
Env::Priority thread_pri);
|
2014-10-28 18:54:33 +00:00
|
|
|
Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
|
2019-03-20 00:24:09 +00:00
|
|
|
LogBuffer* log_buffer, FlushReason* reason,
|
2023-07-26 23:25:06 +00:00
|
|
|
bool* flush_rescheduled_to_retain_udt,
|
2019-03-20 00:24:09 +00:00
|
|
|
Env::Priority thread_pri);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
|
|
|
|
const std::vector<CompactionInputFiles>& inputs,
|
2018-04-03 02:53:19 +00:00
|
|
|
bool* sfm_bookkeeping, LogBuffer* log_buffer);
|
|
|
|
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
2018-12-13 21:16:04 +00:00
|
|
|
// Request compaction tasks token from compaction thread limiter.
|
|
|
|
// It always succeeds if force = true or limiter is disable.
|
|
|
|
bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
|
|
|
|
std::unique_ptr<TaskLimiterToken>* token,
|
|
|
|
LogBuffer* log_buffer);
|
|
|
|
|
2023-07-26 23:25:06 +00:00
|
|
|
// Return true if the `FlushRequest` can be rescheduled to retain the UDT.
|
|
|
|
// Only true if there are user-defined timestamps in the involved MemTables
|
|
|
|
// with newer than cutoff timestamp `full_history_ts_low` and not flushing
|
|
|
|
// immediately will not cause entering write stall mode.
|
|
|
|
bool ShouldRescheduleFlushRequestToRetainUDT(const FlushRequest& flush_req);
|
|
|
|
|
move dump stats to a separate thread (#4382)
Summary:
Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all.
We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior.
Tested with db_bench using `--stats_dump_period_sec=20` in command line:
> LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG content:
> 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606]
** DB Stats **
Uptime(secs): 20.0 total, 20.0 interval
Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s
Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s
Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent
Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s
Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s
Interval stall: 00:00:0.012 H:M:S, 0.1 percent
** Compaction Stats [default] **
Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382
Differential Revision: D9933051
Pulled By: miasantreble
fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 05:52:58 +00:00
|
|
|
// Schedule background tasks
|
2022-08-26 01:52:37 +00:00
|
|
|
Status StartPeriodicTaskScheduler();
|
move dump stats to a separate thread (#4382)
Summary:
Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all.
We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior.
Tested with db_bench using `--stats_dump_period_sec=20` in command line:
> LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG content:
> 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606]
** DB Stats **
Uptime(secs): 20.0 total, 20.0 interval
Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s
Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s
Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent
Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s
Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s
Interval stall: 00:00:0.012 H:M:S, 0.1 percent
** Compaction Stats [default] **
Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382
Differential Revision: D9933051
Pulled By: miasantreble
fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 05:52:58 +00:00
|
|
|
|
2023-08-11 19:30:48 +00:00
|
|
|
// Cancel scheduled periodic tasks
|
|
|
|
Status CancelPeriodicTaskScheduler();
|
|
|
|
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
Status RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
|
|
|
|
const WriteOptions& write_options,
|
|
|
|
bool is_new_db);
|
2022-07-15 04:49:34 +00:00
|
|
|
|
2013-05-28 19:35:43 +00:00
|
|
|
void PrintStatistics();
|
|
|
|
|
2019-06-17 22:17:43 +00:00
|
|
|
size_t EstimateInMemoryStatsHistorySize() const;
|
2019-02-20 23:46:59 +00:00
|
|
|
|
2013-06-30 06:21:36 +00:00
|
|
|
// Return the minimum empty level that could hold the total data in the
|
|
|
|
// input level. Return the input level, if such level could not be found.
|
2014-10-01 23:19:16 +00:00
|
|
|
int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
|
2019-03-27 23:13:08 +00:00
|
|
|
const MutableCFOptions& mutable_cf_options,
|
|
|
|
int level);
|
2013-06-30 06:21:36 +00:00
|
|
|
|
2013-09-04 20:13:08 +00:00
|
|
|
// Move the files in the input level to the target level.
|
|
|
|
// If target_level < 0, automatically calculate the minimum level that could
|
|
|
|
// hold the data set.
|
2014-02-01 00:45:20 +00:00
|
|
|
Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
|
2013-06-30 06:21:36 +00:00
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 19:38:12 +00:00
|
|
|
// helper functions for adding and removing from flush & compaction queues
|
|
|
|
void AddToCompactionQueue(ColumnFamilyData* cfd);
|
|
|
|
ColumnFamilyData* PopFirstFromCompactionQueue();
|
2018-08-24 20:17:29 +00:00
|
|
|
FlushRequest PopFirstFromFlushQueue();
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 19:38:12 +00:00
|
|
|
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
The PR is targeting to resolve the issue of:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a rocksdb created with leveled-compaction with multiple column families (CFs), some of CFs are using HDD to store big and less frequently accessed data and others are using SSD.
When there are continuously write traffics going on to all CFs, the compaction thread pool is mostly occupied by those slow HDD compactions, which blocks fully utilize SSD bandwidth.
Since atomic write and transaction is needed across CFs, so splitting it to multiple rocksdb instance is not an option for us.
With the compaction thread control, we got 30%+ HDD write throughput gain, and also a lot smooth SSD write since less write stall happening.
ConcurrentTaskLimiter can be shared with multi-CFs across rocksdb instances, so the feature does not only work for multi-CFs scenarios, but also for multi-rocksdbs scenarios, who need disk IO resource control per tenant.
The usage is straight forward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // full throttle (0 task)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
2018-12-13 21:16:04 +00:00
|
|
|
// Pick the first unthrottled compaction with task token from queue.
|
|
|
|
ColumnFamilyData* PickCompactionFromQueue(
|
|
|
|
std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
|
|
|
|
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 13:06:39 +00:00
|
|
|
// helper function to call after some of the logs_ were synced
|
2022-07-21 20:35:36 +00:00
|
|
|
void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit);
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
Status ApplyWALToManifest(const ReadOptions& read_options,
|
|
|
|
const WriteOptions& write_options,
|
|
|
|
VersionEdit* edit);
|
2020-11-07 00:30:44 +00:00
|
|
|
// WALs with log number up to up_to are not synced successfully.
|
|
|
|
void MarkLogsNotSynced(uint64_t up_to);
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 13:06:39 +00:00
|
|
|
|
2019-01-16 05:32:15 +00:00
|
|
|
SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
|
|
|
|
bool lock = true);
|
2015-12-08 20:25:48 +00:00
|
|
|
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
// If snapshot_seq != kMaxSequenceNumber, then this function can only be
|
|
|
|
// called from the write thread that publishes sequence numbers to readers.
|
|
|
|
// For 1) write-committed, or 2) write-prepared + one-write-queue, this will
|
|
|
|
// be the write thread performing memtable writes. For write-prepared with
|
|
|
|
// two write queues, this will be the write thread writing commit marker to
|
|
|
|
// the WAL.
|
|
|
|
// If snapshot_seq == kMaxSequenceNumber, this function is called by a caller
|
|
|
|
// ensuring no writes to the database.
|
|
|
|
std::pair<Status, std::shared_ptr<const SnapshotImpl>>
|
|
|
|
CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
|
|
|
|
bool lock = true);
|
|
|
|
|
2016-11-15 06:45:16 +00:00
|
|
|
uint64_t GetMaxTotalWalSize() const;
|
|
|
|
|
2020-03-03 00:14:00 +00:00
|
|
|
FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
|
2018-04-06 02:49:06 +00:00
|
|
|
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
Status MaybeReleaseTimestampedSnapshotsAndCheck();
|
|
|
|
|
2018-02-23 21:50:02 +00:00
|
|
|
Status CloseHelper();
|
2018-01-16 18:57:56 +00:00
|
|
|
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
void WaitForBackgroundWork();
|
|
|
|
|
2019-05-31 04:29:44 +00:00
|
|
|
// Background threads call this function, which is just a wrapper around
|
|
|
|
// the InstallSuperVersion() function. Background threads carry
|
|
|
|
// sv_context which can have new_superversion already
|
|
|
|
// allocated.
|
|
|
|
// All ColumnFamily state changes go through this function. Here we analyze
|
|
|
|
// the new state and we schedule background work if we detect that the new
|
|
|
|
// state needs flush or compaction.
|
|
|
|
void InstallSuperVersionAndScheduleWork(
|
|
|
|
ColumnFamilyData* cfd, SuperVersionContext* sv_context,
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling on the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failling, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process ressembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
2021-07-10 00:16:00 +00:00
|
|
|
const MutableCFOptions& mutable_cf_options);
|
2019-05-31 04:29:44 +00:00
|
|
|
|
|
|
|
bool GetIntPropertyInternal(ColumnFamilyData* cfd,
|
|
|
|
const DBPropertyInfo& property_info,
|
|
|
|
bool is_locked, uint64_t* value);
|
|
|
|
bool GetPropertyHandleOptionsStatistics(std::string* value);
|
|
|
|
|
|
|
|
bool HasPendingManualCompaction();
|
|
|
|
bool HasExclusiveManualCompaction();
|
|
|
|
void AddManualCompaction(ManualCompactionState* m);
|
|
|
|
void RemoveManualCompaction(ManualCompactionState* m);
|
|
|
|
bool ShouldntRunManualCompaction(ManualCompactionState* m);
|
|
|
|
bool HaveManualCompaction(ColumnFamilyData* cfd);
|
|
|
|
bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
|
2023-10-19 01:00:07 +00:00
|
|
|
void UpdateDeletionCompactionStats(const std::unique_ptr<Compaction>& c);
|
2019-05-31 04:29:44 +00:00
|
|
|
void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
|
|
|
|
const Status& st,
|
|
|
|
const CompactionJobStats& compaction_job_stats,
|
2023-07-28 16:47:31 +00:00
|
|
|
const int job_id,
|
2019-05-31 04:29:44 +00:00
|
|
|
CompactionJobInfo* compaction_job_info) const;
|
|
|
|
// Reserve the next 'num' file numbers for to-be-ingested external SST files,
|
|
|
|
// and return the current file_number in 'next_file_number'.
|
|
|
|
// Write a version edit to the MANIFEST.
|
|
|
|
Status ReserveFileNumbersBeforeIngestion(
|
|
|
|
ColumnFamilyData* cfd, uint64_t num,
|
2019-10-08 21:18:48 +00:00
|
|
|
std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
|
2019-05-31 04:29:44 +00:00
|
|
|
uint64_t* next_file_number);
|
|
|
|
|
|
|
|
bool ShouldPurge(uint64_t file_number) const;
|
|
|
|
void MarkAsGrabbedForPurge(uint64_t file_number);
|
|
|
|
|
|
|
|
size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
|
|
|
|
Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
|
|
|
|
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
IOStatus CreateWAL(const WriteOptions& write_options, uint64_t log_file_num,
|
|
|
|
uint64_t recycle_log_number, size_t preallocate_block_size,
|
|
|
|
log::Writer** new_log);
|
2019-05-31 04:29:44 +00:00
|
|
|
|
2019-06-04 02:47:02 +00:00
|
|
|
// Validate self-consistency of DB options
|
|
|
|
static Status ValidateOptions(const DBOptions& db_options);
|
|
|
|
// Validate self-consistency of DB options and its consistency with cf options
|
|
|
|
static Status ValidateOptions(
|
|
|
|
const DBOptions& db_options,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families);
|
|
|
|
|
2019-11-12 21:51:18 +00:00
|
|
|
// Utility function to do some debug validation and sort the given vector
|
|
|
|
// of MultiGet keys
|
|
|
|
void PrepareMultiGetKeys(
|
|
|
|
const size_t num_keys, bool sorted,
|
|
|
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
|
|
|
|
|
2023-02-15 17:34:17 +00:00
|
|
|
void MultiGetCommon(const ReadOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const size_t num_keys,
|
|
|
|
const Slice* keys, PinnableSlice* values,
|
|
|
|
PinnableWideColumns* columns, std::string* timestamps,
|
|
|
|
Status* statuses, bool sorted_input);
|
|
|
|
|
|
|
|
void MultiGetCommon(const ReadOptions& options, const size_t num_keys,
|
|
|
|
ColumnFamilyHandle** column_families, const Slice* keys,
|
|
|
|
PinnableSlice* values, PinnableWideColumns* columns,
|
|
|
|
std::string* timestamps, Status* statuses,
|
|
|
|
bool sorted_input);
|
|
|
|
|
2019-11-12 21:51:18 +00:00
|
|
|
// A structure to hold the information required to process MultiGet of keys
|
|
|
|
// belonging to one column family. For a multi column family MultiGet, there
|
|
|
|
// will be a container of these objects.
|
|
|
|
struct MultiGetColumnFamilyData {
|
|
|
|
ColumnFamilyHandle* cf;
|
|
|
|
ColumnFamilyData* cfd;
|
|
|
|
|
|
|
|
// For the batched MultiGet which relies on sorted keys, start specifies
|
|
|
|
// the index of first key belonging to this column family in the sorted
|
|
|
|
// list.
|
|
|
|
size_t start;
|
|
|
|
|
|
|
|
// For the batched MultiGet case, num_keys specifies the number of keys
|
|
|
|
// belonging to this column family in the sorted list
|
|
|
|
size_t num_keys;
|
|
|
|
|
|
|
|
// SuperVersion for the column family obtained in a manner that ensures a
|
|
|
|
// consistent view across all column families in the DB
|
|
|
|
SuperVersion* super_version;
|
|
|
|
MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
|
|
|
|
SuperVersion* sv)
|
|
|
|
: cf(column_family),
|
|
|
|
cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
|
|
|
|
start(0),
|
|
|
|
num_keys(0),
|
|
|
|
super_version(sv) {}
|
|
|
|
|
|
|
|
MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
|
|
|
|
size_t count, SuperVersion* sv)
|
|
|
|
: cf(column_family),
|
|
|
|
cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
|
|
|
|
start(first),
|
|
|
|
num_keys(count),
|
|
|
|
super_version(sv) {}
|
|
|
|
|
|
|
|
MultiGetColumnFamilyData() = default;
|
|
|
|
};
|
|
|
|
|
|
|
|
// A common function to obtain a consistent snapshot, which can be implicit
|
|
|
|
// if the user doesn't specify a snapshot in read_options, across
|
|
|
|
// multiple column families for MultiGet. It will attempt to get an implicit
|
|
|
|
// snapshot without acquiring the db_mutes, but will give up after a few
|
|
|
|
// tries and acquire the mutex if a memtable flush happens. The template
|
|
|
|
// allows both the batched and non-batched MultiGet to call this with
|
|
|
|
// either an std::unordered_map or autovector of column families.
|
|
|
|
//
|
|
|
|
// If callback is non-null, the callback is refreshed with the snapshot
|
|
|
|
// sequence number
|
|
|
|
//
|
2023-09-15 16:50:39 +00:00
|
|
|
// `sv_from_thread_local` being set to false indicates that the SuperVersion
|
|
|
|
// obtained from the ColumnFamilyData, whereas true indicates they are thread
|
2023-09-13 23:34:18 +00:00
|
|
|
// local.
|
|
|
|
// A non-OK status will be returned if for a column family that enables
|
|
|
|
// user-defined timestamp feature, the specified `ReadOptions.timestamp`
|
|
|
|
// attemps to read collapsed history.
|
2019-11-12 21:51:18 +00:00
|
|
|
template <class T>
|
2023-09-13 23:34:18 +00:00
|
|
|
Status MultiCFSnapshot(
|
2019-11-12 21:51:18 +00:00
|
|
|
const ReadOptions& read_options, ReadCallback* callback,
|
|
|
|
std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
|
|
|
|
iter_deref_func,
|
2023-09-15 16:50:39 +00:00
|
|
|
T* cf_list, SequenceNumber* snapshot, bool* sv_from_thread_local);
|
2019-11-12 21:51:18 +00:00
|
|
|
|
|
|
|
// The actual implementation of the batching MultiGet. The caller is expected
|
|
|
|
// to have acquired the SuperVersion and pass in a snapshot sequence number
|
|
|
|
// in order to construct the LookupKeys. The start_key and num_keys specify
|
|
|
|
// the range of keys in the sorted_keys vector for a single column family.
|
2020-04-21 21:49:50 +00:00
|
|
|
Status MultiGetImpl(
|
2019-11-12 21:51:18 +00:00
|
|
|
const ReadOptions& read_options, size_t start_key, size_t num_keys,
|
|
|
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
|
2020-12-14 21:47:17 +00:00
|
|
|
SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback);
|
2019-11-12 21:51:18 +00:00
|
|
|
|
2023-09-15 22:34:04 +00:00
|
|
|
void MultiGetWithCallbackImpl(
|
|
|
|
const ReadOptions& read_options, ColumnFamilyHandle* column_family,
|
|
|
|
ReadCallback* callback,
|
|
|
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
|
|
|
|
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
Status DisableFileDeletionsWithLock();
|
|
|
|
|
2021-12-23 19:02:43 +00:00
|
|
|
Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
|
|
|
|
std::string ts_low);
|
2021-02-08 21:43:23 +00:00
|
|
|
|
Avoid allocations/copies for large `GetMergeOperands()` results (#10458)
Summary:
This PR avoids allocations and copies for the result of `GetMergeOperands()` when the average operand size is at least 256 bytes and the total operands size is at least 32KB. The `GetMergeOperands()` already included `PinnableSlice` but was calling `PinSelf()` (i.e., allocating and copying) for each operand. When this optimization takes effect, we instead call `PinSlice()` to skip that allocation and copy. Resources are pinned in order for the `PinnableSlice` to point to valid memory even after `GetMergeOperands()` returns.
The pinned resources include a referenced `SuperVersion`, a `MergingContext`, and a `PinnedIteratorsManager`. They are bundled into a `GetMergeOperandsState`. We use `SharedCleanablePtr` to share that bundle among all `PinnableSlice`s populated by `GetMergeOperands()`. That way, the last `PinnableSlice` to be `Reset()` will cleanup the bundle, including unreferencing the `SuperVersion`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10458
Test Plan:
- new DB level test
- measured benefit/regression in a number of memtable scenarios
Setup command:
```
$ ./db_bench -benchmarks=mergerandom -merge_operator=StringAppendOperator -num=$num -writes=16384 -key_size=16 -value_size=$value_sz -compression_type=none -write_buffer_size=1048576000
```
Benchmark command:
```
./db_bench -threads=$threads -use_existing_db=true -avoid_flush_during_recovery=true -write_buffer_size=1048576000 -benchmarks=readrandomoperands -merge_operator=StringAppendOperator -num=$num -duration=10
```
Worst regression is when a key has many tiny operands:
- Parameters: num=1 (implying 16384 operands per key), value_sz=8, threads=1
- `GetMergeOperands()` latency increases 682 micros -> 800 micros (+17%)
The regression disappears into the noise (<1% difference) if we remove the `Reset()` loop and the size counting loop. The former is arguably needed regardless of this PR as the convention in `Get()` and `MultiGet()` is to `Reset()` the input `PinnableSlice`s at the start. The latter could be optimized to count the size as we accumulate operands rather than after the fact.
Best improvement is when a key has large operands and high concurrency:
- Parameters: num=4 (implying 4096 operands per key), value_sz=2KB, threads=32
- `GetMergeOperands()` latency decreases 11492 micros -> 437 micros (-96%).
Reviewed By: cbi42
Differential Revision: D38336578
Pulled By: ajkr
fbshipit-source-id: 48146d127e04cb7f2d4d2939a2b9dff3aba18258
2022-08-04 07:42:13 +00:00
|
|
|
bool ShouldReferenceSuperVersion(const MergeContext& merge_context);
|
|
|
|
|
2013-02-15 23:28:24 +00:00
|
|
|
// Lock over the persistent DB state. Non-nullptr iff successfully acquired.
|
2011-03-18 22:37:00 +00:00
|
|
|
FileLock* db_lock_;
|
|
|
|
|
2023-10-12 17:05:23 +00:00
|
|
|
// Guards changes to DB and CF options to ensure consistency between
|
|
|
|
// * In-memory options objects
|
|
|
|
// * Settings in effect
|
|
|
|
// * Options file contents
|
|
|
|
// while allowing the DB mutex to be released during slow operations like
|
|
|
|
// persisting options file or modifying global periodic task timer.
|
|
|
|
// Always acquired *before* DB mutex when this one is applicable.
|
|
|
|
InstrumentedMutex options_mutex_;
|
|
|
|
|
|
|
|
// Guards reads and writes to in-memory stats_history_.
|
2019-02-20 23:46:59 +00:00
|
|
|
InstrumentedMutex stats_history_mutex_;
|
2023-10-12 17:05:23 +00:00
|
|
|
|
|
|
|
// In addition to mutex_, log_write_mutex_ protects writes to logs_ and
|
2017-11-11 01:18:01 +00:00
|
|
|
// logfile_number_. With two_write_queues it also protects alive_log_files_,
|
2017-06-30 16:30:03 +00:00
|
|
|
// and log_empty_. Refer to the definition of each variable below for more
|
|
|
|
// details.
|
2023-05-03 18:12:20 +00:00
|
|
|
// Note: to avoid deadlock, if needed to acquire both log_write_mutex_ and
|
2019-06-11 00:02:23 +00:00
|
|
|
// mutex_, the order should be first mutex_ and then log_write_mutex_.
|
2017-06-24 21:06:43 +00:00
|
|
|
InstrumentedMutex log_write_mutex_;
|
2015-11-11 06:58:01 +00:00
|
|
|
|
2020-08-14 18:28:12 +00:00
|
|
|
// If zero, manual compactions are allowed to proceed. If non-zero, manual
|
|
|
|
// compactions may still be running, but will quickly fail with
|
|
|
|
// `Status::Incomplete`. The value indicates how many threads have paused
|
|
|
|
// manual compactions. It is accessed in read mode outside the DB mutex in
|
|
|
|
// compaction code paths.
|
|
|
|
std::atomic<int> manual_compaction_paused_;
|
|
|
|
|
2014-06-03 00:23:55 +00:00
|
|
|
// This condition variable is signaled on these conditions:
|
|
|
|
// * whenever bg_compaction_scheduled_ goes down to 0
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 19:20:34 +00:00
|
|
|
// * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
|
2014-06-03 00:23:55 +00:00
|
|
|
// made any progress
|
|
|
|
// * whenever a compaction made any progress
|
2016-06-22 01:41:23 +00:00
|
|
|
// * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
|
|
|
|
// (i.e. whenever a flush is done, even if it didn't make any progress)
|
|
|
|
// * whenever there is an error in background purge, flush or compaction
|
2016-10-21 00:05:32 +00:00
|
|
|
// * whenever num_running_ingest_file_ goes to 0.
|
2018-01-18 01:37:10 +00:00
|
|
|
// * whenever pending_purge_obsolete_files_ goes to 0.
|
|
|
|
// * whenever disable_delete_obsolete_files_ goes to 0.
|
2018-02-12 23:34:39 +00:00
|
|
|
// * whenever SetOptions successfully updates options.
|
|
|
|
// * whenever a column family is dropped.
|
2015-02-05 05:39:45 +00:00
|
|
|
InstrumentedCondVar bg_cv_;
|
2017-06-30 16:30:03 +00:00
|
|
|
// Writes are protected by locking both mutex_ and log_write_mutex_, and reads
|
|
|
|
// must be under either mutex_ or log_write_mutex_. Since after ::Open,
|
|
|
|
// logfile_number_ is currently updated only in write_thread_, it can be read
|
|
|
|
// from the same write_thread_ without any locks.
|
2011-06-22 02:36:45 +00:00
|
|
|
uint64_t logfile_number_;
|
2022-07-21 20:35:36 +00:00
|
|
|
// Log files that we can recycle. Must be protected by db mutex_.
|
|
|
|
std::deque<uint64_t> log_recycle_files_;
|
|
|
|
// Protected by log_write_mutex_.
|
2015-01-26 23:48:59 +00:00
|
|
|
bool log_dir_synced_;
|
2017-11-11 01:18:01 +00:00
|
|
|
// Without two_write_queues, read and writes to log_empty_ are protected by
|
2017-06-30 16:30:03 +00:00
|
|
|
// mutex_. Since it is currently updated/read only in write_thread_, it can be
|
|
|
|
// accessed from the same write_thread_ without any locks. With
|
2017-11-11 01:18:01 +00:00
|
|
|
// two_write_queues writes, where it can be updated in different threads,
|
2017-06-30 16:30:03 +00:00
|
|
|
// read and writes are protected by log_write_mutex_ instead. This is to avoid
|
2021-08-27 19:09:13 +00:00
|
|
|
// expensive mutex_ lock during WAL write, which update log_empty_.
|
2014-04-15 16:57:25 +00:00
|
|
|
bool log_empty_;
|
2019-03-26 23:41:31 +00:00
|
|
|
|
2019-06-17 22:17:43 +00:00
|
|
|
ColumnFamilyHandleImpl* persist_stats_cf_handle_;
|
|
|
|
|
|
|
|
bool persistent_stats_cfd_exists_ = true;
|
2015-08-06 03:51:27 +00:00
|
|
|
|
2022-07-21 20:35:36 +00:00
|
|
|
// alive_log_files_ is protected by mutex_ and log_write_mutex_ with details
|
|
|
|
// as follows:
|
|
|
|
// 1. read by FindObsoleteFiles() which can be called in either application
|
|
|
|
// thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
|
|
|
|
// held.
|
|
|
|
// 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
|
|
|
|
// are held.
|
|
|
|
// 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
|
|
|
|
// (actually called by Open()), only mutex_ is held because at this point,
|
|
|
|
// the DB::Open() call has not returned success to application, and the
|
|
|
|
// only other thread(s) that can conflict are bg threads calling
|
|
|
|
// FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
|
|
|
|
// are held when accessing alive_log_files_.
|
|
|
|
// 4. read by DBImpl::Open() is protected by mutex_.
|
|
|
|
// 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
|
|
|
|
// held. This is done by the write group leader. Note that in the case of
|
|
|
|
// two-write-queues, another WAL-only write thread can be writing to the
|
|
|
|
// WAL concurrently. See 9.
|
|
|
|
// 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
|
|
|
|
// done by write group leader.
|
|
|
|
// 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
|
|
|
|
// two-write-queues. Only log_write_mutex_ is held to protect concurrent
|
|
|
|
// pop_front() by FindObsoleteFiles().
|
|
|
|
// 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
|
|
|
|
// is held to protect the data structure from concurrent pop_front() by
|
|
|
|
// FindObsoleteFiles().
|
|
|
|
// 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
|
|
|
|
// of two-write-queues. Only log_write_mutex_ is held. This suffices to
|
|
|
|
// protect the data structure from concurrent push_back() by current
|
|
|
|
// write group leader as well as pop_front() by FindObsoleteFiles().
|
2014-04-30 18:33:40 +00:00
|
|
|
std::deque<LogFileNumberSize> alive_log_files_;
|
2022-06-01 22:33:22 +00:00
|
|
|
|
[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary:
I'll just copy internal task summary here:
"
This sequence will cause data loss in the middle after an sync write:
non-sync write key 1
flush triggered, not yet scheduled
sync write key 2
system crash
After rebooting, users might see key 2 but not key 1, which violates the API of sync write.
This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest.
One way to fix it is for a sync write, if there is outstanding unsynced log files, we need to syc them too.
"
This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler.
Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind
Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D40899
2015-07-22 10:28:08 +00:00
|
|
|
// Log files that aren't fully synced, and the current log file.
|
|
|
|
// Synchronization:
|
2022-07-21 20:35:36 +00:00
|
|
|
// 1. read by FindObsoleteFiles() which can be called either in application
|
|
|
|
// thread or RocksDB bg threads. log_write_mutex_ is always held, while
|
|
|
|
// some reads are performed without mutex_.
|
|
|
|
// 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
|
|
|
|
// 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
|
|
|
|
// 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex.
|
|
|
|
// Note that at this point, DB::Open() has not returned success to
|
|
|
|
// application, thus the only other thread(s) that can conflict are bg
|
|
|
|
// threads calling FindObsoleteFiles(). See 1.
|
|
|
|
// 5. iteration and clear() from CloseHelper() always hold log_write_mutex
|
|
|
|
// and mutex_.
|
|
|
|
// 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
|
|
|
|
// log_write_mutex_. These two can be called by application threads after
|
|
|
|
// DB::Open() returns success to applications.
|
|
|
|
// 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
|
|
|
|
// 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by
|
|
|
|
// log_write_mutex_.
|
|
|
|
// 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
|
|
|
|
// 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can
|
|
|
|
// happen in bg flush threads after DB::Open() returns success to
|
|
|
|
// applications.
|
|
|
|
// 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
|
|
|
|
// holds only the log_write_mutex_. This is done by the write group
|
|
|
|
// leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
|
|
|
|
// can happen concurrently. This is fine because log_write_mutex_ is used
|
|
|
|
// by all parties. See 2, 5, 9.
|
|
|
|
// 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
|
|
|
|
// log_write_mutex_. This happens in the write group leader.
|
|
|
|
// 13. emplace_back() by SwitchMemtable() hold both mutex_ and
|
|
|
|
// log_write_mutex_. This happens in the write group leader. Can conflict
|
|
|
|
// with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
|
|
|
|
// SyncClosedLogs(), etc. as well as application threads calling
|
|
|
|
// FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
|
|
|
|
// require at least log_write_mutex_.
|
|
|
|
// 14. iteration called in WriteToWAL(write_group) protected by
|
|
|
|
// log_write_mutex_. This is done by write group leader when
|
|
|
|
// two-write-queues is disabled and write needs to sync logs.
|
|
|
|
// 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
|
|
|
|
// This can be done by the write group leader if two-write-queues is
|
|
|
|
// enabled. It can also be done by another WAL-only write thread.
|
|
|
|
//
|
|
|
|
// Other observations:
|
[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary:
I'll just copy internal task summary here:
"
This sequence will cause data loss in the middle after an sync write:
non-sync write key 1
flush triggered, not yet scheduled
sync write key 2
system crash
After rebooting, users might see key 2 but not key 1, which violates the API of sync write.
This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest.
One way to fix it is for a sync write, if there is outstanding unsynced log files, we need to syc them too.
"
This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler.
Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind
Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D40899
2015-07-22 10:28:08 +00:00
|
|
|
// - back() and items with getting_synced=true are not popped,
|
2017-06-30 16:30:03 +00:00
|
|
|
// - The same thread that sets getting_synced=true will reset it.
|
|
|
|
// - it follows that the object referred by back() can be safely read from
|
2022-07-21 20:35:36 +00:00
|
|
|
// the write_thread_ without using mutex. Note that calling back() without
|
|
|
|
// mutex may be unsafe because different implementations of deque::back() may
|
|
|
|
// access other member variables of deque, causing undefined behaviors.
|
|
|
|
// Generally, do not access stl containers without proper synchronization.
|
2017-06-30 16:30:03 +00:00
|
|
|
// - it follows that the items with getting_synced=true can be safely read
|
|
|
|
// from the same thread that has set getting_synced=true
|
[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary:
I'll just copy internal task summary here:
"
This sequence will cause data loss in the middle after an sync write:
non-sync write key 1
flush triggered, not yet scheduled
sync write key 2
system crash
After rebooting, users might see key 2 but not key 1, which violates the API of sync write.
This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest.
One way to fix it is for a sync write, if there is outstanding unsynced log files, we need to syc them too.
"
This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler.
Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind
Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D40899
2015-07-22 10:28:08 +00:00
|
|
|
std::deque<LogWriterNumber> logs_;
|
2022-07-21 20:35:36 +00:00
|
|
|
|
[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary:
I'll just copy internal task summary here:
"
This sequence will cause data loss in the middle after an sync write:
non-sync write key 1
flush triggered, not yet scheduled
sync write key 2
system crash
After rebooting, users might see key 2 but not key 1, which violates the API of sync write.
This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest.
One way to fix it is for a sync write, if there is outstanding unsynced log files, we need to syc them too.
"
This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler.
Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind
Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D40899
2015-07-22 10:28:08 +00:00
|
|
|
// Signaled when getting_synced becomes false for some of the logs_.
|
|
|
|
InstrumentedCondVar log_sync_cv_;
|
2017-11-02 00:23:52 +00:00
|
|
|
// This is the app-level state that is written to the WAL but will be used
|
|
|
|
// only during recovery. Using this feature enables not writing the state to
|
|
|
|
// memtable on normal writes and hence improving the throughput. Each new
|
|
|
|
// write of the state will replace the previous state entirely even if the
|
2022-04-05 18:10:20 +00:00
|
|
|
// keys in the two consecutive states do not overlap.
|
2017-11-11 01:18:01 +00:00
|
|
|
// It is protected by log_write_mutex_ when two_write_queues_ is enabled.
|
2017-11-02 00:23:52 +00:00
|
|
|
// Otherwise only the heaad of write_thread_ can access it.
|
|
|
|
WriteBatch cached_recoverable_state_;
|
|
|
|
std::atomic<bool> cached_recoverable_state_empty_ = {true};
|
2017-03-22 18:50:25 +00:00
|
|
|
std::atomic<uint64_t> total_log_size_;
|
2019-03-26 23:41:31 +00:00
|
|
|
|
2015-03-30 19:04:10 +00:00
|
|
|
// If this is non-empty, we need to delete these log files in background
|
2022-07-21 20:35:36 +00:00
|
|
|
// threads. Protected by log_write_mutex_.
|
2015-03-30 19:04:10 +00:00
|
|
|
autovector<log::Writer*> logs_to_free_;
|
2013-12-20 17:57:58 +00:00
|
|
|
|
2014-12-11 02:39:09 +00:00
|
|
|
bool is_snapshot_supported_;
|
|
|
|
|
2019-02-20 23:46:59 +00:00
|
|
|
std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
|
|
|
|
|
|
|
|
std::map<std::string, uint64_t> stats_slice_;
|
|
|
|
|
|
|
|
bool stats_slice_initialized_ = false;
|
|
|
|
|
2015-01-26 21:59:38 +00:00
|
|
|
Directories directories_;
|
2014-01-27 19:02:21 +00:00
|
|
|
|
2016-06-21 01:01:03 +00:00
|
|
|
WriteBufferManager* write_buffer_manager_;
|
2014-12-02 20:09:20 +00:00
|
|
|
|
2014-09-12 23:23:58 +00:00
|
|
|
WriteThread write_thread_;
|
2013-03-28 22:19:28 +00:00
|
|
|
WriteBatch tmp_batch_;
|
2017-06-24 21:06:43 +00:00
|
|
|
// The write thread when the writers have no memtable write. This will be used
|
|
|
|
// in 2PC to batch the prepares separately from the serial commit.
|
|
|
|
WriteThread nonmem_write_thread_;
|
2012-03-09 00:23:21 +00:00
|
|
|
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 18:20:25 +00:00
|
|
|
WriteController write_controller_;
|
2015-05-15 22:52:51 +00:00
|
|
|
|
|
|
|
// Size of the last batch group. In slowdown mode, next write needs to
|
|
|
|
// sleep if it uses up the quota.
|
2017-06-24 21:06:43 +00:00
|
|
|
// Note: This is to protect memtable and compaction. If the batch only writes
|
|
|
|
// to the WAL its size need not to be included in this.
|
2015-05-15 22:52:51 +00:00
|
|
|
uint64_t last_batch_group_size_;
|
|
|
|
|
2014-09-11 01:46:09 +00:00
|
|
|
FlushScheduler flush_scheduler_;
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 18:20:25 +00:00
|
|
|
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
TrimHistoryScheduler trim_history_scheduler_;
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
SnapshotList snapshots_;
|
|
|
|
|
Snapshots with user-specified timestamps (#9879)
Summary:
In RocksDB, keys are associated with (internal) sequence numbers which denote when the keys are written
to the database. Sequence numbers in different RocksDB instances are unrelated, thus not comparable.
It is nice if we can associate sequence numbers with their corresponding actual timestamps. One thing we can
do is to support user-defined timestamp, which allows the applications to specify the format of custom timestamps
and encode a timestamp with each key. More details can be found at https://github.com/facebook/rocksdb/wiki/User-defined-Timestamp-%28Experimental%29.
This PR provides a different but complementary approach. We can associate rocksdb snapshots (defined in
https://github.com/facebook/rocksdb/blob/7.2.fb/include/rocksdb/snapshot.h#L20) with **user-specified** timestamps.
Since a snapshot is essentially an object representing a sequence number, this PR establishes a bi-directional mapping between sequence numbers and timestamps.
In the past, snapshots are usually taken by readers. The current super-version is grabbed, and a `rocksdb::Snapshot`
object is created with the last published sequence number of the super-version. You can see that the reader actually
has no good idea of what timestamp to assign to this snapshot, because by the time the `GetSnapshot()` is called,
an arbitrarily long period of time may have already elapsed since the last write, which is when the last published
sequence number is written.
This observation motivates the creation of "timestamped" snapshots on the write path. Currently, this functionality is
exposed only to the layer of `TransactionDB`. Application can tell RocksDB to create a snapshot when a transaction
commits, effectively associating the last sequence number with a timestamp. It is also assumed that application will
ensure any two snapshots with timestamps should satisfy the following:
```
snapshot1.seq < snapshot2.seq iff. snapshot1.ts < snapshot2.ts
```
If the application can guarantee that when a reader takes a timestamped snapshot, there is no active writes going on
in the database, then we also allow the user to use a new API `TransactionDB::CreateTimestampedSnapshot()` to create
a snapshot with associated timestamp.
Code example
```cpp
// Create a timestamped snapshot when committing transaction.
txn->SetCommitTimestamp(100);
txn->SetSnapshotOnNextOperation();
txn->Commit();
// A wrapper API for convenience
Status Transaction::CommitAndTryCreateSnapshot(
std::shared_ptr<TransactionNotifier> notifier,
TxnTimestamp ts,
std::shared_ptr<const Snapshot>* ret);
// Create a timestamped snapshot if caller guarantees no concurrent writes
std::pair<Status, std::shared_ptr<const Snapshot>> snapshot = txn_db->CreateTimestampedSnapshot(100);
```
The snapshots created in this way will be managed by RocksDB with ref-counting and potentially shared with
other readers. We provide the following APIs for readers to retrieve a snapshot given a timestamp.
```cpp
// Return the timestamped snapshot correponding to given timestamp. If ts is
// kMaxTxnTimestamp, then we return the latest timestamped snapshot if present.
// Othersise, we return the snapshot whose timestamp is equal to `ts`. If no
// such snapshot exists, then we return null.
std::shared_ptr<const Snapshot> TransactionDB::GetTimestampedSnapshot(TxnTimestamp ts) const;
// Return the latest timestamped snapshot if present.
std::shared_ptr<const Snapshot> TransactionDB::GetLatestTimestampedSnapshot() const;
```
We also provide two additional APIs for stats collection and reporting purposes.
```cpp
Status TransactionDB::GetAllTimestampedSnapshots(
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
// Return timestamped snapshots whose timestamps fall in [ts_lb, ts_ub) and store them in `snapshots`.
Status TransactionDB::GetTimestampedSnapshots(
TxnTimestamp ts_lb,
TxnTimestamp ts_ub,
std::vector<std::shared_ptr<const Snapshot>>& snapshots) const;
```
To prevent the number of timestamped snapshots from growing infinitely, we provide the following API to release
timestamped snapshots whose timestamps are older than or equal to a given threshold.
```cpp
void TransactionDB::ReleaseTimestampedSnapshotsOlderThan(TxnTimestamp ts);
```
Before shutdown, RocksDB will release all timestamped snapshots.
Comparison with user-defined timestamp and how they can be combined:
User-defined timestamp persists every key with a timestamp, while timestamped snapshots maintain a volatile
mapping between snapshots (sequence numbers) and timestamps.
Different internal keys with the same user key but different timestamps will be treated as different by compaction,
thus a newer version will not hide older versions (with smaller timestamps) unless they are eligible for garbage collection.
In contrast, taking a timestamped snapshot at a certain sequence number and timestamp prevents all the keys visible in
this snapshot from been dropped by compaction. Here, visible means (seq < snapshot and most recent).
The timestamped snapshot supports the semantics of reading at an exact point in time.
Timestamped snapshots can also be used with user-defined timestamp.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9879
Test Plan:
```
make check
TEST_TMPDIR=/dev/shm make crash_test_with_txn
```
Reviewed By: siying
Differential Revision: D35783919
Pulled By: riversand963
fbshipit-source-id: 586ad905e169189e19d3bfc0cb0177a7239d1bd4
2022-06-10 23:07:03 +00:00
|
|
|
TimestampedSnapshotList timestamped_snapshots_;
|
|
|
|
|
2014-11-07 19:50:34 +00:00
|
|
|
// For each background job, pending_outputs_ keeps the current file number at
|
|
|
|
// the time that background job started.
|
|
|
|
// FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
|
|
|
|
// number bigger than any of the file number in pending_outputs_. Since file
|
|
|
|
// numbers grow monotonically, this also means that pending_outputs_ is always
|
|
|
|
// sorted. After a background job is done executing, its file number is
|
|
|
|
// deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
|
|
|
|
// it up.
|
|
|
|
// State is protected with db mutex.
|
|
|
|
std::list<uint64_t> pending_outputs_;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 19:38:12 +00:00
|
|
|
// flush_queue_ and compaction_queue_ hold column families that we need to
|
|
|
|
// flush and compact, respectively.
|
|
|
|
// A column family is inserted into flush_queue_ when it satisfies condition
|
|
|
|
// cfd->imm()->IsFlushPending()
|
|
|
|
// A column family is inserted into compaction_queue_ when it satisfied
|
|
|
|
// condition cfd->NeedsCompaction()
|
|
|
|
// Column families in this list are all Ref()-erenced
|
|
|
|
// TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
|
|
|
|
// do RAII on ColumnFamilyData
|
|
|
|
// Column families are in this queue when they need to be flushed or
|
|
|
|
// compacted. Consumers of these queues are flush and compaction threads. When
|
|
|
|
// column family is put on this queue, we increase unscheduled_flushes_ and
|
|
|
|
// unscheduled_compactions_. When these variables are bigger than zero, that
|
2019-01-07 21:50:26 +00:00
|
|
|
// means we need to schedule background threads for flush and compaction.
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 19:38:12 +00:00
|
|
|
// Once the background threads are scheduled, we decrease unscheduled_flushes_
|
|
|
|
// and unscheduled_compactions_. That way we keep track of number of
|
|
|
|
// compaction and flush threads we need to schedule. This scheduling is done
|
|
|
|
// in MaybeScheduleFlushOrCompaction()
|
|
|
|
// invariant(column family present in flush_queue_ <==>
|
|
|
|
// ColumnFamilyData::pending_flush_ == true)
|
2018-08-24 20:17:29 +00:00
|
|
|
std::deque<FlushRequest> flush_queue_;
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 19:38:12 +00:00
|
|
|
// invariant(column family present in compaction_queue_ <==>
|
|
|
|
// ColumnFamilyData::pending_compaction_ == true)
|
|
|
|
std::deque<ColumnFamilyData*> compaction_queue_;
|
2016-06-22 01:41:23 +00:00
|
|
|
|
2019-09-17 23:43:07 +00:00
|
|
|
// A map to store file numbers and filenames of the files to be purged
|
|
|
|
std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
|
2016-08-04 00:42:06 +00:00
|
|
|
|
2018-03-28 17:23:31 +00:00
|
|
|
// A vector to store the file numbers that have been assigned to certain
|
2020-05-07 16:29:21 +00:00
|
|
|
// JobContext. Current implementation tracks table and blob files only.
|
2019-09-17 23:43:07 +00:00
|
|
|
std::unordered_set<uint64_t> files_grabbed_for_purge_;
|
2018-03-28 17:23:31 +00:00
|
|
|
|
2022-07-21 20:35:36 +00:00
|
|
|
// A queue to store log writers to close. Protected by db mutex_.
|
2016-08-04 00:42:06 +00:00
|
|
|
std::deque<log::Writer*> logs_to_free_queue_;
|
2022-07-21 20:35:36 +00:00
|
|
|
|
2019-12-17 21:20:42 +00:00
|
|
|
std::deque<SuperVersion*> superversions_to_free_queue_;
|
2022-07-21 20:35:36 +00:00
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 19:38:12 +00:00
|
|
|
int unscheduled_flushes_;
|
2022-07-21 20:35:36 +00:00
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 19:38:12 +00:00
|
|
|
int unscheduled_compactions_;
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
2014-03-11 17:36:30 +00:00
|
|
|
|
2017-08-03 22:36:28 +00:00
|
|
|
// count how many background compactions are running or have been scheduled in
|
|
|
|
// the BOTTOM pool
|
|
|
|
int bg_bottom_compaction_scheduled_;
|
|
|
|
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-21 23:10:39 +00:00
|
|
|
// count how many background compactions are running or have been scheduled
|
2012-10-19 21:00:53 +00:00
|
|
|
int bg_compaction_scheduled_;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2015-10-17 07:16:36 +00:00
|
|
|
// stores the number of compactions are currently running
|
|
|
|
int num_running_compactions_;
|
|
|
|
|
2013-09-13 21:38:37 +00:00
|
|
|
// number of background memtable flush jobs, submitted to the HIGH pool
|
|
|
|
int bg_flush_scheduled_;
|
|
|
|
|
2015-10-17 07:16:36 +00:00
|
|
|
// stores the number of flushes are currently running
|
|
|
|
int num_running_flushes_;
|
|
|
|
|
2016-06-22 01:41:23 +00:00
|
|
|
// number of background obsolete file purge jobs, submitted to the HIGH pool
|
|
|
|
int bg_purge_scheduled_;
|
|
|
|
|
2017-08-03 22:36:28 +00:00
|
|
|
std::deque<ManualCompactionState*> manual_compaction_dequeue_;
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
2015-12-14 19:20:34 +00:00
|
|
|
|
2012-09-15 00:11:35 +00:00
|
|
|
// shall we disable deletion of obsolete files
|
2014-01-02 11:33:42 +00:00
|
|
|
// if 0 the deletion is enabled.
|
|
|
|
// if non-zero, files will not be getting deleted
|
|
|
|
// This enables two different threads to call
|
|
|
|
// EnableFileDeletions() and DisableFileDeletions()
|
|
|
|
// without any synchronization
|
|
|
|
int disable_delete_obsolete_files_;
|
2012-09-15 00:11:35 +00:00
|
|
|
|
2018-01-18 01:37:10 +00:00
|
|
|
// Number of times FindObsoleteFiles has found deletable files and the
|
|
|
|
// corresponding call to PurgeObsoleteFiles has not yet finished.
|
|
|
|
int pending_purge_obsolete_files_;
|
|
|
|
|
2019-05-14 00:43:47 +00:00
|
|
|
// last time when DeleteObsoleteFiles with full scan was executed. Originally
|
2016-12-05 22:09:35 +00:00
|
|
|
// initialized with startup time.
|
|
|
|
uint64_t delete_obsolete_files_last_run_;
|
2012-10-16 15:53:46 +00:00
|
|
|
|
2019-05-14 00:43:47 +00:00
|
|
|
// The thread that wants to switch memtable, can wait on this cv until the
|
|
|
|
// pending writes to memtable finishes.
|
|
|
|
std::condition_variable switch_cv_;
|
|
|
|
// The mutex used by switch_cv_. mutex_ should be acquired beforehand.
|
|
|
|
std::mutex switch_mutex_;
|
|
|
|
// Number of threads intending to write to memtable
|
|
|
|
std::atomic<size_t> pending_memtable_writes_ = {};
|
|
|
|
|
2016-02-09 19:20:22 +00:00
|
|
|
// A flag indicating whether the current rocksdb database has any
|
|
|
|
// data that is not yet persisted into either WAL or SST file.
|
|
|
|
// Used when disableWAL is true.
|
2017-02-14 02:44:21 +00:00
|
|
|
std::atomic<bool> has_unpersisted_data_;
|
2017-01-19 23:21:07 +00:00
|
|
|
|
|
|
|
// if an attempt was made to flush all column families that
|
2019-01-02 19:15:01 +00:00
|
|
|
// the oldest log depends on but uncommitted data in the oldest
|
2017-01-19 23:21:07 +00:00
|
|
|
// log prevents the log from being released.
|
|
|
|
// We must attempt to free the dependent memtables again
|
|
|
|
// at a later time after the transaction in the oldest
|
|
|
|
// log is fully commited.
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
bool unable_to_release_oldest_log_;
|
2017-01-19 23:21:07 +00:00
|
|
|
|
Export Import sst files (#5495)
Summary:
Refresh of the earlier change here - https://github.com/facebook/rocksdb/issues/5135
This is a review request for code change needed for - https://github.com/facebook/rocksdb/issues/3469
"Add support for taking snapshot of a column family and creating column family from a given CF snapshot"
We have an implementation for this that we have been testing internally. We have two new APIs that together provide this functionality.
(1) ExportColumnFamily() - This API is modelled after CreateCheckpoint() as below.
// Exports all live SST files of a specified Column Family onto export_dir,
// returning SST files information in metadata.
// - SST files will be created as hard links when the directory specified
// is in the same partition as the db directory, copied otherwise.
// - export_dir should not already exist and will be created by this API.
// - Always triggers a flush.
virtual Status ExportColumnFamily(ColumnFamilyHandle* handle,
const std::string& export_dir,
ExportImportFilesMetaData** metadata);
Internally, the API will DisableFileDeletions(), GetColumnFamilyMetaData(), Parse through
metadata, creating links/copies of all the sst files, EnableFileDeletions() and complete the call by
returning the list of file metadata.
(2) CreateColumnFamilyWithImport() - This API is modeled after IngestExternalFile(), but invoked only during a CF creation as below.
// CreateColumnFamilyWithImport() will create a new column family with
// column_family_name and import external SST files specified in metadata into
// this column family.
// (1) External SST files can be created using SstFileWriter.
// (2) External SST files can be exported from a particular column family in
// an existing DB.
// Option in import_options specifies whether the external files are copied or
// moved (default is copy). When option specifies copy, managing files at
// external_file_path is caller's responsibility. When option specifies a
// move, the call ensures that the specified files at external_file_path are
// deleted on successful return and files are not modified on any error
// return.
// On error return, column family handle returned will be nullptr.
// ColumnFamily will be present on successful return and will not be present
// on error return. ColumnFamily may be present on any crash during this call.
virtual Status CreateColumnFamilyWithImport(
const ColumnFamilyOptions& options, const std::string& column_family_name,
const ImportColumnFamilyOptions& import_options,
const ExportImportFilesMetaData& metadata,
ColumnFamilyHandle** handle);
Internally, this API creates a new CF, parses all the sst files and adds it to the specified column family, at the same level and with same sequence number as in the metadata. Also performs safety checks with respect to overlaps between the sst files being imported.
If incoming sequence number is higher than current local sequence number, local sequence
number is updated to reflect this.
Note, as the sst files is are being moved across Column Families, Column Family name in sst file
will no longer match the actual column family on destination DB. The API does not modify Column
Family name or id in the sst files being imported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5495
Differential Revision: D16018881
fbshipit-source-id: 9ae2251025d5916d35a9fc4ea4d6707f6be16ff9
2019-07-17 19:22:21 +00:00
|
|
|
// Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
|
|
|
|
// calls.
|
2016-09-26 21:44:48 +00:00
|
|
|
// REQUIRES: mutex held
|
2016-10-21 00:05:32 +00:00
|
|
|
int num_running_ingest_file_;
|
2016-09-26 21:44:48 +00:00
|
|
|
|
2014-10-30 00:43:37 +00:00
|
|
|
WalManager wal_manager_;
|
|
|
|
|
Fix a bug where flush does not happen when a manual compaction is running
Summary:
Currently, when rocksdb tries to run manual compaction to refit data into a level,
there's a ReFitLevel() process that requires no bg work is currently running.
When RocksDB plans to ReFitLevel(), it will do the following:
1. pause scheduling new bg work.
2. wait until all bg work finished
3. do the ReFitLevel()
4. unpause scheduling new bg work.
However, as it pause scheduling new bg work at step one and waiting for all bg work
finished in step 2, RocksDB will stop flushing until all bg work is done (which
could take a long time.)
This patch fix this issue by changing the way ReFitLevel() pause the background work:
1. pause scheduling compaction.
2. wait until all bg work finished.
3. pause scheduling flush
4. do ReFitLevel()
5. unpause both flush and compaction.
The major difference is that. We only pause scheduling compaction in step 1 and wait
for all bg work finished in step 2. This prevent flush being blocked for a long time.
Although there's a very rare case that ReFitLevel() might be in starvation in step 2,
but it's less likely the case as flush typically finish very fast.
Test Plan: existing test.
Reviewers: anthony, IslamAbdelRahman, kradhakrishnan, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D55029
2016-03-04 22:24:52 +00:00
|
|
|
// A value of > 0 temporarily disables scheduling of background work
|
2015-10-02 20:17:34 +00:00
|
|
|
int bg_work_paused_;
|
2013-06-30 06:21:36 +00:00
|
|
|
|
Fix a bug where flush does not happen when a manual compaction is running
Summary:
Currently, when rocksdb tries to run manual compaction to refit data into a level,
there's a ReFitLevel() process that requires no bg work is currently running.
When RocksDB plans to ReFitLevel(), it will do the following:
1. pause scheduling new bg work.
2. wait until all bg work finished
3. do the ReFitLevel()
4. unpause scheduling new bg work.
However, as it pause scheduling new bg work at step one and waiting for all bg work
finished in step 2, RocksDB will stop flushing until all bg work is done (which
could take a long time.)
This patch fix this issue by changing the way ReFitLevel() pause the background work:
1. pause scheduling compaction.
2. wait until all bg work finished.
3. pause scheduling flush
4. do ReFitLevel()
5. unpause both flush and compaction.
The major difference is that. We only pause scheduling compaction in step 1 and wait
for all bg work finished in step 2. This prevent flush being blocked for a long time.
Although there's a very rare case that ReFitLevel() might be in starvation in step 2,
but it's less likely the case as flush typically finish very fast.
Test Plan: existing test.
Reviewers: anthony, IslamAbdelRahman, kradhakrishnan, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D55029
2016-03-04 22:24:52 +00:00
|
|
|
// A value of > 0 temporarily disables scheduling of background compaction
|
|
|
|
int bg_compaction_paused_;
|
|
|
|
|
2013-06-30 06:21:36 +00:00
|
|
|
// Guard against multiple concurrent refitting
|
|
|
|
bool refitting_level_;
|
|
|
|
|
2014-02-27 19:38:55 +00:00
|
|
|
// Indicate DB was opened successfully
|
|
|
|
bool opened_successfully_;
|
|
|
|
|
2019-03-26 02:14:04 +00:00
|
|
|
// The min threshold to triggere bottommost compaction for removing
|
|
|
|
// garbages, among all column families.
|
|
|
|
SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
|
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
LogsWithPrepTracker logs_with_prep_tracker_;
|
2016-04-18 18:11:51 +00:00
|
|
|
|
2017-10-06 17:26:38 +00:00
|
|
|
// Callback for compaction to check if a key is visible to a snapshot.
|
|
|
|
// REQUIRES: mutex held
|
|
|
|
std::unique_ptr<SnapshotChecker> snapshot_checker_;
|
|
|
|
|
2018-03-28 19:01:09 +00:00
|
|
|
// Callback for when the cached_recoverable_state_ is written to memtable
|
|
|
|
// Only to be set during initialization
|
|
|
|
std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
|
|
|
|
|
2020-10-02 02:12:26 +00:00
|
|
|
// Scheduler to run DumpStats(), PersistStats(), and FlushInfoLog().
|
2022-08-26 01:52:37 +00:00
|
|
|
// Currently, internally it has a global timer instance for running the tasks.
|
|
|
|
PeriodicTaskScheduler periodic_task_scheduler_;
|
|
|
|
|
|
|
|
// It contains the implementations for each periodic task.
|
|
|
|
std::map<PeriodicTaskType, const PeriodicTaskFunc> periodic_task_functions_;
|
2019-02-20 23:46:59 +00:00
|
|
|
|
2020-02-28 22:10:51 +00:00
|
|
|
// When set, we use a separate queue for writes that don't write to memtable.
|
2018-03-02 01:50:54 +00:00
|
|
|
// In 2PC these are the writes at Prepare phase.
|
2017-11-11 01:18:01 +00:00
|
|
|
const bool two_write_queues_;
|
2017-06-24 21:06:43 +00:00
|
|
|
const bool manual_wal_flush_;
|
2019-04-24 19:05:29 +00:00
|
|
|
|
2017-12-01 07:39:56 +00:00
|
|
|
// LastSequence also indicates last published sequence visibile to the
|
|
|
|
// readers. Otherwise LastPublishedSequence should be used.
|
|
|
|
const bool last_seq_same_as_publish_seq_;
|
2017-11-11 01:18:01 +00:00
|
|
|
// It indicates that a customized gc algorithm must be used for
|
|
|
|
// flush/compaction and if it is not provided vis SnapshotChecker, we should
|
|
|
|
// disable gc to be safe.
|
2017-10-18 16:09:31 +00:00
|
|
|
const bool use_custom_gc_;
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
2018-09-15 20:36:19 +00:00
|
|
|
// Flag to indicate that the DB instance shutdown has been initiated. This
|
|
|
|
// different from shutting_down_ atomic in that it is set at the beginning
|
|
|
|
// of shutdown sequence, specifically in order to prevent any background
|
|
|
|
// error recovery from going on in parallel. The latter, shutting_down_,
|
|
|
|
// is set a little later during the shutdown after scheduling memtable
|
|
|
|
// flushes
|
move dump stats to a separate thread (#4382)
Summary:
Currently statistics are supposed to be dumped to info log at intervals of `options.stats_dump_period_sec`. However the implementation choice was to bind it with compaction thread, meaning if the database has been serving very light traffic, the stats may not get dumped at all.
We decided to separate stats dumping into a new timed thread using `TimerQueue`, which is already used in blob_db. This will allow us schedule new timed tasks with more deterministic behavior.
Tested with db_bench using `--stats_dump_period_sec=20` in command line:
> LOG:2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:05.643286 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:25.691325 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG:2018/09/17-14:08:45.740989 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
LOG content:
> 2018/09/17-14:07:45.575025 7fe99fbfe700 [WARN] [db/db_impl.cc:605] ------- DUMPING STATS -------
2018/09/17-14:07:45.575080 7fe99fbfe700 [WARN] [db/db_impl.cc:606]
** DB Stats **
Uptime(secs): 20.0 total, 20.0 interval
Cumulative writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5.57 GB, 285.01 MB/s
Cumulative WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 GB, 285.01 MB/s
Cumulative stall: 00:00:0.012 H:M:S, 0.1 percent
Interval writes: 4447K writes, 4447K keys, 4447K commit groups, 1.0 writes per commit group, ingest: 5700.71 MB, 285.01 MB/s
Interval WAL: 4447K writes, 0 syncs, 4447638.00 writes per sync, written: 5.57 MB, 285.01 MB/s
Interval stall: 00:00:0.012 H:M:S, 0.1 percent
** Compaction Stats [default] **
Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4382
Differential Revision: D9933051
Pulled By: miasantreble
fbshipit-source-id: 6d12bb1e4977674eea4bf2d2ac6d486b814bb2fa
2018-10-09 05:52:58 +00:00
|
|
|
std::atomic<bool> shutdown_initiated_;
|
2018-09-17 20:08:13 +00:00
|
|
|
// Flag to indicate whether sst_file_manager object was allocated in
|
|
|
|
// DB::Open() or passed to us
|
|
|
|
bool own_sfm_;
|
Added support for differential snapshots
Summary:
The motivation for this PR is to add to RocksDB support for differential (incremental) snapshots, as snapshot of the DB changes between two points in time (one can think of it as diff between to sequence numbers, or the diff D which can be thought of as an SST file or just set of KVs that can be applied to sequence number S1 to get the database to the state at sequence number S2).
This feature would be useful for various distributed storages layers built on top of RocksDB, as it should help reduce resources (time and network bandwidth) needed to recover and rebuilt DB instances as replicas in the context of distributed storages.
From the API standpoint that would like client app requesting iterator between (start seqnum) and current DB state, and reading the "diff".
This is a very draft PR for initial review in the discussion on the approach, i'm going to rework some parts and keep updating the PR.
For now, what's done here according to initial discussions:
Preserving deletes:
- We want to be able to optionally preserve recent deletes for some defined period of time, so that if a delete came in recently and might need to be included in the next incremental snapshot it would't get dropped by a compaction. This is done by adding new param to Options (preserve deletes flag) and new variable to DB Impl where we keep track of the sequence number after which we don't want to drop tombstones, even if they are otherwise eligible for deletion.
- I also added a new API call for clients to be able to advance this cutoff seqnum after which we drop deletes; i assume it's more flexible to let clients control this, since otherwise we'd need to keep some kind of timestamp < -- > seqnum mapping inside the DB, which sounds messy and painful to support. Clients could make use of it by periodically calling GetLatestSequenceNumber(), noting the timestamp, doing some calculation and figuring out by how much we need to advance the cutoff seqnum.
- Compaction codepath in compaction_iterator.cc has been modified to avoid dropping tombstones with seqnum > cutoff seqnum.
Iterator changes:
- couple params added to ReadOptions, to optionally allow client to request internal keys instead of user keys (so that client can get the latest value of a key, be it delete marker or a put), as well as min timestamp and min seqnum.
TableCache changes:
- I modified table_cache code to be able to quickly exclude SST files from iterators heep if creation_time on the file is less then iter_start_ts as passed in ReadOptions. That would help a lot in some DB settings (like reading very recent data only or using FIFO compactions), but not so much for universal compaction with more or less long iterator time span.
What's left:
- Still looking at how to best plug that inside DBIter codepath. So far it seems that FindNextUserKeyInternal only parses values as UserKeys, and iter->key() call generally returns user key. Can we add new API to DBIter as internal_key(), and modify this internal method to optionally set saved_key_ to point to the full internal key? I don't need to store actual seqnum there, but I do need to store type.
Closes https://github.com/facebook/rocksdb/pull/2999
Differential Revision: D6175602
Pulled By: mikhail-antonov
fbshipit-source-id: c779a6696ee2d574d86c69cec866a3ae095aa900
2017-11-02 01:43:29 +00:00
|
|
|
|
2018-01-16 18:57:56 +00:00
|
|
|
// Flag to check whether Close() has been called on this DB
|
|
|
|
bool closed_;
|
2021-10-19 03:31:32 +00:00
|
|
|
// save the closing status, for re-calling the close()
|
|
|
|
Status closing_status_;
|
|
|
|
// mutex for DB::Close()
|
|
|
|
InstrumentedMutex closing_mutex_;
|
2018-06-28 19:23:57 +00:00
|
|
|
|
2019-01-04 04:53:52 +00:00
|
|
|
// Conditional variable to coordinate installation of atomic flush results.
|
|
|
|
// With atomic flush, each bg thread installs the result of flushing multiple
|
|
|
|
// column families, and different threads can flush different column
|
|
|
|
// families. It's difficult to rely on one thread to perform batch
|
|
|
|
// installation for all threads. This is different from the non-atomic flush
|
|
|
|
// case.
|
|
|
|
// atomic_flush_install_cv_ makes sure that threads install atomic flush
|
|
|
|
// results sequentially. Flush results of memtables with lower IDs get
|
|
|
|
// installed to MANIFEST first.
|
|
|
|
InstrumentedCondVar atomic_flush_install_cv_;
|
2019-07-07 04:04:22 +00:00
|
|
|
|
|
|
|
bool wal_in_db_path_;
|
2022-07-21 20:35:36 +00:00
|
|
|
std::atomic<uint64_t> max_total_wal_size_;
|
2021-03-18 03:43:22 +00:00
|
|
|
|
|
|
|
BlobFileCompletionCallback blob_callback_;
|
2021-04-21 20:53:05 +00:00
|
|
|
|
|
|
|
// Pointer to WriteBufferManager stalling interface.
|
|
|
|
std::unique_ptr<StallInterface> wbm_stall_;
|
2022-07-15 04:49:34 +00:00
|
|
|
|
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905)
Summary:
This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up**
However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping.
Functional fixes / changes:
* This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc
* That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.)
* Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function.
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples.
Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work.
An apparent compaction bug is revealed in
PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909
Cosmetic / code safety things (not exhaustive):
* Fix some confusing names.
* `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name.
* Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing.
* Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions.
* Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind.
* (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.)
* A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`).
* Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905
Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others.
Reviewed By: jowlyzhang
Differential Revision: D49755592
Pulled By: pdillinger
fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
|
|
|
// seqno_to_time_mapping_ stores the sequence number to time mapping, it's not
|
2022-07-16 02:01:30 +00:00
|
|
|
// thread safe, both read and write need db mutex hold.
|
Refactor, clean up, fixes, and more testing for SeqnoToTimeMapping (#11905)
Summary:
This change is before a planned DBImpl change to ensure all sufficiently recent sequence numbers since Open are covered by SeqnoToTimeMapping (bug fix with existing test work-arounds). **Intended follow-up**
However, I found enough issues with SeqnoToTimeMapping to warrant this PR first, including very small fixes in DB implementation related to API contract of SeqnoToTimeMapping.
Functional fixes / changes:
* This fixes some mishandling of boundary cases. For example, if the user decides to stop writing to DB, the last written sequence number would perpetually have its write time updated to "now" and would always be ineligible for migration to cold tier. Part of the problem is that the SeqnoToTimeMapping would return a seqno known to have been written before (immediately or otherwise) the requested time, but compaction_job.cc would include that seqno in the preserve/exclude set. That is fixed (in part) by adding one in compaction_job.cc
* That problem was worse because a whole range of seqnos could be updated perpetually with new times in SeqnoToTimeMapping::Append (if no writes to DB). That logic was apparently optimized for GetOldestApproximateTime (now GetProximalTimeBeforeSeqno), which is not used in production, to the detriment of GetOldestSequenceNum (now GetProximalSeqnoBeforeTime), which is used in production. (Perhaps plans changed during development?) This is fixed in Append to optimize for accuracy of GetProximalSeqnoBeforeTime. (Unit tests added and updated.)
* Related: SeqnoToTimeMapping did not have a clear contract about the relationships between seqnos and times, just the idea of a rough correspondence. Now the class description makes it clear that the write time of each recorded seqno comes before or at the associated time, to support getting best results for GetProximalSeqnoBeforeTime. And this makes it easier to make clear the contract of each API function.
* Update `DBImpl::RecordSeqnoToTimeMapping()` to follow this ordering in gathering samples.
Some part of these changes has required an expanded test work-around for the problem (see intended follow-up above) that the DB does not immediately ensure recent seqnos are covered by its mapping. These work-arounds will be removed with that planned work.
An apparent compaction bug is revealed in
PrecludeLastLevelTest::RangeDelsCauseFileEndpointsToOverlap, so that test is disabled. Filed GitHub issue #11909
Cosmetic / code safety things (not exhaustive):
* Fix some confusing names.
* `seqno_time_mapping` was used inconsistently in places. Now just `seqno_to_time_mapping` to correspond to class name.
* Rename confusing `GetOldestSequenceNum` -> `GetProximalSeqnoBeforeTime` and `GetOldestApproximateTime` -> `GetProximalTimeBeforeSeqno`. Part of the motivation is that our times and seqnos here have the same underlying type, so we want to be clear about which is expected where to avoid mixing.
* Rename `kUnknownSeqnoTime` to `kUnknownTimeBeforeAll` because the value is a bad choice for unknown if we ever add ProximalAfterBlah functions.
* Arithmetic on SeqnoTimePair doesn't make sense except for delta encoding, so use better names / APIs with that in mind.
* (OMG) Don't allow direct comparison between SeqnoTimePair and SequenceNumber. (There is no checking that it isn't compared against time by accident.)
* A field name essentially matching the containing class name is a confusing pattern (`seqno_time_mapping_`).
* Wrap calls to confusing (but useful) upper_bound and lower_bound functions to have clearer names and more code reuse.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11905
Test Plan: GetOldestSequenceNum (now GetProximalSeqnoBeforeTime) and TruncateOldEntries were lacking unit tests, despite both being used in production (experimental feature). Added those and expanded others.
Reviewed By: jowlyzhang
Differential Revision: D49755592
Pulled By: pdillinger
fbshipit-source-id: f72a3baac74d24b963c77e538bba89a7fc8dce51
2023-09-29 18:21:59 +00:00
|
|
|
SeqnoToTimeMapping seqno_to_time_mapping_;
|
2022-12-14 05:45:00 +00:00
|
|
|
|
Cleanup, improve, stress test LockWAL() (#11143)
Summary:
The previous API comments for LockWAL didn't provide much about why you might want to use it, and didn't really meet what one would infer its contract was. Also, LockWAL was not in db_stress / crash test. In this change:
* Implement a counting semantics for LockWAL()+UnlockWAL(), so that they can safely be used concurrently across threads or recursively within a thread. This should make the API much less bug-prone and easier to use.
* Make sure no UnlockWAL() is needed after non-OK LockWAL() (to match RocksDB conventions)
* Make UnlockWAL() reliably return non-OK when there's no matching LockWAL() (for debug-ability)
* Clarify API comments on LockWAL(), UnlockWAL(), FlushWAL(), and SyncWAL(). Their exact meanings are not obvious, and I don't think it's appropriate to talk about implementation mutexes in the API comments, but about what operations might block each other.
* Add LockWAL()/UnlockWAL() to db_stress and crash test, mostly to check for assertion failures, but also checks that latest seqno doesn't change while WAL is locked. This is simpler to add when LockWAL() is allowed in multiple threads.
* Remove unnecessary use of sync points in test DBWALTest::LockWal. There was a bug during development of above changes that caused this test to fail sporadically, with and without this sync point change.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11143
Test Plan: unit tests added / updated, added to stress/crash test
Reviewed By: ajkr
Differential Revision: D42848627
Pulled By: pdillinger
fbshipit-source-id: 6d976c51791941a31fd8fbf28b0f82e888d9f4b4
2023-01-31 06:52:30 +00:00
|
|
|
// Stop write token that is acquired when first LockWAL() is called.
|
|
|
|
// Destroyed when last UnlockWAL() is called. Controlled by DB mutex.
|
|
|
|
// See lock_wal_count_
|
2022-12-14 05:45:00 +00:00
|
|
|
std::unique_ptr<WriteControllerToken> lock_wal_write_token_;
|
Cleanup, improve, stress test LockWAL() (#11143)
Summary:
The previous API comments for LockWAL didn't provide much about why you might want to use it, and didn't really meet what one would infer its contract was. Also, LockWAL was not in db_stress / crash test. In this change:
* Implement a counting semantics for LockWAL()+UnlockWAL(), so that they can safely be used concurrently across threads or recursively within a thread. This should make the API much less bug-prone and easier to use.
* Make sure no UnlockWAL() is needed after non-OK LockWAL() (to match RocksDB conventions)
* Make UnlockWAL() reliably return non-OK when there's no matching LockWAL() (for debug-ability)
* Clarify API comments on LockWAL(), UnlockWAL(), FlushWAL(), and SyncWAL(). Their exact meanings are not obvious, and I don't think it's appropriate to talk about implementation mutexes in the API comments, but about what operations might block each other.
* Add LockWAL()/UnlockWAL() to db_stress and crash test, mostly to check for assertion failures, but also checks that latest seqno doesn't change while WAL is locked. This is simpler to add when LockWAL() is allowed in multiple threads.
* Remove unnecessary use of sync points in test DBWALTest::LockWal. There was a bug during development of above changes that caused this test to fail sporadically, with and without this sync point change.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11143
Test Plan: unit tests added / updated, added to stress/crash test
Reviewed By: ajkr
Differential Revision: D42848627
Pulled By: pdillinger
fbshipit-source-id: 6d976c51791941a31fd8fbf28b0f82e888d9f4b4
2023-01-31 06:52:30 +00:00
|
|
|
|
|
|
|
// The number of LockWAL called without matching UnlockWAL call.
|
|
|
|
// See also lock_wal_write_token_
|
|
|
|
uint32_t lock_wal_count_;
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
2022-05-20 01:39:41 +00:00
|
|
|
class GetWithTimestampReadCallback : public ReadCallback {
|
|
|
|
public:
|
|
|
|
explicit GetWithTimestampReadCallback(SequenceNumber seq)
|
|
|
|
: ReadCallback(seq) {}
|
|
|
|
bool IsVisibleFullCheck(SequenceNumber seq) override {
|
|
|
|
return seq <= max_visible_seq_;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
Options SanitizeOptions(const std::string& db, const Options& src,
|
|
|
|
bool read_only = false,
|
|
|
|
Status* logger_creation_s = nullptr);
|
2016-09-23 23:34:04 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
|
|
|
|
bool read_only = false,
|
|
|
|
Status* logger_creation_s = nullptr);
|
2013-10-30 17:52:33 +00:00
|
|
|
|
2024-01-29 18:38:08 +00:00
|
|
|
CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions,
|
|
|
|
const MutableCFOptions& mutable_cf_options);
|
2017-05-24 18:25:38 +00:00
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
// Return the earliest log file to keep after the memtable flush is
|
|
|
|
// finalized.
|
|
|
|
// `cfd_to_flush` is the column family whose memtable (specified in
|
|
|
|
// `memtables_to_flush`) will be flushed and thus will not depend on any WAL
|
|
|
|
// file.
|
|
|
|
// The function is only applicable to 2pc mode.
|
2024-01-29 18:38:08 +00:00
|
|
|
uint64_t PrecomputeMinLogNumberToKeep2PC(
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
|
2020-11-07 00:30:44 +00:00
|
|
|
const autovector<VersionEdit*>& edit_list,
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
const autovector<MemTable*>& memtables_to_flush,
|
|
|
|
LogsWithPrepTracker* prep_tracker);
|
2020-12-04 03:21:08 +00:00
|
|
|
// For atomic flush.
|
2024-01-29 18:38:08 +00:00
|
|
|
uint64_t PrecomputeMinLogNumberToKeep2PC(
|
2020-12-04 03:21:08 +00:00
|
|
|
VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
|
|
|
|
const autovector<autovector<VersionEdit*>>& edit_lists,
|
|
|
|
const autovector<const autovector<MemTable*>*>& memtables_to_flush,
|
|
|
|
LogsWithPrepTracker* prep_tracker);
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
|
2020-11-07 00:30:44 +00:00
|
|
|
// In non-2PC mode, WALs with log number < the returned number can be
|
|
|
|
// deleted after the cfd_to_flush column family is flushed successfully.
|
2024-01-29 18:38:08 +00:00
|
|
|
uint64_t PrecomputeMinLogNumberToKeepNon2PC(
|
2020-11-07 00:30:44 +00:00
|
|
|
VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
|
|
|
|
const autovector<VersionEdit*>& edit_list);
|
2020-11-17 23:54:49 +00:00
|
|
|
// For atomic flush.
|
2024-01-29 18:38:08 +00:00
|
|
|
uint64_t PrecomputeMinLogNumberToKeepNon2PC(
|
2020-11-17 23:54:49 +00:00
|
|
|
VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
|
|
|
|
const autovector<autovector<VersionEdit*>>& edit_lists);
|
2020-11-07 00:30:44 +00:00
|
|
|
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
// `cfd_to_flush` is the column family whose memtable will be flushed and thus
|
|
|
|
// will not depend on any WAL file. nullptr means no memtable is being flushed.
|
|
|
|
// The function is only applicable to 2pc mode.
|
2024-01-29 18:38:08 +00:00
|
|
|
uint64_t FindMinPrepLogReferencedByMemTable(
|
Fix a silent data loss for write-committed txn (#9571)
Summary:
The following sequence of events can cause silent data loss for write-committed
transactions.
```
Time thread 1 bg flush
| db->Put("a")
| txn = NewTxn()
| txn->Put("b", "v")
| txn->Prepare() // writes only to 5.log
| db->SwitchMemtable() // memtable 1 has "a"
| // close 5.log,
| // creates 8.log
| trigger flush
| pick memtable 1
| unlock db mutex
| write new sst
| txn->ctwb->Put("gtid", "1") // writes 8.log
| txn->Commit() // writes to 8.log
| // writes to memtable 2
| compute min_log_number_to_keep_2pc, this
| will be 8 (incorrect).
|
| Purge obsolete wals, including 5.log
|
V
```
At this point, writes of txn exists only in memtable. Close db without flush because db thinks the data in
memtable are backed by log. Then reopen, the writes are lost except key-value pair {"gtid"->"1"},
only the commit marker of txn is in 8.log
The reason lies in `PrecomputeMinLogNumberToKeep2PC()` which calls `FindMinPrepLogReferencedByMemTable()`.
In the above example, when bg flush thread tries to find obsolete wals, it uses the information
computed by `PrecomputeMinLogNumberToKeep2PC()`. The return value of `PrecomputeMinLogNumberToKeep2PC()`
depends on three components
- `PrecomputeMinLogNumberToKeepNon2PC()`. This represents the WAL that has unflushed data. As the name of this method suggests, it does not account for 2PC. Although the keys reside in the prepare section of a previous WAL, the column family references the current WAL when they are actually inserted into the memtable during txn commit.
- `prep_tracker->FindMinLogContainingOutstandingPrep()`. This represents the WAL with a prepare section but the txn hasn't committed.
- `FindMinPrepLogReferencedByMemTable()`. This represents the WAL on which some memtables (mutable and immutable) depend for their unflushed data.
The bug lies in `FindMinPrepLogReferencedByMemTable()`. Originally, this function skips checking the column families
that are being flushed, but the unit test added in this PR shows that they should not be. In this unit test, there is
only the default column family, and one of its memtables has unflushed data backed by a prepare section in 5.log.
We should return this information via `FindMinPrepLogReferencedByMemTable()`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9571
Test Plan:
```
./transaction_test --gtest_filter=*/TransactionTest.SwitchMemtableDuringPrepareAndCommit_WC/*
make check
```
Reviewed By: siying
Differential Revision: D34235236
Pulled By: riversand963
fbshipit-source-id: 120eb21a666728a38dda77b96276c6af72b008b1
2022-02-17 07:07:48 +00:00
|
|
|
VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
|
2020-12-04 03:21:08 +00:00
|
|
|
// For atomic flush.
|
2024-01-29 18:38:08 +00:00
|
|
|
uint64_t FindMinPrepLogReferencedByMemTable(
|
Fix a silent data loss for write-committed txn (#9571)
Summary:
The following sequence of events can cause silent data loss for write-committed
transactions.
```
Time thread 1 bg flush
| db->Put("a")
| txn = NewTxn()
| txn->Put("b", "v")
| txn->Prepare() // writes only to 5.log
| db->SwitchMemtable() // memtable 1 has "a"
| // close 5.log,
| // creates 8.log
| trigger flush
| pick memtable 1
| unlock db mutex
| write new sst
| txn->ctwb->Put("gtid", "1") // writes 8.log
| txn->Commit() // writes to 8.log
| // writes to memtable 2
| compute min_log_number_to_keep_2pc, this
| will be 8 (incorrect).
|
| Purge obsolete wals, including 5.log
|
V
```
At this point, writes of txn exists only in memtable. Close db without flush because db thinks the data in
memtable are backed by log. Then reopen, the writes are lost except key-value pair {"gtid"->"1"},
only the commit marker of txn is in 8.log
The reason lies in `PrecomputeMinLogNumberToKeep2PC()` which calls `FindMinPrepLogReferencedByMemTable()`.
In the above example, when bg flush thread tries to find obsolete wals, it uses the information
computed by `PrecomputeMinLogNumberToKeep2PC()`. The return value of `PrecomputeMinLogNumberToKeep2PC()`
depends on three components
- `PrecomputeMinLogNumberToKeepNon2PC()`. This represents the WAL that has unflushed data. As the name of this method suggests, it does not account for 2PC. Although the keys reside in the prepare section of a previous WAL, the column family references the current WAL when they are actually inserted into the memtable during txn commit.
- `prep_tracker->FindMinLogContainingOutstandingPrep()`. This represents the WAL with a prepare section but the txn hasn't committed.
- `FindMinPrepLogReferencedByMemTable()`. This represents the WAL on which some memtables (mutable and immutable) depend for their unflushed data.
The bug lies in `FindMinPrepLogReferencedByMemTable()`. Originally, this function skips checking the column families
that are being flushed, but the unit test added in this PR shows that they should not be. In this unit test, there is
only the default column family, and one of its memtables has unflushed data backed by a prepare section in 5.log.
We should return this information via `FindMinPrepLogReferencedByMemTable()`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9571
Test Plan:
```
./transaction_test --gtest_filter=*/TransactionTest.SwitchMemtableDuringPrepareAndCommit_WC/*
make check
```
Reviewed By: siying
Differential Revision: D34235236
Pulled By: riversand963
fbshipit-source-id: 120eb21a666728a38dda77b96276c6af72b008b1
2022-02-17 07:07:48 +00:00
|
|
|
VersionSet* vset,
|
2020-12-04 03:21:08 +00:00
|
|
|
const autovector<const autovector<MemTable*>*>& memtables_to_flush);
|
Skip deleted WALs during recovery
Summary:
This patch record min log number to keep to the manifest while flushing SST files to ignore them and any WAL older than them during recovery. This is to avoid scenarios when we have a gap between the WAL files are fed to the recovery procedure. The gap could happen by for example out-of-order WAL deletion. Such gap could cause problems in 2PC recovery where the prepared and commit entry are placed into two separate WAL and gap in the WALs could result into not processing the WAL with the commit entry and hence breaking the 2PC recovery logic.
Before the commit, for 2PC case, we determined which log number to keep in FindObsoleteFiles(). We looked at the earliest logs with outstanding prepare entries, or prepare entries whose respective commit or abort are in memtable. With the commit, the same calculation is done while we apply the SST flush. Just before installing the flush file, we precompute the earliest log file to keep after the flush finishes using the same logic (but skipping the memtables just flushed), record this information to the manifest entry for this new flushed SST file. This pre-computed value is also remembered in memory, and will later be used to determine whether a log file can be deleted. This value is unlikely to change until next flush because the commit entry will stay in memtable. (In WritePrepared, we could have removed the older log files as soon as all prepared entries are committed. It's not yet done anyway. Even if we do it, the only thing we loss with this new approach is earlier log deletion between two flushes, which does not guarantee to happen anyway because the obsolete file clean-up function is only executed after flush or compaction)
This min log number to keep is stored in the manifest using the safely-ignore customized field of AddFile entry, in order to guarantee that the DB generated using newer release can be opened by previous releases no older than 4.2.
Closes https://github.com/facebook/rocksdb/pull/3765
Differential Revision: D7747618
Pulled By: siying
fbshipit-source-id: d00c92105b4f83852e9754a1b70d6b64cb590729
2018-05-03 22:35:11 +00:00
|
|
|
|
2014-08-07 17:05:04 +00:00
|
|
|
// Fix user-supplied options to be reasonable
|
|
|
|
template <class T, class V>
|
|
|
|
static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
|
|
|
|
if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
|
|
|
|
if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
|
|
|
|
}
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
inline Status DBImpl::FailIfCfHasTs(
|
|
|
|
const ColumnFamilyHandle* column_family) const {
|
2023-11-13 22:30:04 +00:00
|
|
|
if (!column_family) {
|
|
|
|
return Status::InvalidArgument("column family handle cannot be null");
|
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
assert(column_family);
|
|
|
|
const Comparator* const ucmp = column_family->GetComparator();
|
|
|
|
assert(ucmp);
|
|
|
|
if (ucmp->timestamp_size() > 0) {
|
|
|
|
std::ostringstream oss;
|
|
|
|
oss << "cannot call this method on column family "
|
|
|
|
<< column_family->GetName() << " that enables timestamp";
|
|
|
|
return Status::InvalidArgument(oss.str());
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2022-06-06 21:36:22 +00:00
|
|
|
inline Status DBImpl::FailIfTsMismatchCf(ColumnFamilyHandle* column_family,
|
2023-09-13 23:34:18 +00:00
|
|
|
const Slice& ts) const {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
if (!column_family) {
|
|
|
|
return Status::InvalidArgument("column family handle cannot be null");
|
|
|
|
}
|
|
|
|
assert(column_family);
|
|
|
|
const Comparator* const ucmp = column_family->GetComparator();
|
|
|
|
assert(ucmp);
|
|
|
|
if (0 == ucmp->timestamp_size()) {
|
|
|
|
std::stringstream oss;
|
|
|
|
oss << "cannot call this method on column family "
|
|
|
|
<< column_family->GetName() << " that does not enable timestamp";
|
|
|
|
return Status::InvalidArgument(oss.str());
|
|
|
|
}
|
|
|
|
const size_t ts_sz = ts.size();
|
|
|
|
if (ts_sz != ucmp->timestamp_size()) {
|
|
|
|
std::stringstream oss;
|
|
|
|
oss << "Timestamp sizes mismatch: expect " << ucmp->timestamp_size() << ", "
|
|
|
|
<< ts_sz << " given";
|
|
|
|
return Status::InvalidArgument(oss.str());
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2023-09-13 23:34:18 +00:00
|
|
|
inline Status DBImpl::FailIfReadCollapsedHistory(const ColumnFamilyData* cfd,
|
|
|
|
const SuperVersion* sv,
|
|
|
|
const Slice& ts) const {
|
|
|
|
// Reaching to this point means the timestamp size matching sanity check in
|
|
|
|
// `DBImpl::FailIfTsMismatchCf` already passed. So we skip that and assume
|
|
|
|
// column family has the same user-defined timestamp format as `ts`.
|
|
|
|
const Comparator* const ucmp = cfd->user_comparator();
|
|
|
|
assert(ucmp);
|
|
|
|
const std::string& full_history_ts_low = sv->full_history_ts_low;
|
|
|
|
assert(full_history_ts_low.empty() ||
|
|
|
|
full_history_ts_low.size() == ts.size());
|
|
|
|
if (!full_history_ts_low.empty() &&
|
|
|
|
ucmp->CompareTimestamp(ts, full_history_ts_low) < 0) {
|
|
|
|
std::stringstream oss;
|
|
|
|
oss << "Read timestamp: " << ts.ToString(true)
|
|
|
|
<< " is smaller than full_history_ts_low: "
|
|
|
|
<< Slice(full_history_ts_low).ToString(true) << std::endl;
|
|
|
|
return Status::InvalidArgument(oss.str());
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|