2016-08-09 17:16:32 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2016-08-09 17:16:32 +00:00
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
2017-05-10 21:54:35 +00:00
|
|
|
|
|
|
|
#include <functional>
|
2020-06-23 20:41:03 +00:00
|
|
|
#include <limits>
|
2016-08-09 17:16:32 +00:00
|
|
|
#include <string>
|
2017-05-10 21:54:35 +00:00
|
|
|
#include <vector>
|
2020-06-23 20:41:03 +00:00
|
|
|
|
2016-08-09 17:16:32 +00:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/status.h"
|
2017-05-10 21:54:35 +00:00
|
|
|
#include "rocksdb/utilities/stackable_db.h"
|
2016-08-09 17:16:32 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2017-05-10 21:54:35 +00:00
|
|
|
|
|
|
|
namespace blob_db {
|
|
|
|
|
2016-08-09 17:16:32 +00:00
|
|
|
// A wrapped database which puts values of KV pairs in a separate log
// and stores the location to the log in the underlying DB.
//
// The factory needs to be moved to include/rocksdb/utilities to allow
// users to use blob DB.

// Sentinel expiration value meaning "this key never expires". Keys written
// via PutUntil() with this expiration are treated as non-TTL entries.
constexpr uint64_t kNoExpiration = std::numeric_limits<uint64_t>::max();
|
2017-05-10 21:54:35 +00:00
|
|
|
// Options specific to the (stacked) BlobDB layer; passed to BlobDB::Open()
// alongside the regular DB options.
struct BlobDBOptions {
  // Name of the directory under the base DB where blobs will be stored. Using
  // a directory where the base DB stores its SST files is not supported.
  // Default is "blob_dir"
  std::string blob_dir = "blob_dir";

  // whether the blob_dir path is relative or absolute.
  bool path_relative = true;

  // When max_db_size is reached, evict blob files to free up space
  // instead of returning NoSpace error on write. Blob files will be
  // evicted from oldest to newest, based on file creation time.
  bool is_fifo = false;

  // Maximum size of the database (including SST files and blob files).
  //
  // Default: 0 (no limits)
  uint64_t max_db_size = 0;

  // a new bucket is opened, for ttl_range. So if ttl_range is 600 seconds
  // (10 minutes), and the first bucket starts at 1471542000
  // then the blob buckets will be
  // first bucket is 1471542000 - 1471542600
  // second bucket is 1471542600 - 1471543200
  // and so on
  uint64_t ttl_range_secs = 3600;

  // The smallest value to store in blob log. Values smaller than this threshold
  // will be inlined in base DB together with the key.
  uint64_t min_blob_size = 0;

  // Allows OS to incrementally sync blob files to disk for every
  // bytes_per_sync bytes written. Users shouldn't rely on it for
  // persistency guarantee.
  uint64_t bytes_per_sync = 512 * 1024;

  // the target size of each blob file. File will become immutable
  // after it exceeds that size
  uint64_t blob_file_size = 256 * 1024 * 1024;

  // what compression to use for Blob's
  CompressionType compression = kNoCompression;

  // If enabled, BlobDB cleans up stale blobs in non-TTL files during compaction
  // by rewriting the remaining live blobs to new files.
  bool enable_garbage_collection = false;

  // The cutoff in terms of blob file age for garbage collection. Blobs in
  // the oldest N non-TTL blob files will be rewritten when encountered during
  // compaction, where N = garbage_collection_cutoff * number_of_non_TTL_files.
  double garbage_collection_cutoff = 0.25;

  // Disable all background job. Used for test only.
  bool disable_background_tasks = false;

  // Logs the current option values to the given logger.
  void Dump(Logger* log) const;
};
|
|
|
|
|
|
|
|
// Stacked BlobDB: a wrapper database that stores large values in separate
// blob log files and keeps only a small index entry in the underlying DB.
// Only the default column family is supported; overloads taking a
// ColumnFamilyHandle return NotSupported (or nullptr for NewIterator)
// for any other column family.
class BlobDB : public StackableDB {
 public:
  using ROCKSDB_NAMESPACE::StackableDB::Put;
  Status Put(const WriteOptions& options, const Slice& key,
             const Slice& value) override = 0;
  // Column-family overload: forwards to the default-CF Put() after
  // rejecting non-default column families.
  Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
             const Slice& key, const Slice& value) override {
    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
      return Status::NotSupported(
          "Blob DB doesn't support non-default column family.");
    }
    return Put(options, key, value);
  }

  using ROCKSDB_NAMESPACE::StackableDB::Delete;
  // Deletes are delegated straight to the underlying DB; the blob itself is
  // reclaimed later (e.g. by garbage collection), not at delete time.
  Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
                const Slice& key) override {
    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
      return Status::NotSupported(
          "Blob DB doesn't support non-default column family.");
    }
    assert(db_ != nullptr);
    return db_->Delete(options, column_family, key);
  }

  // Put with a relative time-to-live of `ttl` seconds.
  virtual Status PutWithTTL(const WriteOptions& options, const Slice& key,
                            const Slice& value, uint64_t ttl) = 0;
  virtual Status PutWithTTL(const WriteOptions& options,
                            ColumnFamilyHandle* column_family, const Slice& key,
                            const Slice& value, uint64_t ttl) {
    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
      return Status::NotSupported(
          "Blob DB doesn't support non-default column family.");
    }
    return PutWithTTL(options, key, value, ttl);
  }

  // Put with expiration. Key with expiration time equal to
  // std::numeric_limits<uint64_t>::max() means the key doesn't expire.
  virtual Status PutUntil(const WriteOptions& options, const Slice& key,
                          const Slice& value, uint64_t expiration) = 0;
  virtual Status PutUntil(const WriteOptions& options,
                          ColumnFamilyHandle* column_family, const Slice& key,
                          const Slice& value, uint64_t expiration) {
    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
      return Status::NotSupported(
          "Blob DB doesn't support non-default column family.");
    }
    return PutUntil(options, key, value, expiration);
  }

  using ROCKSDB_NAMESPACE::StackableDB::Get;
  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
             const Slice& key, PinnableSlice* value,
             std::string* timestamp) override = 0;

  // Get value and expiration.
  virtual Status Get(const ReadOptions& options,
                     ColumnFamilyHandle* column_family, const Slice& key,
                     PinnableSlice* value, uint64_t* expiration) = 0;
  virtual Status Get(const ReadOptions& options, const Slice& key,
                     PinnableSlice* value, uint64_t* expiration) {
    return Get(options, DefaultColumnFamily(), key, value, expiration);
  }

  using ROCKSDB_NAMESPACE::StackableDB::SingleDelete;
  // SingleDelete is not supported by the stacked BlobDB.
  Status SingleDelete(const WriteOptions& /*wopts*/,
                      ColumnFamilyHandle* /*column_family*/,
                      const Slice& /*key*/) override {
    return Status::NotSupported("Not supported operation in blob db.");
  }

  using ROCKSDB_NAMESPACE::StackableDB::Merge;
  // Merge is not supported by the stacked BlobDB.
  Status Merge(const WriteOptions& /*options*/,
               ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
               const Slice& /*value*/) override {
    return Status::NotSupported("Not supported operation in blob db.");
  }

  Status Write(const WriteOptions& opts, WriteBatch* updates) override = 0;

  using ROCKSDB_NAMESPACE::StackableDB::NewIterator;
  Iterator* NewIterator(const ReadOptions& options) override = 0;
  Iterator* NewIterator(const ReadOptions& options,
                        ColumnFamilyHandle* column_family) override {
    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
      // Blob DB doesn't support non-default column family.
      return nullptr;
    }
    return NewIterator(options);
  }

  Status CompactFiles(
      const CompactionOptions& compact_options,
      const std::vector<std::string>& input_file_names, const int output_level,
      const int output_path_id = -1,
      std::vector<std::string>* const output_file_names = nullptr,
      CompactionJobInfo* compaction_job_info = nullptr) override = 0;
  Status CompactFiles(
      const CompactionOptions& compact_options,
      ColumnFamilyHandle* column_family,
      const std::vector<std::string>& input_file_names, const int output_level,
      const int output_path_id = -1,
      std::vector<std::string>* const output_file_names = nullptr,
      CompactionJobInfo* compaction_job_info = nullptr) override {
    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
      return Status::NotSupported(
          "Blob DB doesn't support non-default column family.");
    }

    return CompactFiles(compact_options, input_file_names, output_level,
                        output_path_id, output_file_names, compaction_job_info);
  }

  using ROCKSDB_NAMESPACE::StackableDB::Close;
  Status Close() override = 0;

  // Opening blob db.
  static Status Open(const Options& options, const BlobDBOptions& bdb_options,
                     const std::string& dbname, BlobDB** blob_db);

  // Open with explicit column family descriptors. NOTE(review): presumably
  // only the default column family may be listed, given the restrictions
  // above — confirm against the implementation.
  static Status Open(const DBOptions& db_options,
                     const BlobDBOptions& bdb_options,
                     const std::string& dbname,
                     const std::vector<ColumnFamilyDescriptor>& column_families,
                     std::vector<ColumnFamilyHandle*>* handles,
                     BlobDB** blob_db);

  // Returns the BlobDBOptions this instance was opened with.
  virtual BlobDBOptions GetBlobDBOptions() const = 0;

  // Syncs all open blob files to durable storage.
  virtual Status SyncBlobFiles(const WriteOptions& write_options) = 0;

  ~BlobDB() override {}

 protected:
  explicit BlobDB();
};
|
|
|
|
|
2017-06-14 20:08:54 +00:00
|
|
|
// Destroy the content of the database: removes both the base DB contents
// and the blob files under the configured blob directory.
Status DestroyBlobDB(const std::string& dbname, const Options& options,
                     const BlobDBOptions& bdb_options);
|
|
|
|
|
2017-05-10 21:54:35 +00:00
|
|
|
} // namespace blob_db
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|