mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-28 05:43:50 +00:00
c2aad555c3
Summary:
Optionally enable the zstd checksum flag (see lib/zstd.h, line 428, at zstd revision d857369028) to detect corruption during decompression. Main changes are in compression.h:
* User can set CompressionOptions::checksum to true to enable this feature.
* We enable this feature in ZSTD by setting the checksum flag in ZSTD compression context: `ZSTD_CCtx`.
* Uses `ZSTD_compress2()` for compression because it supports frame parameters such as the checksum flag. The compression level is also set as a parameter on the compression context.
* Error handling during decompression to propagate error message from ZSTD.
* Updated microbench to test read performance impact.
About compatibility, the current compression decoders should continue to work with the data created by the new compression API `ZSTD_compress2()`: https://github.com/facebook/zstd/issues/3711.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11666
Test Plan:
* Existing unit tests for zstd compression
* Add unit test `DBTest2.ZSTDChecksum` to test the corruption case
* Manually tested that compression levels, parallel compression, dictionary compression, index compression all work with the new ZSTD_compress2() API.
* Manually tested with `sst_dump --command=recompress` that different compression levels and dictionary compression settings all work.
* Manually tested compiling with older versions of ZSTD: v1.3.8, v1.1.0, v0.6.2.
* Perf impact: from public benchmark data: http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html for checksum and https://github.com/facebook/zstd#benchmarks, if decompression is 1700MB/s and checksum computation is 70000MB/s, checksum computation is an additional ~2.4% time for decompression. Compression is slower and checksumming should be less noticeable.
* Microbench:
```
TEST_TMPDIR=/dev/shm ./branch_db_basic_bench --benchmark_filter=DBGet/comp_style:0/max_data:1048576/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/mmap:0/compression_type:7/compression_checksum:1/no_blockcache:1/iterations:10000/threads:1 --benchmark_repetitions=100
Min out of 100 runs:
Main:
10390 10436 10456 10484 10499 10535 10544 10545 10565 10568
After this PR, checksum=false
10285 10397 10503 10508 10515 10557 10562 10635 10640 10660
After this PR, checksum=true
10827 10876 10925 10949 10971 11052 11061 11063 11100 11109
```
* db_bench:
```
Write perf
TEST_TMPDIR=/dev/shm/ ./db_bench_ichecksum --benchmarks=fillseq[-X10] --compression_type=zstd --num=10000000 --compression_checksum=..
[FillSeq checksum=0]
fillseq [AVG 10 runs] : 281635 (± 31711) ops/sec; 31.2 (± 3.5) MB/sec
fillseq [MEDIAN 10 runs] : 294027 ops/sec; 32.5 MB/sec
[FillSeq checksum=1]
fillseq [AVG 10 runs] : 286961 (± 34700) ops/sec; 31.7 (± 3.8) MB/sec
fillseq [MEDIAN 10 runs] : 283278 ops/sec; 31.3 MB/sec
Read perf
TEST_TMPDIR=/dev/shm ./db_bench_ichecksum --benchmarks=readrandom[-X20] --num=100000000 --reads=1000000 --use_existing_db=true --readonly=1
[Readrandom checksum=1]
readrandom [AVG 20 runs] : 360928 (± 3579) ops/sec; 4.0 (± 0.0) MB/sec
readrandom [MEDIAN 20 runs] : 362468 ops/sec; 4.0 MB/sec
[Readrandom checksum=0]
readrandom [AVG 20 runs] : 380365 (± 2384) ops/sec; 4.2 (± 0.0) MB/sec
readrandom [MEDIAN 20 runs] : 379800 ops/sec; 4.2 MB/sec
Compression
TEST_TMPDIR=/dev/shm ./db_bench_ichecksum --benchmarks=compress[-X20] --compression_type=zstd --num=100000000 --compression_checksum=1
checksum=1
compress [AVG 20 runs] : 54074 (± 634) ops/sec; 211.2 (± 2.5) MB/sec
compress [MEDIAN 20 runs] : 54396 ops/sec; 212.5 MB/sec
checksum=0
compress [AVG 20 runs] : 54598 (± 393) ops/sec; 213.3 (± 1.5) MB/sec
compress [MEDIAN 20 runs] : 54592 ops/sec; 213.3 MB/sec
Decompression:
TEST_TMPDIR=/dev/shm ./db_bench_ichecksum --benchmarks=uncompress[-X20] --compression_type=zstd --compression_checksum=1
checksum = 0
uncompress [AVG 20 runs] : 167499 (± 962) ops/sec; 654.3 (± 3.8) MB/sec
uncompress [MEDIAN 20 runs] : 167210 ops/sec; 653.2 MB/sec
checksum = 1
uncompress [AVG 20 runs] : 167980 (± 924) ops/sec; 656.2 (± 3.6) MB/sec
uncompress [MEDIAN 20 runs] : 168465 ops/sec; 658.1 MB/sec
```
Reviewed By: ajkr
Differential Revision: D48019378
Pulled By: cbi42
fbshipit-source-id: 674120c6e1853c2ced1436ac8138559d0204feba
427 lines
13 KiB
C++
427 lines
13 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "db/blob/blob_file_builder.h"
|
|
|
|
#include <cassert>
|
|
|
|
#include "db/blob/blob_contents.h"
|
|
#include "db/blob/blob_file_addition.h"
|
|
#include "db/blob/blob_file_completion_callback.h"
|
|
#include "db/blob/blob_index.h"
|
|
#include "db/blob/blob_log_format.h"
|
|
#include "db/blob/blob_log_writer.h"
|
|
#include "db/blob/blob_source.h"
|
|
#include "db/event_helpers.h"
|
|
#include "db/version_set.h"
|
|
#include "file/filename.h"
|
|
#include "file/read_write_util.h"
|
|
#include "file/writable_file_writer.h"
|
|
#include "logging/logging.h"
|
|
#include "options/cf_options.h"
|
|
#include "options/options_helper.h"
|
|
#include "rocksdb/slice.h"
|
|
#include "rocksdb/status.h"
|
|
#include "test_util/sync_point.h"
|
|
#include "trace_replay/io_tracer.h"
|
|
#include "util/compression.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
BlobFileBuilder::BlobFileBuilder(
|
|
VersionSet* versions, FileSystem* fs,
|
|
const ImmutableOptions* immutable_options,
|
|
const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
|
|
std::string db_id, std::string db_session_id, int job_id,
|
|
uint32_t column_family_id, const std::string& column_family_name,
|
|
Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
|
|
const std::shared_ptr<IOTracer>& io_tracer,
|
|
BlobFileCompletionCallback* blob_callback,
|
|
BlobFileCreationReason creation_reason,
|
|
std::vector<std::string>* blob_file_paths,
|
|
std::vector<BlobFileAddition>* blob_file_additions)
|
|
: BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs,
|
|
immutable_options, mutable_cf_options, file_options,
|
|
db_id, db_session_id, job_id, column_family_id,
|
|
column_family_name, io_priority, write_hint, io_tracer,
|
|
blob_callback, creation_reason, blob_file_paths,
|
|
blob_file_additions) {}
|
|
|
|
// Primary constructor: every other constructor delegates here.
//
// `file_number_generator` is invoked each time a new blob file has to be
// opened. The relevant settings of `mutable_cf_options` (minimum blob size,
// target blob file size, compression type, cache prepopulation mode) are
// copied during construction; the pointer itself is not retained.
//
// `blob_file_paths` and `blob_file_additions` are output vectors owned by the
// caller; both must be non-null and empty when the builder is created.
BlobFileBuilder::BlobFileBuilder(
    std::function<uint64_t()> file_number_generator, FileSystem* fs,
    const ImmutableOptions* immutable_options,
    const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
    std::string db_id, std::string db_session_id, int job_id,
    uint32_t column_family_id, const std::string& column_family_name,
    Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
    const std::shared_ptr<IOTracer>& io_tracer,
    BlobFileCompletionCallback* blob_callback,
    BlobFileCreationReason creation_reason,
    std::vector<std::string>* blob_file_paths,
    std::vector<BlobFileAddition>* blob_file_additions)
    : file_number_generator_(std::move(file_number_generator)),
      fs_(fs),
      immutable_options_(immutable_options),
      min_blob_size_(mutable_cf_options->min_blob_size),
      blob_file_size_(mutable_cf_options->blob_file_size),
      blob_compression_type_(mutable_cf_options->blob_compression_type),
      prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache),
      file_options_(file_options),
      db_id_(std::move(db_id)),
      db_session_id_(std::move(db_session_id)),
      job_id_(job_id),
      column_family_id_(column_family_id),
      column_family_name_(column_family_name),
      io_priority_(io_priority),
      write_hint_(write_hint),
      io_tracer_(io_tracer),
      blob_callback_(blob_callback),
      creation_reason_(creation_reason),
      blob_file_paths_(blob_file_paths),
      blob_file_additions_(blob_file_additions),
      blob_count_(0),
      blob_bytes_(0) {
  // Mandatory collaborators.
  assert(file_number_generator_);
  assert(fs_);
  assert(immutable_options_);
  assert(file_options_);

  // Output vectors: required, and expected to start out empty.
  assert(blob_file_paths_);
  assert(blob_file_paths_->empty());
  assert(blob_file_additions_);
  assert(blob_file_additions_->empty());
}
|
|
|
|
// Compiler-generated destructor, defined out of line in this translation unit.
// NOTE(review): presumably placed here so members such as writer_ can be
// destroyed where their types are complete -- confirm against the header.
BlobFileBuilder::~BlobFileBuilder() = default;
|
|
|
|
// Stores `value` in the current blob file and encodes a reference to it.
//
// Values smaller than min_blob_size_ are not written to a blob file: the
// function returns OK and leaves `*blob_index` empty. Otherwise a blob file is
// opened on demand, the value is compressed if a compression type is
// configured, and the record is appended. On success `*blob_index` receives the
// encoded file number, offset, stored (possibly compressed) size and
// compression type.
//
// `blob_index` must be non-null and empty on entry. Any failure while opening,
// compressing, writing or closing is returned to the caller; a failure to
// prepopulate the blob cache is only logged.
Status BlobFileBuilder::Add(const Slice& key, const Slice& value,
                            std::string* blob_index) {
  assert(blob_index);
  assert(blob_index->empty());

  if (value.size() < min_blob_size_) {
    return Status::OK();
  }

  const Status open_status = OpenBlobFileIfNeeded();
  if (!open_status.ok()) {
    return open_status;
  }

  // `payload` initially refers to the caller's value; after a successful
  // compression it is re-pointed at `compressed_storage`.
  Slice payload = value;
  std::string compressed_storage;

  const Status compress_status =
      CompressBlobIfNeeded(&payload, &compressed_storage);
  if (!compress_status.ok()) {
    return compress_status;
  }

  uint64_t file_number = 0;
  uint64_t offset_in_file = 0;

  const Status write_status =
      WriteBlobToFile(key, payload, &file_number, &offset_in_file);
  if (!write_status.ok()) {
    return write_status;
  }

  // The file number and offset were captured above, so the file may be closed
  // here without affecting the index entry built below.
  const Status close_status = CloseBlobFileIfNeeded();
  if (!close_status.ok()) {
    return close_status;
  }

  // The cache is fed the original `value`, not the (possibly compressed)
  // payload. Failures are non-fatal.
  const Status cache_status =
      PutBlobIntoCacheIfNeeded(value, file_number, offset_in_file);
  if (!cache_status.ok()) {
    ROCKS_LOG_WARN(immutable_options_->info_log,
                   "Failed to pre-populate the blob into blob cache: %s",
                   cache_status.ToString().c_str());
  }

  BlobIndex::EncodeBlob(blob_index, file_number, offset_in_file,
                        payload.size(), blob_compression_type_);

  return Status::OK();
}
|
|
|
|
// Finalizes the builder: closes the currently open blob file, if there is
// one, and returns the result of closing it. Returns OK when no file is open.
Status BlobFileBuilder::Finish() {
  return IsBlobFileOpen() ? CloseBlobFile() : Status::OK();
}
|
|
|
|
// A blob file counts as open exactly while writer_ holds a writer object.
bool BlobFileBuilder::IsBlobFileOpen() const {
  return static_cast<bool>(writer_);
}
|
|
|
|
// Opens a new blob file unless one is already open.
//
// Steps: obtain a file number from the generator, derive the path under the
// first column family path, notify the creation callback (if any), create the
// file, wrap it in a WritableFileWriter and a BlobLogWriter, and write the
// blob log header. writer_ is only set once the header has been written
// successfully; on any failure the error status is returned and the builder
// remains without an open file.
Status BlobFileBuilder::OpenBlobFileIfNeeded() {
  if (IsBlobFileOpen()) {
    return Status::OK();
  }

  // No file is open, so the per-file counters must be in their reset state.
  assert(!blob_count_);
  assert(!blob_bytes_);

  assert(file_number_generator_);
  const uint64_t new_file_number = file_number_generator_();

  assert(immutable_options_);
  assert(!immutable_options_->cf_paths.empty());
  std::string new_file_path = BlobFileName(
      immutable_options_->cf_paths.front().path, new_file_number);

  if (blob_callback_) {
    blob_callback_->OnBlobFileCreationStarted(
        new_file_path, column_family_name_, job_id_, creation_reason_);
  }

  std::unique_ptr<FSWritableFile> writable_file;

  {
    assert(file_options_);
    Status s =
        NewWritableFile(fs_, new_file_path, &writable_file, *file_options_);

    // The sync point receives a pointer to the status.
    TEST_SYNC_POINT_CALLBACK(
        "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s);

    if (!s.ok()) {
      return s;
    }
  }

  // Note: files get added to blob_file_paths_ right after the open, so they
  // can be cleaned up upon failure. Contrast this with blob_file_additions_,
  // which only contains successfully written files.
  assert(blob_file_paths_);
  blob_file_paths_->emplace_back(std::move(new_file_path));

  assert(writable_file);
  writable_file->SetIOPriority(io_priority_);
  writable_file->SetWriteLifeTimeHint(write_hint_);

  FileTypeSet handoff_file_types =
      immutable_options_->checksum_handoff_file_types;
  const bool handoff_for_blob_files =
      handoff_file_types.Contains(FileType::kBlobFile);
  Statistics* const statistics = immutable_options_->stats;

  auto file_writer = std::make_unique<WritableFileWriter>(
      std::move(writable_file), blob_file_paths_->back(), *file_options_,
      immutable_options_->clock, io_tracer_, statistics,
      immutable_options_->listeners,
      immutable_options_->file_checksum_gen_factory.get(),
      handoff_for_blob_files, false);

  constexpr bool do_flush = false;

  auto log_writer = std::make_unique<BlobLogWriter>(
      std::move(file_writer), immutable_options_->clock, statistics,
      new_file_number, immutable_options_->use_fsync, do_flush);

  // Files produced by this builder carry no TTL information.
  constexpr bool has_ttl = false;
  constexpr ExpirationRange expiration_range;

  BlobLogHeader header(column_family_id_, blob_compression_type_, has_ttl,
                       expiration_range);

  {
    Status s = log_writer->WriteHeader(header);

    TEST_SYNC_POINT_CALLBACK(
        "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s);

    if (!s.ok()) {
      return s;
    }
  }

  // Publish the writer only after the header has been written.
  writer_ = std::move(log_writer);

  assert(IsBlobFileOpen());

  return Status::OK();
}
|
|
|
|
// Compresses `*blob` with the configured blob compression type.
//
// With kNoCompression this is a no-op that returns OK. Otherwise the
// compressed bytes are written to `*compressed_blob` (which must be empty on
// entry) and `*blob` is re-pointed at that string, so `*compressed_blob` has to
// outlive any further use of `*blob`. The time spent compressing is recorded
// under BLOB_DB_COMPRESSION_MICROS. Returns Status::Corruption if the
// compression routine reports failure.
Status BlobFileBuilder::CompressBlobIfNeeded(
    Slice* blob, std::string* compressed_blob) const {
  assert(blob);
  assert(compressed_blob);
  assert(compressed_blob->empty());
  assert(immutable_options_);

  if (blob_compression_type_ == kNoCompression) {
    return Status::OK();
  }

  // TODO: allow user CompressionOptions, including max_compressed_bytes_per_kb
  CompressionOptions default_options;
  CompressionContext compression_context(blob_compression_type_,
                                         default_options);
  constexpr uint64_t sample_for_compression = 0;

  CompressionInfo compression_info(
      default_options, compression_context, CompressionDict::GetEmptyDict(),
      blob_compression_type_, sample_for_compression);

  constexpr uint32_t compression_format_version = 2;

  bool compressed_ok = false;

  {
    // The stopwatch lives only for the duration of the CompressData() call.
    StopWatch compression_timer(immutable_options_->clock,
                                immutable_options_->stats,
                                BLOB_DB_COMPRESSION_MICROS);
    compressed_ok = CompressData(*blob, compression_info,
                                 compression_format_version, compressed_blob);
  }

  if (!compressed_ok) {
    return Status::Corruption("Error compressing blob");
  }

  *blob = Slice(*compressed_blob);

  return Status::OK();
}
|
|
|
|
// Appends one key/blob record to the open blob file.
//
// On success `*blob_file_number` is set to the writer's log number,
// `*blob_offset` to the offset reported by the writer for the blob, and the
// per-file counters are updated (the byte count includes the record header,
// the key and the blob). On failure the error is returned and the counters are
// left untouched. Requires an open blob file.
Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob,
                                        uint64_t* blob_file_number,
                                        uint64_t* blob_offset) {
  assert(IsBlobFileOpen());
  assert(blob_file_number);
  assert(blob_offset);

  // The key offset is required by AddRecord() but not used by this builder.
  uint64_t unused_key_offset = 0;

  Status add_status =
      writer_->AddRecord(key, blob, &unused_key_offset, blob_offset);

  TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord",
                           &add_status);

  if (!add_status.ok()) {
    return add_status;
  }

  *blob_file_number = writer_->get_log_number();

  ++blob_count_;
  blob_bytes_ += BlobLogRecord::kHeaderSize + key.size() + blob.size();

  return Status::OK();
}
|
|
|
|
// Finishes the currently open blob file.
//
// Appends the footer (carrying the blob count) and collects the file checksum
// method/value from the writer. If appending the footer fails, the error is
// returned immediately and the file stays open. Otherwise the completion
// callback (if any) is invoked, the file is recorded in blob_file_additions_,
// an info message is logged, and the writer and per-file counters are reset.
// The returned status is the one produced by the completion callback, or the
// footer status when no callback is installed.
Status BlobFileBuilder::CloseBlobFile() {
  assert(IsBlobFileOpen());

  BlobLogFooter footer;
  footer.blob_count = blob_count_;

  std::string checksum_method;
  std::string checksum_value;

  Status status =
      writer_->AppendFooter(footer, &checksum_method, &checksum_value);

  TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter",
                           &status);

  if (!status.ok()) {
    return status;
  }

  const uint64_t closed_file_number = writer_->get_log_number();

  if (blob_callback_) {
    status = blob_callback_->OnBlobFileCompleted(
        blob_file_paths_->back(), column_family_name_, job_id_,
        closed_file_number, creation_reason_, status, checksum_value,
        checksum_method, blob_count_, blob_bytes_);
  }

  // The addition is recorded regardless of the callback's result; the checksum
  // strings are no longer needed afterwards, so they are moved.
  assert(blob_file_additions_);
  blob_file_additions_->emplace_back(closed_file_number, blob_count_,
                                     blob_bytes_, std::move(checksum_method),
                                     std::move(checksum_value));

  assert(immutable_options_);
  ROCKS_LOG_INFO(immutable_options_->logger,
                 "[%s] [JOB %d] Generated blob file #%" PRIu64 ": %" PRIu64
                 " total blobs, %" PRIu64 " total bytes",
                 column_family_name_.c_str(), job_id_, closed_file_number,
                 blob_count_, blob_bytes_);

  // Return to the "no file open" state.
  writer_.reset();
  blob_count_ = 0;
  blob_bytes_ = 0;

  return status;
}
|
|
|
|
// Closes the open blob file once the size reported by its file writer has
// reached blob_file_size_; otherwise returns OK and leaves the file open.
// Requires an open blob file.
Status BlobFileBuilder::CloseBlobFileIfNeeded() {
  assert(IsBlobFileOpen());

  const WritableFileWriter* const underlying_writer = writer_->file();
  assert(underlying_writer);

  if (underlying_writer->GetFileSize() >= blob_file_size_) {
    return CloseBlobFile();
  }

  return Status::OK();
}
|
|
|
|
void BlobFileBuilder::Abandon(const Status& s) {
|
|
if (!IsBlobFileOpen()) {
|
|
return;
|
|
}
|
|
if (blob_callback_) {
|
|
// BlobFileBuilder::Abandon() is called because of error while writing to
|
|
// Blob files. So we can ignore the below error.
|
|
blob_callback_
|
|
->OnBlobFileCompleted(blob_file_paths_->back(), column_family_name_,
|
|
job_id_, writer_->get_log_number(),
|
|
creation_reason_, s, "", "", blob_count_,
|
|
blob_bytes_)
|
|
.PermitUncheckedError();
|
|
}
|
|
|
|
writer_.reset();
|
|
blob_count_ = 0;
|
|
blob_bytes_ = 0;
|
|
}
|
|
|
|
// Inserts `blob` into the blob cache when cache warming applies.
//
// Warming happens only if a blob cache is configured, prepopulate_blob_cache_
// is kFlushOnly, and this builder was created for a flush. The cache key is
// derived from the DB id, DB session id, blob file number and blob offset, and
// the entry is inserted with BOTTOM priority. Cache-add tickers are updated
// according to the outcome. Returns OK when warming does not apply, otherwise
// the status of the insertion.
Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
                                                 uint64_t blob_file_number,
                                                 uint64_t blob_offset) const {
  BlobSource::SharedCacheInterface blob_cache{immutable_options_->blob_cache};
  auto statistics = immutable_options_->statistics.get();
  const bool warm_cache =
      prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly &&
      creation_reason_ == BlobFileCreationReason::kFlush;

  if (!(blob_cache && warm_cache)) {
    return Status::OK();
  }

  const OffsetableCacheKey file_cache_key(db_id_, db_session_id_,
                                          blob_file_number);
  const CacheKey blob_cache_key = file_cache_key.WithOffset(blob_offset);
  const Slice key = blob_cache_key.AsSlice();

  const Cache::Priority priority = Cache::Priority::BOTTOM;

  Status insert_status =
      blob_cache.InsertSaved(key, blob, nullptr /*context*/, priority,
                             immutable_options_->lowest_used_cache_tier);

  if (insert_status.ok()) {
    RecordTick(statistics, BLOB_DB_CACHE_ADD);
    RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, blob.size());
  } else {
    RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES);
  }

  return insert_status;
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|