diff --git a/HISTORY.md b/HISTORY.md
index 27e7cdec32..8f79c0fef1 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -9,6 +9,7 @@
 * Fix useless no-op compactions scheduled upon snapshot release when options.disable-auto-compactions = true.
 * Fix a bug when max_write_buffer_size_to_maintain is set, immutable flushed memtable destruction is delayed until the next super version is installed. A memtable is not added to delete list because of its reference hold by super version and super version doesn't switch because of empty delete list. So memory usage keeps on increasing beyond write_buffer_size + max_write_buffer_size_to_maintain.
 * Avoid converting MERGES to PUTS when allow_ingest_behind is true.
+* Fix compression dictionary sampling together with `SstFileWriter`. Previously, the dictionary would be trained/finalized immediately with zero samples. Now, the whole `SstFileWriter` file is buffered in memory and then sampled.
 ### New Features
 * A new option `std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory` is added to `BackupableDBOptions`. The default value for this option is `nullptr`. If this option is null, the default backup engine checksum function (crc32c) will be used for creating, verifying, or restoring backups. If it is not null and is set to the DB custom checksum factory, the custom checksum function used in DB will also be used for creating, verifying, or restoring backups, in addition to the default checksum function (crc32c). If it is not null and is set to a custom checksum factory different than the DB custom checksum factory (which may be null), BackupEngine will return `Status::InvalidArgument()`.
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 73a744122a..b58dbe68b2 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -730,7 +730,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
       r->first_key_in_next_block = &key;
       Flush();
-      if (r->state == Rep::State::kBuffered &&
+      if (r->state == Rep::State::kBuffered && r->target_file_size != 0 &&
           r->data_begin_offset > r->target_file_size) {
         EnterUnbuffered();
       }
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index bc3766c1a1..21758f743f 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -226,12 +226,15 @@ int SstFileDumper::ShowAllCompressionSizes(
     size_t block_size,
     const std::vector<std::pair<CompressionType, const char*>>&
         compression_types,
-    int32_t compress_level_from, int32_t compress_level_to) {
+    int32_t compress_level_from, int32_t compress_level_to,
+    uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes) {
   fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
   for (auto& i : compression_types) {
     if (CompressionTypeSupported(i.first)) {
       fprintf(stdout, "Compression: %-24s\n", i.second);
       CompressionOptions compress_opt;
+      compress_opt.max_dict_bytes = max_dict_bytes;
+      compress_opt.zstd_max_train_bytes = zstd_max_train_bytes;
       for (int32_t j = compress_level_from; j <= compress_level_to; j++) {
         fprintf(stdout, "Compression level: %d", j);
         compress_opt.level = j;
diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h
index 8380daf1d7..30040b3101 100644
--- a/table/sst_file_dumper.h
+++ b/table/sst_file_dumper.h
@@ -38,9 +38,9 @@ class SstFileDumper {
   int ShowAllCompressionSizes(
       size_t block_size,
       const std::vector<std::pair<CompressionType, const char*>>&
-          compression_types,
-      int32_t compress_level_from,
-      int32_t compress_level_to);
+          compression_types,
+      int32_t compress_level_from, int32_t compress_level_to,
+      uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes);
 
   int ShowCompressionSize(
       size_t block_size,
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index 70ce6913f9..dd910d8004 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -97,6 +97,12 @@ void print_help(bool to_stderr) {
  --compression_level_to=<compression_level>
     Compression level to stop compressing when executing recompress. One compression type
     and compression_level_from must also be specified
+
+ --compression_max_dict_bytes=<uint32_t>
+    Maximum size of dictionary used to prime the compression library
+
+ --compression_zstd_max_train_bytes=<uint32_t>
+    Maximum size of training data passed to zstd's dictionary trainer
 )");
 }
@@ -156,6 +162,10 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   uint64_t total_filter_block_size = 0;
   int32_t compress_level_from = CompressionOptions::kDefaultCompressionLevel;
   int32_t compress_level_to = CompressionOptions::kDefaultCompressionLevel;
+  uint32_t compression_max_dict_bytes =
+      ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes;
+  uint32_t compression_zstd_max_train_bytes =
+      ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes;
 
   int64_t tmp_val;
@@ -244,6 +254,27 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
                            "compression_level_to must be numeric", &tmp_val)) {
       has_compression_level_to = true;
       compress_level_to = static_cast<int>(tmp_val);
+    } else if (ParseIntArg(argv[i], "--compression_max_dict_bytes=",
+                           "compression_max_dict_bytes must be numeric",
+                           &tmp_val)) {
+      if (tmp_val < 0 || tmp_val > port::kMaxUint32) {
+        fprintf(stderr, "compression_max_dict_bytes must be a uint32_t: '%s'\n",
+                argv[i]);
+        print_help(/*to_stderr*/ true);
+        return 1;
+      }
+      compression_max_dict_bytes = static_cast<uint32_t>(tmp_val);
+    } else if (ParseIntArg(argv[i], "--compression_zstd_max_train_bytes=",
+                           "compression_zstd_max_train_bytes must be numeric",
+                           &tmp_val)) {
+      if (tmp_val < 0 || tmp_val > port::kMaxUint32) {
+        fprintf(stderr,
+                "compression_zstd_max_train_bytes must be a uint32_t: '%s'\n",
+                argv[i]);
+        print_help(/*to_stderr*/ true);
+        return 1;
+      }
+      compression_zstd_max_train_bytes = static_cast<uint32_t>(tmp_val);
     } else if (strcmp(argv[i], "--help") == 0) {
       print_help(/*to_stderr*/ false);
       return 0;
@@ -371,7 +402,8 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     dumper.ShowAllCompressionSizes(
         set_block_size ? block_size : 16384,
         compression_types.empty() ? kCompressions : compression_types,
-        compress_level_from, compress_level_to);
+        compress_level_from, compress_level_to, compression_max_dict_bytes,
+        compression_zstd_max_train_bytes);
     return 0;
   }
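Usage note (not part of the patch): the HISTORY.md entry above concerns compression dictionary sampling with `SstFileWriter`. Below is a minimal sketch of the configuration it describes, assuming a RocksDB build with ZSTD support; the file path, key count, values, and dictionary sizes are illustrative and not taken from the patch.

```cpp
// Sketch: write an SST file with a ZSTD compression dictionary configured.
// Before the fix above, SstFileWriter finalized the dictionary with zero
// samples; with the fix, the file contents are buffered and sampled first.
#include <cstdio>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_writer.h"

int main() {
  rocksdb::Options options;
  options.compression = rocksdb::kZSTD;
  // Dictionary settings exercised by the fix (sizes are arbitrary here).
  options.compression_opts.max_dict_bytes = 16 * 1024;
  options.compression_opts.zstd_max_train_bytes = 64 * 16 * 1024;

  rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options);
  rocksdb::Status s = writer.Open("/tmp/dict_sample_example.sst");
  if (!s.ok()) {
    std::fprintf(stderr, "Open failed: %s\n", s.ToString().c_str());
    return 1;
  }

  // SstFileWriter requires keys in ascending order; zero-padding keeps the
  // lexicographic order consistent with the numeric order.
  char key[32];
  for (int i = 0; i < 100000 && s.ok(); ++i) {
    std::snprintf(key, sizeof(key), "key%08d", i);
    s = writer.Put(key, std::string(64, static_cast<char>('a' + i % 26)));
  }
  if (s.ok()) {
    s = writer.Finish();
  }
  std::fprintf(stdout, "result: %s\n", s.ToString().c_str());
  return s.ok() ? 0 : 1;
}
```

For offline experiments, a similar dictionary configuration can be tried against an existing file via the recompression report, using the flags added in this patch, e.g. `sst_dump --file=<sst> --command=recompress --compression_max_dict_bytes=16384 --compression_zstd_max_train_bytes=1048576`.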