Merge branch 'main' into fixPessimisticTransactionReplication

This commit is contained in:
KuDeSnik33ra 2024-11-09 17:45:10 +03:00 committed by GitHub
commit ff0c57a1f7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
324 changed files with 17775 additions and 6189 deletions

View File

@ -1,13 +1,13 @@
name: facebook/rocksdb/benchmark-linux name: facebook/rocksdb/benchmark-linux
on: workflow_dispatch on: workflow_dispatch
jobs: permissions: {}
# FIXME: when this job is fixed, it should be given a cron schedule like # FIXME: Disabled temporarily
# schedule: # schedule:
# - cron: 0 * * * * # - cron: 7 */2 * * * # At minute 7 past every 2nd hour
# workflow_dispatch: jobs:
benchmark-linux: benchmark-linux:
if: ${{ github.repository_owner == 'facebook' }} if: ${{ github.repository_owner == 'facebook' }}
runs-on: ubuntu-latest runs-on: ubuntu-latest # FIXME: change this back to self-hosted when ready
steps: steps:
- uses: actions/checkout@v4.1.0 - uses: actions/checkout@v4.1.0
- uses: "./.github/actions/build-for-benchmarks" - uses: "./.github/actions/build-for-benchmarks"

View File

@ -1,5 +1,6 @@
name: facebook/rocksdb/nightly name: facebook/rocksdb/nightly
on: workflow_dispatch on: workflow_dispatch
permissions: {}
jobs: jobs:
# These jobs would be in nightly but are failing or otherwise broken for # These jobs would be in nightly but are failing or otherwise broken for
# some reason. # some reason.

View File

@ -3,6 +3,7 @@ on:
schedule: schedule:
- cron: 0 9 * * * - cron: 0 9 * * *
workflow_dispatch: workflow_dispatch:
permissions: {}
jobs: jobs:
build-format-compatible: build-format-compatible:
if: ${{ github.repository_owner == 'facebook' }} if: ${{ github.repository_owner == 'facebook' }}
@ -59,12 +60,15 @@ jobs:
container: container:
image: zjay437/rocksdb:0.6 image: zjay437/rocksdb:0.6
options: --shm-size=16gb options: --shm-size=16gb
env:
CC: clang-13
CXX: clang++-13
steps: steps:
- uses: actions/checkout@v4.1.0 - uses: actions/checkout@v4.1.0
- uses: "./.github/actions/pre-steps" - uses: "./.github/actions/pre-steps"
- uses: "./.github/actions/setup-folly" - uses: "./.github/actions/setup-folly"
- uses: "./.github/actions/build-folly" - uses: "./.github/actions/build-folly"
- run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check - run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
- uses: "./.github/actions/post-steps" - uses: "./.github/actions/post-steps"
build-linux-valgrind: build-linux-valgrind:
if: ${{ github.repository_owner == 'facebook' }} if: ${{ github.repository_owner == 'facebook' }}
@ -76,7 +80,7 @@ jobs:
steps: steps:
- uses: actions/checkout@v4.1.0 - uses: actions/checkout@v4.1.0
- uses: "./.github/actions/pre-steps" - uses: "./.github/actions/pre-steps"
- run: PORTABLE=1 make V=1 -j32 valgrind_test - run: make V=1 -j32 valgrind_test
- uses: "./.github/actions/post-steps" - uses: "./.github/actions/post-steps"
build-windows-vs2022-avx2: build-windows-vs2022-avx2:
if: ${{ github.repository_owner == 'facebook' }} if: ${{ github.repository_owner == 'facebook' }}

View File

@ -1,5 +1,6 @@
name: facebook/rocksdb/pr-jobs-candidate name: facebook/rocksdb/pr-jobs-candidate
on: workflow_dispatch on: workflow_dispatch
permissions: {}
jobs: jobs:
# These jobs would be in pr-jobs but are failing or otherwise broken for # These jobs would be in pr-jobs but are failing or otherwise broken for
# some reason. # some reason.

View File

@ -1,5 +1,6 @@
name: facebook/rocksdb/pr-jobs name: facebook/rocksdb/pr-jobs
on: [push, pull_request] on: [push, pull_request]
permissions: {}
jobs: jobs:
# NOTE: multiple workflows would be recommended, but the current GHA UI in # NOTE: multiple workflows would be recommended, but the current GHA UI in
# PRs doesn't make it clear when there's an overall error with a workflow, # PRs doesn't make it clear when there's an overall error with a workflow,

View File

@ -32,7 +32,7 @@
# 3. cmake .. # 3. cmake ..
# 4. make -j # 4. make -j
cmake_minimum_required(VERSION 3.10) cmake_minimum_required(VERSION 3.12)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/")
include(ReadVersion) include(ReadVersion)

View File

@ -1,6 +1,70 @@
# Rocksdb Change Log # Rocksdb Change Log
> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt` > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
## 9.8.0 (10/25/2024)
### New Features
* All non-`block_cache` options in `BlockBasedTableOptions` are now mutable with `DB::SetOptions()`. See also Bug Fixes below.
* When using iterators with BlobDB, it is now possible to load large values on an on-demand basis, i.e. only if they are actually needed by the application. This can save I/O in use cases where the values associated with certain keys are not needed. For more details, see the new read option `allow_unprepared_value` and the iterator API `PrepareValue`.
* Add a new file ingestion option `IngestExternalFileOptions::fill_cache` to support not adding blocks from ingested files into block cache during file ingestion.
* The option `allow_unprepared_value` is now also supported for multi-column-family iterators (i.e. `CoalescingIterator` and `AttributeGroupIterator`).
* When a file with just one range deletion (standalone range deletion file) is ingested via bulk loading, it will be marked for compaction. During compaction, this type of files can be used to directly filter out some input files that are not protected by any snapshots and completely deleted by the standalone range deletion file.
### Behavior Changes
* During file ingestion, overlapping files level assignment are done in multiple batches, so that they can potentially be assigned to lower levels other than always land on L0.
* OPTIONS file to be loaded by remote worker is now preserved so that it does not get purged by the primary host. A similar technique as how we are preserving new SST files from getting purged is used for this. min_options_file_numbers_ is tracked like pending_outputs_ is tracked.
* Trim readahead_size during scans so data blocks containing keys that are not in the same prefix as the seek key in `Seek()` are not prefetched when `ReadOptions::auto_readahead_size=true` (default value) and `ReadOptions::prefix_same_as_start = true`
* Assigning levels for external files are done in the same way for universal compaction and leveled compaction. The old behavior tends to assign files to L0 while the new behavior will assign the files to the lowest level possible.
### Bug Fixes
* Fix a longstanding race condition in SetOptions for `block_based_table_factory` options. The fix has some subtle behavior changes because of copying and replacing the TableFactory on a change with SetOptions, including requiring an Iterator::Refresh() for an existing Iterator to use the latest options.
* Fix under counting of allocated memory in the compressed secondary cache due to looking at the compressed block size rather than the actual memory allocated, which could be larger due to internal fragmentation.
* `GetApproximateMemTableStats()` could return disastrously bad estimates 5-25% of the time. The function has been re-engineered to return much better estimates with similar CPU cost.
* Skip insertion of compressed blocks in the secondary cache if the lowest_used_cache_tier DB option is kVolatileTier.
* Fix an issue in level compaction where a small CF with small compaction debt can cause the DB to allow parallel compactions. (#13054)
* Several DB option settings could be lost through `GetOptionsFromString()`, possibly elsewhere as well. Affected options, now fixed:`background_close_inactive_wals`, `write_dbid_to_manifest`, `write_identity_file`, `prefix_seek_opt_in_only`
## 9.7.0 (09/20/2024)
### New Features
* Make Cache a customizable class that can be instantiated by the object registry.
* Add new option `prefix_seek_opt_in_only` that makes iterators generally safer when you might set a `prefix_extractor`. When `prefix_seek_opt_in_only=true`, which is expected to be the future default, prefix seek is only used when `prefix_same_as_start` or `auto_prefix_mode` are set. Also, `prefix_same_as_start` and `auto_prefix_mode` now allow prefix filtering even with `total_order_seek=true`.
* Add a new table property "rocksdb.key.largest.seqno" which records the largest sequence number of all keys in file. It is verified to be zero during SST file ingestion.
### Behavior Changes
* Changed the semantics of the BlobDB configuration option `blob_garbage_collection_force_threshold` to define a threshold for the overall garbage ratio of all blob files currently eligible for garbage collection (according to `blob_garbage_collection_age_cutoff`). This can provide better control over space amplification at the cost of slightly higher write amplification.
* Set `write_dbid_to_manifest=true` by default. This means DB ID will now be preserved through backups, checkpoints, etc. by default. Also add `write_identity_file` option which can be set to false for anticipated future behavior.
* In FIFO compaction, compactions for changing file temperature (configured by option `file_temperature_age_thresholds`) will compact one file at a time, instead of merging multiple eligible file together (#13018).
* Support ingesting db generated files using hard link, i.e. IngestExternalFileOptions::move_files/link_files and IngestExternalFileOptions::allow_db_generated_files.
* Add a new file ingestion option `IngestExternalFileOptions::link_files` to hard link input files and preserve original files links after ingestion.
* DB::Close now untracks files in SstFileManager, making avaialble any space used
by them. Prior to this change they would be orphaned until the DB is re-opened.
### Bug Fixes
* Fix a bug in CompactRange() where result files may not be compacted in any future compaction. This can only happen when users configure CompactRangeOptions::change_level to true and the change level step of manual compaction fails (#13009).
* Fix handling of dynamic change of `prefix_extractor` with memtable prefix filter. Previously, prefix seek could mix different prefix interpretations between memtable and SST files. Now the latest `prefix_extractor` at the time of iterator creation or refresh is respected.
* Fix a bug with manual_wal_flush and auto error recovery from WAL failure that may cause CFs to be inconsistent (#12995). The fix will set potential WAL write failure as fatal error when manual_wal_flush is true, and disables auto error recovery from these errors.
## 9.6.0 (08/19/2024)
### New Features
* *Best efforts recovery supports recovering to incomplete Version with a clean seqno cut that presents a valid point in time view from the user's perspective, if versioning history doesn't include atomic flush.
* New option `BlockBasedTableOptions::decouple_partitioned_filters` should improve efficiency in serving read queries because filter and index partitions can consistently target the configured `metadata_block_size`. This option is currently opt-in.
* Introduce a new mutable CF option `paranoid_memory_checks`. It enables additional validation on data integrity during reads/scanning. Currently, skip list based memtable will validate key ordering during look up and scans.
### Public API Changes
* Add ticker stats to count file read retries due to checksum mismatch
* Adds optional installation callback function for remote compaction
### Behavior Changes
* There may be less intra-L0 compaction triggered by total L0 size being too small. We now use compensated file size (tombstones are assigned some value size) when calculating L0 size and reduce the threshold for L0 size limit. This is to avoid accumulating too much data/tombstones in L0.
### Bug Fixes
* *Make DestroyDB supports slow deletion when it's configured in `SstFileManager`. The slow deletion is subject to the configured `rate_bytes_per_sec`, but not subject to the `max_trash_db_ratio`.
* Fixed a bug where we set unprep_seqs_ even when WriteImpl() fails. This was caught by stress test write fault injection in WriteImpl(). This may have incorrectly caused iteration creation failure for unvalidated writes or returned wrong result for WriteUnpreparedTxn::GetUnpreparedSequenceNumbers().
* Fixed a bug where successful write right after error recovery for last failed write finishes causes duplicate WAL entries
* Fixed a data race involving the background error status in `unordered_write` mode.
* *Fix a bug where file snapshot functions like backup, checkpoint may attempt to copy a non-existing manifest file. #12882
* Fix a bug where per kv checksum corruption may be ignored in MultiGet().
* Fix a race condition in pessimistic transactions that could allow multiple transactions with the same name to be registered simultaneously, resulting in a crash or other unpredictable behavior.
## 9.5.0 (07/19/2024) ## 9.5.0 (07/19/2024)
### Public API Changes ### Public API Changes
* Introduced new C API function rocksdb_writebatch_iterate_cf for column family-aware iteration over the contents of a WriteBatch * Introduced new C API function rocksdb_writebatch_iterate_cf for column family-aware iteration over the contents of a WriteBatch

View File

@ -630,6 +630,11 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
# Not yet supported: --show-leak-kinds=definite,possible,reachable --errors-for-leak-kinds=definite,possible,reachable # Not yet supported: --show-leak-kinds=definite,possible,reachable --errors-for-leak-kinds=definite,possible,reachable
# Work around valgrind hanging on systems with limited internet access
ifneq ($(shell which git 2>/dev/null && git config --get https.proxy),)
export DEBUGINFOD_URLS=
endif
TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST) TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST)
BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES)) BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES))
CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES)) CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES))
@ -1164,16 +1169,16 @@ ubsan_crash_test_with_best_efforts_recovery: clean
$(MAKE) clean $(MAKE) clean
full_valgrind_test: full_valgrind_test:
ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check
full_valgrind_test_some: full_valgrind_test_some:
ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check_some
valgrind_test: valgrind_test:
ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check
valgrind_test_some: valgrind_test_some:
ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check_some
valgrind_check: $(TESTS) valgrind_check: $(TESTS)
$(MAKE) DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" gen_parallel_tests $(MAKE) DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" gen_parallel_tests
@ -2484,7 +2489,7 @@ checkout_folly:
fi fi
@# Pin to a particular version for public CI, so that PR authors don't @# Pin to a particular version for public CI, so that PR authors don't
@# need to worry about folly breaking our integration. Update periodically @# need to worry about folly breaking our integration. Update periodically
cd third-party/folly && git reset --hard c48fdd205c1c291651749d532b8055fe822bba25 cd third-party/folly && git reset --hard 03041f014b6e6ebb6119ffae8b7a37308f52e913
@# NOTE: this hack is required for clang in some cases @# NOTE: this hack is required for clang in some cases
perl -pi -e 's/int rv = syscall/int rv = (int)syscall/' third-party/folly/folly/detail/Futex.cpp perl -pi -e 's/int rv = syscall/int rv = (int)syscall/' third-party/folly/folly/detail/Futex.cpp
@# NOTE: this hack is required for gcc in some cases @# NOTE: this hack is required for gcc in some cases

16
TARGETS
View File

@ -362,9 +362,9 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"//folly/experimental/coro:coroutine", "//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task", "//folly/experimental/coro:task",
"//folly/synchronization:distributed_mutex", "//folly/synchronization:distributed_mutex",
], headers=None, link_whole=False, extra_test_libs=False) ], headers=glob(["**/*.h"]), link_whole=False, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[], deps=[":rocksdb_lib"], headers=None, link_whole=True, extra_test_libs=False) cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[], deps=[":rocksdb_lib"], headers=[], link_whole=True, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_test_lib", srcs=[ cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
"db/db_test_util.cc", "db/db_test_util.cc",
@ -378,7 +378,7 @@ cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
"tools/trace_analyzer_tool.cc", "tools/trace_analyzer_tool.cc",
"utilities/agg_merge/test_agg_merge.cc", "utilities/agg_merge/test_agg_merge.cc",
"utilities/cassandra/test_utils.cc", "utilities/cassandra/test_utils.cc",
], deps=[":rocksdb_lib"], headers=None, link_whole=False, extra_test_libs=True) ], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=True)
cpp_library_wrapper(name="rocksdb_tools_lib", srcs=[ cpp_library_wrapper(name="rocksdb_tools_lib", srcs=[
"test_util/testutil.cc", "test_util/testutil.cc",
@ -386,9 +386,9 @@ cpp_library_wrapper(name="rocksdb_tools_lib", srcs=[
"tools/db_bench_tool.cc", "tools/db_bench_tool.cc",
"tools/simulated_hybrid_file_system.cc", "tools/simulated_hybrid_file_system.cc",
"tools/trace_analyzer_tool.cc", "tools/trace_analyzer_tool.cc",
], deps=[":rocksdb_lib"], headers=None, link_whole=False, extra_test_libs=False) ], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_cache_bench_tools_lib", srcs=["cache/cache_bench_tool.cc"], deps=[":rocksdb_lib"], headers=None, link_whole=False, extra_test_libs=False) cpp_library_wrapper(name="rocksdb_cache_bench_tools_lib", srcs=["cache/cache_bench_tool.cc"], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=False)
rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
"db_stress_tool/batched_ops_stress.cc", "db_stress_tool/batched_ops_stress.cc",
@ -410,13 +410,15 @@ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
"test_util/testutil.cc", "test_util/testutil.cc",
"tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc",
"tools/trace_analyzer_tool.cc", "tools/trace_analyzer_tool.cc",
], headers=None) ], headers=[])
cpp_binary_wrapper(name="ldb", srcs=["tools/ldb.cc"], deps=[":rocksdb_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False) cpp_binary_wrapper(name="ldb", srcs=["tools/ldb.cc"], deps=[":rocksdb_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
cpp_binary_wrapper(name="db_stress", srcs=["db_stress_tool/db_stress.cc"], deps=[":rocksdb_stress_lib"], extra_preprocessor_flags=[], extra_bench_libs=False) cpp_binary_wrapper(name="db_stress", srcs=["db_stress_tool/db_stress.cc"], deps=[":rocksdb_stress_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
cpp_binary_wrapper(name="db_bench", srcs=["tools/db_bench.cc"], deps=[":rocksdb_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
cpp_binary_wrapper(name="cache_bench", srcs=["cache/cache_bench.cc"], deps=[":rocksdb_cache_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False) cpp_binary_wrapper(name="cache_bench", srcs=["cache/cache_bench.cc"], deps=[":rocksdb_cache_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
cpp_binary_wrapper(name="ribbon_bench", srcs=["microbench/ribbon_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True) cpp_binary_wrapper(name="ribbon_bench", srcs=["microbench/ribbon_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True)
@ -5024,7 +5026,7 @@ cpp_unittest_wrapper(name="dynamic_bloom_test",
extra_compiler_flags=[]) extra_compiler_flags=[])
cpp_library_wrapper(name="env_basic_test_lib", srcs=["env/env_basic_test.cc"], deps=[":rocksdb_test_lib"], headers=None, link_whole=False, extra_test_libs=True) cpp_library_wrapper(name="env_basic_test_lib", srcs=["env/env_basic_test.cc"], deps=[":rocksdb_test_lib"], headers=[], link_whole=False, extra_test_libs=True)
cpp_unittest_wrapper(name="env_basic_test", cpp_unittest_wrapper(name="env_basic_test",
srcs=["env/env_basic_test.cc"], srcs=["env/env_basic_test.cc"],

View File

@ -1,6 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals
try: try:
from builtins import str from builtins import str
@ -11,7 +10,7 @@ import json
import os import os
import sys import sys
from targets_builder import TARGETSBuilder from targets_builder import TARGETSBuilder, LiteralValue
from util import ColorString from util import ColorString
@ -132,7 +131,7 @@ def generate_targets(repo_path, deps_map):
if len(sys.argv) >= 2: if len(sys.argv) >= 2:
# Heuristically quote and canonicalize whitespace for inclusion # Heuristically quote and canonicalize whitespace for inclusion
# in how the file was generated. # in how the file was generated.
extra_argv = " '{0}'".format(" ".join(sys.argv[1].split())) extra_argv = " '{}'".format(" ".join(sys.argv[1].split()))
TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path, extra_argv) TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path, extra_argv)
@ -150,6 +149,7 @@ def generate_targets(repo_path, deps_map):
"//folly/experimental/coro:task", "//folly/experimental/coro:task",
"//folly/synchronization:distributed_mutex", "//folly/synchronization:distributed_mutex",
], ],
headers=LiteralValue("glob([\"**/*.h\"])")
) )
# rocksdb_whole_archive_lib # rocksdb_whole_archive_lib
TARGETS.add_library( TARGETS.add_library(
@ -158,7 +158,6 @@ def generate_targets(repo_path, deps_map):
deps=[ deps=[
":rocksdb_lib", ":rocksdb_lib",
], ],
headers=None,
extra_external_deps="", extra_external_deps="",
link_whole=True, link_whole=True,
) )
@ -201,6 +200,10 @@ def generate_targets(repo_path, deps_map):
TARGETS.add_binary( TARGETS.add_binary(
"db_stress", ["db_stress_tool/db_stress.cc"], [":rocksdb_stress_lib"] "db_stress", ["db_stress_tool/db_stress.cc"], [":rocksdb_stress_lib"]
) )
# db_bench binary
TARGETS.add_binary(
"db_bench", ["tools/db_bench.cc"], [":rocksdb_tools_lib"]
)
# cache_bench binary # cache_bench binary
TARGETS.add_binary( TARGETS.add_binary(
"cache_bench", ["cache/cache_bench.cc"], [":rocksdb_cache_bench_tools_lib"] "cache_bench", ["cache/cache_bench.cc"], [":rocksdb_cache_bench_tools_lib"]
@ -209,7 +212,7 @@ def generate_targets(repo_path, deps_map):
for src in src_mk.get("MICROBENCH_SOURCES", []): for src in src_mk.get("MICROBENCH_SOURCES", []):
name = src.rsplit("/", 1)[1].split(".")[0] if "/" in src else src.split(".")[0] name = src.rsplit("/", 1)[1].split(".")[0] if "/" in src else src.split(".")[0]
TARGETS.add_binary(name, [src], [], extra_bench_libs=True) TARGETS.add_binary(name, [src], [], extra_bench_libs=True)
print("Extra dependencies:\n{0}".format(json.dumps(deps_map))) print(f"Extra dependencies:\n{json.dumps(deps_map)}")
# Dictionary test executable name -> relative source file path # Dictionary test executable name -> relative source file path
test_source_map = {} test_source_map = {}

View File

@ -1,5 +1,4 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals
try: try:
from builtins import object, str from builtins import object, str
@ -9,17 +8,28 @@ import pprint
import targets_cfg import targets_cfg
class LiteralValue:
def __init__(self, value):
self.value = value
def __str__(self):
return str(self.value)
def smart_quote_value(val):
if isinstance(val, LiteralValue):
return str(val)
return '"%s"' % val
def pretty_list(lst, indent=8): def pretty_list(lst, indent=8):
if lst is None or len(lst) == 0: if lst is None or len(lst) == 0:
return "" return ""
if len(lst) == 1: if len(lst) == 1:
return '"%s"' % lst[0] return smart_quote_value(lst[0])
separator = '",\n%s"' % (" " * indent) separator = ',\n%s' % (" " * indent)
res = separator.join(sorted(lst)) res = separator.join(sorted(map(smart_quote_value, lst)))
res = "\n" + (" " * indent) + '"' + res + '",\n' + (" " * (indent - 4)) res = "\n" + (" " * indent) + res + ',\n' + (" " * (indent - 4))
return res return res
@ -48,7 +58,12 @@ class TARGETSBuilder:
extra_test_libs=False, extra_test_libs=False,
): ):
if headers is not None: if headers is not None:
headers = "[" + pretty_list(headers) + "]" if isinstance(headers, LiteralValue):
headers = str(headers)
else:
headers = "[" + pretty_list(headers) + "]"
else:
headers = "[]"
with open(self.path, "ab") as targets_file: with open(self.path, "ab") as targets_file:
targets_file.write( targets_file.write(
targets_cfg.library_template.format( targets_cfg.library_template.format(
@ -65,8 +80,7 @@ class TARGETSBuilder:
self.total_lib = self.total_lib + 1 self.total_lib = self.total_lib + 1
def add_rocksdb_library(self, name, srcs, headers=None, external_dependencies=None): def add_rocksdb_library(self, name, srcs, headers=None, external_dependencies=None):
if headers is not None: headers = "[" + pretty_list(headers) + "]"
headers = "[" + pretty_list(headers) + "]"
with open(self.path, "ab") as targets_file: with open(self.path, "ab") as targets_file:
targets_file.write( targets_file.write(
targets_cfg.rocksdb_library_template.format( targets_cfg.rocksdb_library_template.format(

View File

@ -1,5 +1,4 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals
rocksdb_target_header_template = """# This file \100generated by: rocksdb_target_header_template = """# This file \100generated by:
#$ python3 buckifier/buckify_rocksdb.py{extra_argv} #$ python3 buckifier/buckify_rocksdb.py{extra_argv}

View File

@ -2,7 +2,6 @@
""" """
This module keeps commonly used components. This module keeps commonly used components.
""" """
from __future__ import absolute_import, division, print_function, unicode_literals
try: try:
from builtins import object from builtins import object

View File

@ -25,7 +25,6 @@
# #
# The solution is to move the include out of the #ifdef. # The solution is to move the include out of the #ifdef.
from __future__ import print_function
import argparse import argparse
import re import re
@ -62,7 +61,7 @@ def expand_include(
included.add(include_path) included.add(include_path)
with open(include_path) as f: with open(include_path) as f:
print('#line 1 "{}"'.format(include_path), file=source_out) print(f'#line 1 "{include_path}"', file=source_out)
process_file( process_file(
f, include_path, source_out, header_out, include_paths, public_include_paths f, include_path, source_out, header_out, include_paths, public_include_paths
) )
@ -118,7 +117,7 @@ def process_file(
) )
if expanded: if expanded:
print('#line {} "{}"'.format(line + 1, abs_path), file=source_out) print(f'#line {line + 1} "{abs_path}"', file=source_out)
elif text != "#pragma once\n": elif text != "#pragma once\n":
source_out.write(text) source_out.write(text)
@ -157,8 +156,8 @@ def main():
with open(filename) as f, open(args.source_out, "w") as source_out, open( with open(filename) as f, open(args.source_out, "w") as source_out, open(
args.header_out, "w" args.header_out, "w"
) as header_out: ) as header_out:
print('#line 1 "{}"'.format(filename), file=source_out) print(f'#line 1 "{filename}"', file=source_out)
print('#include "{}"'.format(header_out.name), file=source_out) print(f'#include "{header_out.name}"', file=source_out)
process_file( process_file(
f, abs_path, source_out, header_out, include_paths, public_include_paths f, abs_path, source_out, header_out, include_paths, public_include_paths
) )

View File

@ -102,7 +102,7 @@ class BenchmarkUtils:
class ResultParser: class ResultParser:
def __init__(self, field="(\w|[+-:.%])+", intrafield="(\s)+", separator="\t"): def __init__(self, field=r"(\w|[+-:.%])+", intrafield=r"(\s)+", separator="\t"):
self.field = re.compile(field) self.field = re.compile(field)
self.intra = re.compile(intrafield) self.intra = re.compile(intrafield)
self.sep = re.compile(separator) self.sep = re.compile(separator)
@ -159,7 +159,7 @@ class ResultParser:
def load_report_from_tsv(filename: str): def load_report_from_tsv(filename: str):
file = open(filename, "r") file = open(filename)
contents = file.readlines() contents = file.readlines()
file.close() file.close()
parser = ResultParser() parser = ResultParser()

View File

@ -9,7 +9,6 @@
- Prints those error messages to stdout - Prints those error messages to stdout
""" """
from __future__ import absolute_import, division, print_function, unicode_literals
import re import re
import sys import sys
@ -43,7 +42,7 @@ class GTestErrorParser(ErrorParserBase):
return None return None
gtest_fail_match = self._GTEST_FAIL_PATTERN.match(line) gtest_fail_match = self._GTEST_FAIL_PATTERN.match(line)
if gtest_fail_match: if gtest_fail_match:
return "%s failed: %s" % (self._last_gtest_name, gtest_fail_match.group(1)) return "{} failed: {}".format(self._last_gtest_name, gtest_fail_match.group(1))
return None return None
@ -66,52 +65,52 @@ class CompilerErrorParser(MatchErrorParser):
# format (link error): # format (link error):
# '<filename>:<line #>: error: <error msg>' # '<filename>:<line #>: error: <error msg>'
# The below regex catches both # The below regex catches both
super(CompilerErrorParser, self).__init__(r"\S+:\d+: error:") super().__init__(r"\S+:\d+: error:")
class ScanBuildErrorParser(MatchErrorParser): class ScanBuildErrorParser(MatchErrorParser):
def __init__(self): def __init__(self):
super(ScanBuildErrorParser, self).__init__(r"scan-build: \d+ bugs found.$") super().__init__(r"scan-build: \d+ bugs found.$")
class DbCrashErrorParser(MatchErrorParser): class DbCrashErrorParser(MatchErrorParser):
def __init__(self): def __init__(self):
super(DbCrashErrorParser, self).__init__(r"\*\*\*.*\^$|TEST FAILED.") super().__init__(r"\*\*\*.*\^$|TEST FAILED.")
class WriteStressErrorParser(MatchErrorParser): class WriteStressErrorParser(MatchErrorParser):
def __init__(self): def __init__(self):
super(WriteStressErrorParser, self).__init__( super().__init__(
r"ERROR: write_stress died with exitcode=\d+" r"ERROR: write_stress died with exitcode=\d+"
) )
class AsanErrorParser(MatchErrorParser): class AsanErrorParser(MatchErrorParser):
def __init__(self): def __init__(self):
super(AsanErrorParser, self).__init__(r"==\d+==ERROR: AddressSanitizer:") super().__init__(r"==\d+==ERROR: AddressSanitizer:")
class UbsanErrorParser(MatchErrorParser): class UbsanErrorParser(MatchErrorParser):
def __init__(self): def __init__(self):
# format: '<filename>:<line #>:<column #>: runtime error: <error msg>' # format: '<filename>:<line #>:<column #>: runtime error: <error msg>'
super(UbsanErrorParser, self).__init__(r"\S+:\d+:\d+: runtime error:") super().__init__(r"\S+:\d+:\d+: runtime error:")
class ValgrindErrorParser(MatchErrorParser): class ValgrindErrorParser(MatchErrorParser):
def __init__(self): def __init__(self):
# just grab the summary, valgrind doesn't clearly distinguish errors # just grab the summary, valgrind doesn't clearly distinguish errors
# from other log messages. # from other log messages.
super(ValgrindErrorParser, self).__init__(r"==\d+== ERROR SUMMARY:") super().__init__(r"==\d+== ERROR SUMMARY:")
class CompatErrorParser(MatchErrorParser): class CompatErrorParser(MatchErrorParser):
def __init__(self): def __init__(self):
super(CompatErrorParser, self).__init__(r"==== .*[Ee]rror.* ====$") super().__init__(r"==== .*[Ee]rror.* ====$")
class TsanErrorParser(MatchErrorParser): class TsanErrorParser(MatchErrorParser):
def __init__(self): def __init__(self):
super(TsanErrorParser, self).__init__(r"WARNING: ThreadSanitizer:") super().__init__(r"WARNING: ThreadSanitizer:")
_TEST_NAME_TO_PARSERS = { _TEST_NAME_TO_PARSERS = {

30
cache/cache.cc vendored
View File

@ -133,19 +133,25 @@ Status Cache::CreateFromString(const ConfigOptions& config_options,
std::shared_ptr<Cache>* result) { std::shared_ptr<Cache>* result) {
Status status; Status status;
std::shared_ptr<Cache> cache; std::shared_ptr<Cache> cache;
if (value.find('=') == std::string::npos) { if (StartsWith(value, "null")) {
cache = NewLRUCache(ParseSizeT(value)); cache = nullptr;
} else { } else if (value.find("://") == std::string::npos) {
LRUCacheOptions cache_opts; if (value.find('=') == std::string::npos) {
status = OptionTypeInfo::ParseStruct(config_options, "", cache = NewLRUCache(ParseSizeT(value));
&lru_cache_options_type_info, "", } else {
value, &cache_opts); LRUCacheOptions cache_opts;
if (status.ok()) { status = OptionTypeInfo::ParseStruct(config_options, "",
cache = NewLRUCache(cache_opts); &lru_cache_options_type_info, "",
value, &cache_opts);
if (status.ok()) {
cache = NewLRUCache(cache_opts);
}
} }
} if (status.ok()) {
if (status.ok()) { result->swap(cache);
result->swap(cache); }
} else {
status = LoadSharedObject<Cache>(config_options, value, result);
} }
return status; return status;
} }

26
cache/cache_test.cc vendored
View File

@ -886,6 +886,32 @@ TEST_P(CacheTest, ApplyToAllEntriesDuringResize) {
ASSERT_EQ(special_count, kSpecialCount); ASSERT_EQ(special_count, kSpecialCount);
} }
TEST_P(CacheTest, ApplyToHandleTest) {
std::string callback_state;
const auto callback = [&](const Slice& key, Cache::ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper) {
callback_state = std::to_string(DecodeKey(key)) + "," +
std::to_string(DecodeValue(value)) + "," +
std::to_string(charge);
assert(helper == &CacheTest::kHelper);
};
std::vector<std::string> inserted;
for (int i = 0; i < 10; ++i) {
Insert(i, i * 2, i + 1);
inserted.push_back(std::to_string(i) + "," + std::to_string(i * 2) + "," +
std::to_string(i + 1));
}
for (int i = 0; i < 10; ++i) {
Cache::Handle* handle = cache_->Lookup(EncodeKey(i));
cache_->ApplyToHandle(cache_.get(), handle, callback);
EXPECT_EQ(inserted[i], callback_state);
cache_->Release(handle);
}
}
TEST_P(CacheTest, DefaultShardBits) { TEST_P(CacheTest, DefaultShardBits) {
// Prevent excessive allocation (to save time & space) // Prevent excessive allocation (to save time & space)
estimated_value_size_ = 100000; estimated_value_size_ = 100000;

16
cache/clock_cache.cc vendored
View File

@ -1444,6 +1444,22 @@ const Cache::CacheItemHelper* BaseHyperClockCache<Table>::GetCacheItemHelper(
return h->helper; return h->helper;
} }
template <class Table>
void BaseHyperClockCache<Table>::ApplyToHandle(
Cache* cache, Handle* handle,
const std::function<void(const Slice& key, Cache::ObjectPtr value,
size_t charge, const CacheItemHelper* helper)>&
callback) {
BaseHyperClockCache<Table>* cache_ptr =
static_cast<BaseHyperClockCache<Table>*>(cache);
auto h = static_cast<const typename Table::HandleImpl*>(handle);
UniqueId64x2 unhashed;
auto hash_seed = cache_ptr->GetShard(h->GetHash()).GetTable().GetHashSeed();
callback(
ClockCacheShard<Table>::ReverseHash(h->hashed_key, &unhashed, hash_seed),
h->value, h->GetTotalCharge(), h->helper);
}
namespace { namespace {
// For each cache shard, estimate what the table load factor would be if // For each cache shard, estimate what the table load factor would be if

6
cache/clock_cache.h vendored
View File

@ -1128,6 +1128,12 @@ class BaseHyperClockCache : public ShardedCache<ClockCacheShard<Table>> {
const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override; const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
void ApplyToHandle(
Cache* cache, Handle* handle,
const std::function<void(const Slice& key, Cache::ObjectPtr obj,
size_t charge, const CacheItemHelper* helper)>&
callback) override;
void ReportProblems( void ReportProblems(
const std::shared_ptr<Logger>& /*info_log*/) const override; const std::shared_ptr<Logger>& /*info_log*/) const override;
}; };

View File

@ -79,7 +79,11 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1, data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1,
static_cast<uint32_t*>(&source_32)); static_cast<uint32_t*>(&source_32));
source = static_cast<CacheTier>(source_32); source = static_cast<CacheTier>(source_32);
handle_value_charge -= (data_ptr - ptr->get()); uint64_t data_size = 0;
data_ptr = GetVarint64Ptr(data_ptr, ptr->get() + handle_value_charge,
static_cast<uint64_t*>(&data_size));
assert(handle_value_charge > data_size);
handle_value_charge = data_size;
} }
MemoryAllocator* allocator = cache_options_.memory_allocator.get(); MemoryAllocator* allocator = cache_options_.memory_allocator.get();
@ -169,13 +173,15 @@ Status CompressedSecondaryCache::InsertInternal(
} }
auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge); auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge);
char header[10]; char header[20];
char* payload = header; char* payload = header;
payload = EncodeVarint32(payload, static_cast<uint32_t>(type)); payload = EncodeVarint32(payload, static_cast<uint32_t>(type));
payload = EncodeVarint32(payload, static_cast<uint32_t>(source)); payload = EncodeVarint32(payload, static_cast<uint32_t>(source));
size_t data_size = (*helper->size_cb)(value);
char* data_size_ptr = payload;
payload = EncodeVarint64(payload, data_size);
size_t header_size = payload - header; size_t header_size = payload - header;
size_t data_size = (*helper->size_cb)(value);
size_t total_size = data_size + header_size; size_t total_size = data_size + header_size;
CacheAllocationPtr ptr = CacheAllocationPtr ptr =
AllocateBlock(total_size, cache_options_.memory_allocator.get()); AllocateBlock(total_size, cache_options_.memory_allocator.get());
@ -210,6 +216,8 @@ Status CompressedSecondaryCache::InsertInternal(
val = Slice(compressed_val); val = Slice(compressed_val);
data_size = compressed_val.size(); data_size = compressed_val.size();
payload = EncodeVarint64(data_size_ptr, data_size);
header_size = payload - header;
total_size = header_size + data_size; total_size = header_size + data_size;
PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, data_size); PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, data_size);
@ -222,14 +230,21 @@ Status CompressedSecondaryCache::InsertInternal(
PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1); PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1);
if (cache_options_.enable_custom_split_merge) { if (cache_options_.enable_custom_split_merge) {
size_t charge{0}; size_t split_charge{0};
CacheValueChunk* value_chunks_head = CacheValueChunk* value_chunks_head = SplitValueIntoChunks(
SplitValueIntoChunks(val, cache_options_.compression_type, charge); val, cache_options_.compression_type, split_charge);
return cache_->Insert(key, value_chunks_head, internal_helper, charge); return cache_->Insert(key, value_chunks_head, internal_helper,
split_charge);
} else { } else {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
size_t charge = malloc_usable_size(ptr.get());
#else
size_t charge = total_size;
#endif
std::memcpy(ptr.get(), header, header_size); std::memcpy(ptr.get(), header, header_size);
CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr)); CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
return cache_->Insert(key, buf, internal_helper, total_size); charge += sizeof(CacheAllocationPtr);
return cache_->Insert(key, buf, internal_helper, charge);
} }
} }
@ -398,6 +413,21 @@ const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
} }
} }
size_t CompressedSecondaryCache::TEST_GetCharge(const Slice& key) {
Cache::Handle* lru_handle = cache_->Lookup(key);
if (lru_handle == nullptr) {
return 0;
}
size_t charge = cache_->GetCharge(lru_handle);
if (cache_->Value(lru_handle) != nullptr &&
!cache_options_.enable_custom_split_merge) {
charge -= 10;
}
cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
return charge;
}
std::shared_ptr<SecondaryCache> std::shared_ptr<SecondaryCache>
CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const { CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const {
return std::make_shared<CompressedSecondaryCache>(*this); return std::make_shared<CompressedSecondaryCache>(*this);

View File

@ -139,6 +139,8 @@ class CompressedSecondaryCache : public SecondaryCache {
const Cache::CacheItemHelper* helper, const Cache::CacheItemHelper* helper,
CompressionType type, CacheTier source); CompressionType type, CacheTier source);
size_t TEST_GetCharge(const Slice& key);
// TODO: clean up to use cleaner interfaces in typed_cache.h // TODO: clean up to use cleaner interfaces in typed_cache.h
const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const; const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const;
std::shared_ptr<Cache> cache_; std::shared_ptr<Cache> cache_;

View File

@ -39,6 +39,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
protected: protected:
void BasicTestHelper(std::shared_ptr<SecondaryCache> sec_cache, void BasicTestHelper(std::shared_ptr<SecondaryCache> sec_cache,
bool sec_cache_is_compressed) { bool sec_cache_is_compressed) {
CompressedSecondaryCache* comp_sec_cache =
static_cast<CompressedSecondaryCache*>(sec_cache.get());
get_perf_context()->Reset(); get_perf_context()->Reset();
bool kept_in_sec_cache{true}; bool kept_in_sec_cache{true};
// Lookup an non-existent key. // Lookup an non-existent key.
@ -66,6 +68,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false)); ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
ASSERT_GT(comp_sec_cache->TEST_GetCharge(key1), 1000);
std::unique_ptr<SecondaryCacheResultHandle> handle1_2 = std::unique_ptr<SecondaryCacheResultHandle> handle1_2 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true, sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true,
/*stats=*/nullptr, kept_in_sec_cache); /*stats=*/nullptr, kept_in_sec_cache);

11
cache/lru_cache.cc vendored
View File

@ -677,6 +677,17 @@ const Cache::CacheItemHelper* LRUCache::GetCacheItemHelper(
return h->helper; return h->helper;
} }
void LRUCache::ApplyToHandle(
Cache* cache, Handle* handle,
const std::function<void(const Slice& key, ObjectPtr value, size_t charge,
const CacheItemHelper* helper)>& callback) {
auto cache_ptr = static_cast<LRUCache*>(cache);
auto h = static_cast<const LRUHandle*>(handle);
callback(h->key(), h->value,
h->GetCharge(cache_ptr->GetShard(0).metadata_charge_policy_),
h->helper);
}
size_t LRUCache::TEST_GetLRUSize() { size_t LRUCache::TEST_GetLRUSize() {
return SumOverShards([](LRUCacheShard& cs) { return cs.TEST_GetLRUSize(); }); return SumOverShards([](LRUCacheShard& cs) { return cs.TEST_GetLRUSize(); });
} }

6
cache/lru_cache.h vendored
View File

@ -452,6 +452,12 @@ class LRUCache
size_t GetCharge(Handle* handle) const override; size_t GetCharge(Handle* handle) const override;
const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override; const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
void ApplyToHandle(
Cache* cache, Handle* handle,
const std::function<void(const Slice& key, ObjectPtr obj, size_t charge,
const CacheItemHelper* helper)>& callback)
override;
// Retrieves number of elements in LRU, for unit test purpose only. // Retrieves number of elements in LRU, for unit test purpose only.
size_t TEST_GetLRUSize(); size_t TEST_GetLRUSize();
// Retrieves high pri pool ratio. // Retrieves high pri pool ratio.

View File

@ -271,7 +271,8 @@ Status CacheWithSecondaryAdapter::Insert(const Slice& key, ObjectPtr value,
// Warm up the secondary cache with the compressed block. The secondary // Warm up the secondary cache with the compressed block. The secondary
// cache may choose to ignore it based on the admission policy. // cache may choose to ignore it based on the admission policy.
if (value != nullptr && !compressed_value.empty() && if (value != nullptr && !compressed_value.empty() &&
adm_policy_ == TieredAdmissionPolicy::kAdmPolicyThreeQueue) { adm_policy_ == TieredAdmissionPolicy::kAdmPolicyThreeQueue &&
helper->IsSecondaryCacheCompatible()) {
Status status = secondary_cache_->InsertSaved(key, compressed_value, type); Status status = secondary_cache_->InsertSaved(key, compressed_value, type);
assert(status.ok() || status.IsNotSupported()); assert(status.ok() || status.IsNotSupported());
} }

View File

@ -253,6 +253,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicTest) {
table_options.cache_index_and_filter_blocks = false; table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions(); Options options = GetDefaultOptions();
options.create_if_missing = true; options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.table_factory.reset(NewBlockBasedTableFactory(table_options));
// Disable paranoid_file_checks so that flush will not read back the newly // Disable paranoid_file_checks so that flush will not read back the newly
@ -364,6 +365,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) {
table_options.cache_index_and_filter_blocks = false; table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions(); Options options = GetDefaultOptions();
options.create_if_missing = true; options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.paranoid_file_checks = false; options.paranoid_file_checks = false;
@ -506,6 +508,7 @@ TEST_F(DBTieredSecondaryCacheTest, WaitAllTest) {
table_options.cache_index_and_filter_blocks = false; table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions(); Options options = GetDefaultOptions();
options.create_if_missing = true; options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.paranoid_file_checks = false; options.paranoid_file_checks = false;
@ -606,6 +609,7 @@ TEST_F(DBTieredSecondaryCacheTest, ReadyBeforeWaitAllTest) {
table_options.cache_index_and_filter_blocks = false; table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions(); Options options = GetDefaultOptions();
options.create_if_missing = true; options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.statistics = CreateDBStatistics(); options.statistics = CreateDBStatistics();
@ -717,6 +721,7 @@ TEST_F(DBTieredSecondaryCacheTest, IterateTest) {
table_options.cache_index_and_filter_blocks = false; table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions(); Options options = GetDefaultOptions();
options.create_if_missing = true; options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.paranoid_file_checks = false; options.paranoid_file_checks = false;
@ -760,6 +765,54 @@ TEST_F(DBTieredSecondaryCacheTest, IterateTest) {
Destroy(options); Destroy(options);
} }
TEST_F(DBTieredSecondaryCacheTest, VolatileTierTest) {
if (!LZ4_Supported()) {
ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
return;
}
BlockBasedTableOptions table_options;
// We want a block cache of size 5KB, and a compressed secondary cache of
// size 5KB. However, we specify a block cache size of 256KB here in order
// to take into account the cache reservation in the block cache on
// behalf of the compressed cache. The unit of cache reservation is 256KB.
// The effective block cache capacity will be calculated as 256 + 5 = 261KB,
// and 256KB will be reserved for the compressed cache, leaving 5KB for
// the primary block cache. We only have to worry about this here because
// the cache size is so small.
table_options.block_cache = NewCache(256 * 1024, 5 * 1024, 256 * 1024);
table_options.block_size = 4 * 1024;
table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
// Disable paranoid_file_checks so that flush will not read back the newly
// written file
options.paranoid_file_checks = false;
options.lowest_used_cache_tier = CacheTier::kVolatileTier;
DestroyAndReopen(options);
Random rnd(301);
const int N = 256;
for (int i = 0; i < N; i++) {
std::string p_v;
test::CompressibleString(&rnd, 0.5, 1007, &p_v);
ASSERT_OK(Put(Key(i), p_v));
}
ASSERT_OK(Flush());
// Since lowest_used_cache_tier is the volatile tier, nothing should be
// inserted in the secondary cache.
std::string v = Get(Key(0));
ASSERT_EQ(1007, v.size());
ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 0u);
ASSERT_EQ(nvm_sec_cache()->num_misses(), 0u);
Destroy(options);
}
class DBTieredAdmPolicyTest class DBTieredAdmPolicyTest
: public DBTieredSecondaryCacheTest, : public DBTieredSecondaryCacheTest,
public testing::WithParamInterface<TieredAdmissionPolicy> {}; public testing::WithParamInterface<TieredAdmissionPolicy> {};
@ -784,6 +837,7 @@ TEST_P(DBTieredAdmPolicyTest, CompressedOnlyTest) {
table_options.cache_index_and_filter_blocks = false; table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions(); Options options = GetDefaultOptions();
options.create_if_missing = true; options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.table_factory.reset(NewBlockBasedTableFactory(table_options));
size_t comp_cache_usage = compressed_secondary_cache()->TEST_GetUsage(); size_t comp_cache_usage = compressed_secondary_cache()->TEST_GetUsage();
@ -836,6 +890,7 @@ TEST_P(DBTieredAdmPolicyTest, CompressedCacheAdmission) {
table_options.cache_index_and_filter_blocks = false; table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions(); Options options = GetDefaultOptions();
options.create_if_missing = true; options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.table_factory.reset(NewBlockBasedTableFactory(table_options));
size_t comp_cache_usage = compressed_secondary_cache()->TEST_GetUsage(); size_t comp_cache_usage = compressed_secondary_cache()->TEST_GetUsage();
@ -937,6 +992,7 @@ TEST_F(DBTieredSecondaryCacheTest, FSBufferTest) {
table_options.cache_index_and_filter_blocks = false; table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions(); Options options = GetDefaultOptions();
options.create_if_missing = true; options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.statistics = CreateDBStatistics(); options.statistics = CreateDBStatistics();
options.env = wrap_env.get(); options.env = wrap_env.get();

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import print_function
import optparse import optparse
import re import re
@ -109,11 +108,11 @@ def report_coverage():
# Check if we need to display coverage info for interested files. # Check if we need to display coverage info for interested files.
if len(interested_files): if len(interested_files):
per_file_coverage = dict( per_file_coverage = {
(fname, per_file_coverage[fname]) fname: per_file_coverage[fname]
for fname in interested_files for fname in interested_files
if fname in per_file_coverage if fname in per_file_coverage
) }
# If we only interested in several files, it makes no sense to report # If we only interested in several files, it makes no sense to report
# the total_coverage # the total_coverage
total_coverage = None total_coverage = None

View File

@ -45,20 +45,23 @@ void ArenaWrappedDBIter::Init(
const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
uint64_t version_number, ReadCallback* read_callback, uint64_t version_number, ReadCallback* read_callback,
ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) { ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) {
auto mem = arena_.AllocateAligned(sizeof(DBIter));
db_iter_ = new (mem) DBIter(
env, read_options, ioptions, mutable_cf_options, ioptions.user_comparator,
/* iter */ nullptr, version, sequence, true,
max_sequential_skip_in_iteration, read_callback, cfh, expose_blob_index);
sv_number_ = version_number;
read_options_ = read_options; read_options_ = read_options;
allow_refresh_ = allow_refresh;
memtable_range_tombstone_iter_ = nullptr;
if (!CheckFSFeatureSupport(env->GetFileSystem().get(), if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
FSSupportedOps::kAsyncIO)) { FSSupportedOps::kAsyncIO)) {
read_options_.async_io = false; read_options_.async_io = false;
} }
read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only;
auto mem = arena_.AllocateAligned(sizeof(DBIter));
db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options,
ioptions.user_comparator,
/* iter */ nullptr, version, sequence, true,
max_sequential_skip_in_iteration, read_callback,
cfh, expose_blob_index);
sv_number_ = version_number;
allow_refresh_ = allow_refresh;
memtable_range_tombstone_iter_ = nullptr;
} }
Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); } Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); }

View File

@ -83,6 +83,8 @@ class ArenaWrappedDBIter : public Iterator {
Status Refresh() override; Status Refresh() override;
Status Refresh(const Snapshot*) override; Status Refresh(const Snapshot*) override;
bool PrepareValue() override { return db_iter_->PrepareValue(); }
void Init(Env* env, const ReadOptions& read_options, void Init(Env* env, const ReadOptions& read_options,
const ImmutableOptions& ioptions, const ImmutableOptions& ioptions,
const MutableCFOptions& mutable_cf_options, const Version* version, const MutableCFOptions& mutable_cf_options, const Version* version,

View File

@ -13,14 +13,11 @@ namespace ROCKSDB_NAMESPACE {
class AttributeGroupIteratorImpl : public AttributeGroupIterator { class AttributeGroupIteratorImpl : public AttributeGroupIterator {
public: public:
AttributeGroupIteratorImpl( AttributeGroupIteratorImpl(
const Comparator* comparator, const ReadOptions& read_options, const Comparator* comparator,
const std::vector<ColumnFamilyHandle*>& column_families, std::vector<std::pair<ColumnFamilyHandle*, std::unique_ptr<Iterator>>>&&
const std::vector<Iterator*>& child_iterators) cfh_iter_pairs)
: impl_( : impl_(read_options, comparator, std::move(cfh_iter_pairs),
comparator, column_families, child_iterators, [this]() { Reset(); }, ResetFunc(this), PopulateFunc(this)) {}
[this](const autovector<MultiCfIteratorInfo>& items) {
AddToAttributeGroups(items);
}) {}
~AttributeGroupIteratorImpl() override {} ~AttributeGroupIteratorImpl() override {}
// No copy allowed // No copy allowed
@ -45,8 +42,36 @@ class AttributeGroupIteratorImpl : public AttributeGroupIterator {
void Reset() { attribute_groups_.clear(); } void Reset() { attribute_groups_.clear(); }
bool PrepareValue() override { return impl_.PrepareValue(); }
private: private:
MultiCfIteratorImpl impl_; class ResetFunc {
public:
explicit ResetFunc(AttributeGroupIteratorImpl* iter) : iter_(iter) {}
void operator()() const {
assert(iter_);
iter_->Reset();
}
private:
AttributeGroupIteratorImpl* iter_;
};
class PopulateFunc {
public:
explicit PopulateFunc(AttributeGroupIteratorImpl* iter) : iter_(iter) {}
void operator()(const autovector<MultiCfIteratorInfo>& items) const {
assert(iter_);
iter_->AddToAttributeGroups(items);
}
private:
AttributeGroupIteratorImpl* iter_;
};
MultiCfIteratorImpl<ResetFunc, PopulateFunc> impl_;
IteratorAttributeGroups attribute_groups_; IteratorAttributeGroups attribute_groups_;
void AddToAttributeGroups(const autovector<MultiCfIteratorInfo>& items); void AddToAttributeGroups(const autovector<MultiCfIteratorInfo>& items);
}; };

View File

@ -42,6 +42,7 @@ Status BlobFileCache::GetBlobFileReader(
assert(blob_file_reader); assert(blob_file_reader);
assert(blob_file_reader->IsEmpty()); assert(blob_file_reader->IsEmpty());
// NOTE: sharing same Cache with table_cache
const Slice key = GetSliceForKey(&blob_file_number); const Slice key = GetSliceForKey(&blob_file_number);
assert(cache_); assert(cache_);
@ -98,4 +99,13 @@ Status BlobFileCache::GetBlobFileReader(
return Status::OK(); return Status::OK();
} }
void BlobFileCache::Evict(uint64_t blob_file_number) {
// NOTE: sharing same Cache with table_cache
const Slice key = GetSliceForKey(&blob_file_number);
assert(cache_);
cache_.get()->Erase(key);
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -36,6 +36,15 @@ class BlobFileCache {
uint64_t blob_file_number, uint64_t blob_file_number,
CacheHandleGuard<BlobFileReader>* blob_file_reader); CacheHandleGuard<BlobFileReader>* blob_file_reader);
// Called when a blob file is obsolete to ensure it is removed from the cache
// to avoid effectively leaking the open file and assicated memory
void Evict(uint64_t blob_file_number);
// Used to identify cache entries for blob files (not normally useful)
static const Cache::CacheItemHelper* GetHelper() {
return CacheInterface::GetBasicHelper();
}
private: private:
using CacheInterface = using CacheInterface =
BasicTypedCacheInterface<BlobFileReader, CacheEntryRole::kMisc>; BasicTypedCacheInterface<BlobFileReader, CacheEntryRole::kMisc>;

View File

@ -20,23 +20,24 @@
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
BlobSource::BlobSource(const ImmutableOptions* immutable_options, BlobSource::BlobSource(const ImmutableOptions& immutable_options,
const MutableCFOptions& mutable_cf_options,
const std::string& db_id, const std::string& db_id,
const std::string& db_session_id, const std::string& db_session_id,
BlobFileCache* blob_file_cache) BlobFileCache* blob_file_cache)
: db_id_(db_id), : db_id_(db_id),
db_session_id_(db_session_id), db_session_id_(db_session_id),
statistics_(immutable_options->statistics.get()), statistics_(immutable_options.statistics.get()),
blob_file_cache_(blob_file_cache), blob_file_cache_(blob_file_cache),
blob_cache_(immutable_options->blob_cache), blob_cache_(immutable_options.blob_cache),
lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) { lowest_used_cache_tier_(immutable_options.lowest_used_cache_tier) {
auto bbto = auto bbto =
immutable_options->table_factory->GetOptions<BlockBasedTableOptions>(); mutable_cf_options.table_factory->GetOptions<BlockBasedTableOptions>();
if (bbto && if (bbto &&
bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache) bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
.charged == CacheEntryRoleOptions::Decision::kEnabled) { .charged == CacheEntryRoleOptions::Decision::kEnabled) {
blob_cache_ = SharedCacheInterface{std::make_shared<ChargedCache>( blob_cache_ = SharedCacheInterface{std::make_shared<ChargedCache>(
immutable_options->blob_cache, bbto->block_cache)}; immutable_options.blob_cache, bbto->block_cache)};
} }
} }

View File

@ -21,6 +21,7 @@
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
struct ImmutableOptions; struct ImmutableOptions;
struct MutableCFOptions;
class Status; class Status;
class FilePrefetchBuffer; class FilePrefetchBuffer;
class Slice; class Slice;
@ -31,7 +32,10 @@ class Slice;
// storage with minimal cost. // storage with minimal cost.
class BlobSource { class BlobSource {
public: public:
BlobSource(const ImmutableOptions* immutable_options, // NOTE: db_id, db_session_id, and blob_file_cache are saved by reference or
// pointer.
BlobSource(const ImmutableOptions& immutable_options,
const MutableCFOptions& mutable_cf_options,
const std::string& db_id, const std::string& db_session_id, const std::string& db_id, const std::string& db_session_id,
BlobFileCache* blob_file_cache); BlobFileCache* blob_file_cache);

View File

@ -148,6 +148,7 @@ TEST_F(BlobSourceTest, GetBlobsFromCache) {
DestroyAndReopen(options_); DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_); ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr uint32_t column_family_id = 1; constexpr uint32_t column_family_id = 1;
constexpr bool has_ttl = false; constexpr bool has_ttl = false;
@ -193,8 +194,8 @@ TEST_F(BlobSourceTest, GetBlobsFromCache) {
backing_cache.get(), &immutable_options, &file_options, backing_cache.get(), &immutable_options, &file_options,
column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_, BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
blob_file_cache.get()); db_session_id_, blob_file_cache.get());
ReadOptions read_options; ReadOptions read_options;
read_options.verify_checksums = true; read_options.verify_checksums = true;
@ -464,6 +465,7 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) {
DestroyAndReopen(options_); DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_); ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr uint32_t column_family_id = 1; constexpr uint32_t column_family_id = 1;
constexpr bool has_ttl = false; constexpr bool has_ttl = false;
@ -498,8 +500,8 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) {
backing_cache.get(), &immutable_options, &file_options, backing_cache.get(), &immutable_options, &file_options,
column_family_id, nullptr /*HistogramImpl*/, nullptr /*IOTracer*/); column_family_id, nullptr /*HistogramImpl*/, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_, BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
blob_file_cache.get()); db_session_id_, blob_file_cache.get());
ReadOptions read_options; ReadOptions read_options;
read_options.verify_checksums = true; read_options.verify_checksums = true;
@ -589,6 +591,7 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) {
DestroyAndReopen(options_); DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_); ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr uint32_t column_family_id = 1; constexpr uint32_t column_family_id = 1;
constexpr bool has_ttl = false; constexpr bool has_ttl = false;
@ -644,8 +647,8 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) {
backing_cache.get(), &immutable_options, &file_options, backing_cache.get(), &immutable_options, &file_options,
column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_, BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
blob_file_cache.get()); db_session_id_, blob_file_cache.get());
ReadOptions read_options; ReadOptions read_options;
read_options.verify_checksums = true; read_options.verify_checksums = true;
@ -782,6 +785,7 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromCache) {
DestroyAndReopen(options_); DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_); ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr uint32_t column_family_id = 1; constexpr uint32_t column_family_id = 1;
constexpr bool has_ttl = false; constexpr bool has_ttl = false;
@ -827,8 +831,8 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromCache) {
backing_cache.get(), &immutable_options, &file_options, backing_cache.get(), &immutable_options, &file_options,
column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_, BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
blob_file_cache.get()); db_session_id_, blob_file_cache.get());
ReadOptions read_options; ReadOptions read_options;
read_options.verify_checksums = true; read_options.verify_checksums = true;
@ -1105,6 +1109,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
DestroyAndReopen(options_); DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_); ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr uint32_t column_family_id = 1; constexpr uint32_t column_family_id = 1;
constexpr bool has_ttl = false; constexpr bool has_ttl = false;
@ -1137,8 +1142,8 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
backing_cache.get(), &immutable_options, &file_options, column_family_id, backing_cache.get(), &immutable_options, &file_options, column_family_id,
blob_file_read_hist, nullptr /*IOTracer*/)); blob_file_read_hist, nullptr /*IOTracer*/));
BlobSource blob_source(&immutable_options, db_id_, db_session_id_, BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
blob_file_cache.get()); db_session_id_, blob_file_cache.get());
CacheHandleGuard<BlobFileReader> file_reader; CacheHandleGuard<BlobFileReader> file_reader;
ReadOptions read_options; ReadOptions read_options;
@ -1405,6 +1410,7 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
DestroyAndReopen(options_); DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_); ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr ExpirationRange expiration_range; constexpr ExpirationRange expiration_range;
@ -1426,8 +1432,8 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
backing_cache.get(), &immutable_options, &file_options, backing_cache.get(), &immutable_options, &file_options,
kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/); kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_, BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
blob_file_cache.get()); db_session_id_, blob_file_cache.get());
ConcurrentCacheReservationManager* cache_res_mgr = ConcurrentCacheReservationManager* cache_res_mgr =
static_cast<ChargedCache*>(blob_source.GetBlobCache()) static_cast<ChargedCache*>(blob_source.GetBlobCache())
@ -1519,6 +1525,8 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) {
DestroyAndReopen(options_); DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_); ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr size_t blob_size = 24 << 10; // 24KB constexpr size_t blob_size = 24 << 10; // 24KB
for (size_t i = 0; i < kNumBlobs; ++i) { for (size_t i = 0; i < kNumBlobs; ++i) {
blob_file_size_ -= blobs_[i].size(); // old blob size blob_file_size_ -= blobs_[i].size(); // old blob size
@ -1546,8 +1554,8 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) {
backing_cache.get(), &immutable_options, &file_options, backing_cache.get(), &immutable_options, &file_options,
kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/); kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_, BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
blob_file_cache.get()); db_session_id_, blob_file_cache.get());
ConcurrentCacheReservationManager* cache_res_mgr = ConcurrentCacheReservationManager* cache_res_mgr =
static_cast<ChargedCache*>(blob_source.GetBlobCache()) static_cast<ChargedCache*>(blob_source.GetBlobCache())

View File

@ -374,6 +374,115 @@ TEST_F(DBBlobBasicTest, IterateBlobsFromCachePinning) {
} }
} }
TEST_F(DBBlobBasicTest, IterateBlobsAllowUnpreparedValue) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
Reopen(options);
constexpr size_t num_blobs = 5;
std::vector<std::string> keys;
std::vector<std::string> blobs;
for (size_t i = 0; i < num_blobs; ++i) {
keys.emplace_back("key" + std::to_string(i));
blobs.emplace_back("blob" + std::to_string(i));
ASSERT_OK(Put(keys[i], blobs[i]));
}
ASSERT_OK(Flush());
ReadOptions read_options;
read_options.allow_unprepared_value = true;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
{
size_t i = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ASSERT_EQ(iter->key(), keys[i]);
ASSERT_TRUE(iter->value().empty());
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->PrepareValue());
ASSERT_EQ(iter->key(), keys[i]);
ASSERT_EQ(iter->value(), blobs[i]);
ASSERT_OK(iter->status());
++i;
}
ASSERT_OK(iter->status());
ASSERT_EQ(i, num_blobs);
}
{
size_t i = 0;
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]);
ASSERT_TRUE(iter->value().empty());
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->PrepareValue());
ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]);
ASSERT_EQ(iter->value(), blobs[num_blobs - 1 - i]);
ASSERT_OK(iter->status());
++i;
}
ASSERT_OK(iter->status());
ASSERT_EQ(i, num_blobs);
}
{
size_t i = 1;
for (iter->Seek(keys[i]); iter->Valid(); iter->Next()) {
ASSERT_EQ(iter->key(), keys[i]);
ASSERT_TRUE(iter->value().empty());
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->PrepareValue());
ASSERT_EQ(iter->key(), keys[i]);
ASSERT_EQ(iter->value(), blobs[i]);
ASSERT_OK(iter->status());
++i;
}
ASSERT_OK(iter->status());
ASSERT_EQ(i, num_blobs);
}
{
size_t i = 1;
for (iter->SeekForPrev(keys[num_blobs - 1 - i]); iter->Valid();
iter->Prev()) {
ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]);
ASSERT_TRUE(iter->value().empty());
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->PrepareValue());
ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]);
ASSERT_EQ(iter->value(), blobs[num_blobs - 1 - i]);
ASSERT_OK(iter->status());
++i;
}
ASSERT_OK(iter->status());
ASSERT_EQ(i, num_blobs);
}
}
TEST_F(DBBlobBasicTest, MultiGetBlobs) { TEST_F(DBBlobBasicTest, MultiGetBlobs) {
constexpr size_t min_blob_size = 6; constexpr size_t min_blob_size = 6;
@ -1655,6 +1764,46 @@ TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) {
SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->ClearAllCallBacks();
} }
TEST_P(DBBlobBasicIOErrorTest, IterateBlobsAllowUnpreparedValue_IOError) {
Options options;
options.env = fault_injection_env_.get();
options.enable_blob_files = true;
Reopen(options);
constexpr char key[] = "key";
constexpr char blob_value[] = "blob_value";
ASSERT_OK(Put(key, blob_value));
ASSERT_OK(Flush());
SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
fault_injection_env_->SetFilesystemActive(false,
Status::IOError(sync_point_));
});
SyncPoint::GetInstance()->EnableProcessing();
ReadOptions read_options;
read_options.allow_unprepared_value = true;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(iter->key(), key);
ASSERT_TRUE(iter->value().empty());
ASSERT_OK(iter->status());
ASSERT_FALSE(iter->PrepareValue());
ASSERT_FALSE(iter->Valid());
ASSERT_TRUE(iter->status().IsIOError());
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) { TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) {
Options options = GetDefaultOptions(); Options options = GetDefaultOptions();

View File

@ -53,7 +53,7 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
assert((tboptions.column_family_id == assert((tboptions.column_family_id ==
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
tboptions.column_family_name.empty()); tboptions.column_family_name.empty());
return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file); return tboptions.moptions.table_factory->NewTableBuilder(tboptions, file);
} }
Status BuildTable( Status BuildTable(
@ -206,10 +206,6 @@ Status BuildTable(
/*compaction=*/nullptr, compaction_filter.get(), /*compaction=*/nullptr, compaction_filter.get(),
/*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low); /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low);
const size_t ts_sz = ucmp->timestamp_size();
const bool logical_strip_timestamp =
ts_sz > 0 && !ioptions.persist_user_defined_timestamps;
SequenceNumber smallest_preferred_seqno = kMaxSequenceNumber; SequenceNumber smallest_preferred_seqno = kMaxSequenceNumber;
std::string key_after_flush_buf; std::string key_after_flush_buf;
std::string value_buf; std::string value_buf;
@ -222,16 +218,6 @@ Status BuildTable(
Slice key_after_flush = key_after_flush_buf; Slice key_after_flush = key_after_flush_buf;
Slice value_after_flush = value; Slice value_after_flush = value;
// If user defined timestamps will be stripped from user key after flush,
// the in memory version of the key act logically the same as one with a
// minimum timestamp. We update the timestamp here so file boundary and
// output validator, block builder all see the effect of the stripping.
if (logical_strip_timestamp) {
key_after_flush_buf.clear();
ReplaceInternalKeyWithMinTimestamp(&key_after_flush_buf, key, ts_sz);
key_after_flush = key_after_flush_buf;
}
if (ikey.type == kTypeValuePreferredSeqno) { if (ikey.type == kTypeValuePreferredSeqno) {
auto [unpacked_value, unix_write_time] = auto [unpacked_value, unix_write_time] =
ParsePackedValueWithWriteTime(value); ParsePackedValueWithWriteTime(value);
@ -291,11 +277,7 @@ Status BuildTable(
Slice last_tombstone_start_user_key{}; Slice last_tombstone_start_user_key{};
for (range_del_it->SeekToFirst(); range_del_it->Valid(); for (range_del_it->SeekToFirst(); range_del_it->Valid();
range_del_it->Next()) { range_del_it->Next()) {
// When user timestamp should not be persisted, we logically strip a auto tombstone = range_del_it->Tombstone();
// range tombstone's start and end key's timestamp (replace it with min
// timestamp) before passing them along to table builder and to update
// file boundaries.
auto tombstone = range_del_it->Tombstone(logical_strip_timestamp);
std::pair<InternalKey, Slice> kv = tombstone.Serialize(); std::pair<InternalKey, Slice> kv = tombstone.Serialize();
builder->Add(kv.first.Encode(), kv.second); builder->Add(kv.first.Encode(), kv.second);
InternalKey tombstone_end = tombstone.SerializeEndKey(); InternalKey tombstone_end = tombstone.SerializeEndKey();
@ -438,8 +420,7 @@ Status BuildTable(
// the goal is to cache it here for further user reads. // the goal is to cache it here for further user reads.
std::unique_ptr<InternalIterator> it(table_cache->NewIterator( std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
tboptions.read_options, file_options, tboptions.internal_comparator, tboptions.read_options, file_options, tboptions.internal_comparator,
*meta, nullptr /* range_del_agg */, *meta, nullptr /* range_del_agg */, mutable_cf_options, nullptr,
mutable_cf_options.prefix_extractor, nullptr,
(internal_stats == nullptr) ? nullptr (internal_stats == nullptr) ? nullptr
: internal_stats->GetFileReadHist(0), : internal_stats->GetFileReadHist(0),
TableReaderCaller::kFlush, /*arena=*/nullptr, TableReaderCaller::kFlush, /*arena=*/nullptr,
@ -447,8 +428,7 @@ Status BuildTable(
MaxFileSizeForL0MetaPin(mutable_cf_options), MaxFileSizeForL0MetaPin(mutable_cf_options),
/*smallest_compaction_key=*/nullptr, /*smallest_compaction_key=*/nullptr,
/*largest_compaction_key*/ nullptr, /*largest_compaction_key*/ nullptr,
/*allow_unprepared_value*/ false, /*allow_unprepared_value*/ false));
mutable_cf_options.block_protection_bytes_per_key));
s = it->status(); s = it->status();
if (s.ok() && paranoid_file_checks) { if (s.ok() && paranoid_file_checks) {
OutputValidator file_validator(tboptions.internal_comparator, OutputValidator file_validator(tboptions.internal_comparator,
@ -480,9 +460,18 @@ Status BuildTable(
Status prepare = Status prepare =
WritableFileWriter::PrepareIOOptions(tboptions.write_options, opts); WritableFileWriter::PrepareIOOptions(tboptions.write_options, opts);
if (prepare.ok()) { if (prepare.ok()) {
// FIXME: track file for "slow" deletion, e.g. into the
// VersionSet::obsolete_files_ pipeline
Status ignored = fs->DeleteFile(fname, opts, dbg); Status ignored = fs->DeleteFile(fname, opts, dbg);
ignored.PermitUncheckedError(); ignored.PermitUncheckedError();
} }
// Ensure we don't leak table cache entries when throwing away output
// files. (The usual logic in PurgeObsoleteFiles is not applicable because
// this function deletes the obsolete file itself, while they should
// probably go into the VersionSet::obsolete_files_ pipeline.)
TableCache::ReleaseObsolete(table_cache->get_cache().get(),
meta->fd.GetNumber(), nullptr /*handle*/,
mutable_cf_options.uncache_aggressiveness);
} }
assert(blob_file_additions || blob_file_paths.empty()); assert(blob_file_additions || blob_file_paths.empty());

View File

@ -4075,6 +4075,15 @@ void rocksdb_options_set_write_dbid_to_manifest(
opt->rep.write_dbid_to_manifest = write_dbid_to_manifest; opt->rep.write_dbid_to_manifest = write_dbid_to_manifest;
} }
unsigned char rocksdb_options_get_write_identity_file(rocksdb_options_t* opt) {
return opt->rep.write_identity_file;
}
void rocksdb_options_set_write_identity_file(
rocksdb_options_t* opt, unsigned char write_identity_file) {
opt->rep.write_identity_file = write_identity_file;
}
unsigned char rocksdb_options_get_track_and_verify_wals_in_manifest( unsigned char rocksdb_options_get_track_and_verify_wals_in_manifest(
rocksdb_options_t* opt) { rocksdb_options_t* opt) {
return opt->rep.track_and_verify_wals_in_manifest; return opt->rep.track_and_verify_wals_in_manifest;

View File

@ -772,6 +772,8 @@ int main(int argc, char** argv) {
rocksdb_options_set_write_buffer_size(options, 100000); rocksdb_options_set_write_buffer_size(options, 100000);
rocksdb_options_set_paranoid_checks(options, 1); rocksdb_options_set_paranoid_checks(options, 1);
rocksdb_options_set_max_open_files(options, 10); rocksdb_options_set_max_open_files(options, 10);
/* Compatibility with how test was written */
rocksdb_options_set_write_dbid_to_manifest(options, 0);
table_options = rocksdb_block_based_options_create(); table_options = rocksdb_block_based_options_create();
rocksdb_block_based_options_set_block_cache(table_options, cache); rocksdb_block_based_options_set_block_cache(table_options, cache);
@ -962,15 +964,24 @@ int main(int argc, char** argv) {
rocksdb_options_t* options_dbid_in_manifest = rocksdb_options_create(); rocksdb_options_t* options_dbid_in_manifest = rocksdb_options_create();
rocksdb_options_set_create_if_missing(options_dbid_in_manifest, 1); rocksdb_options_set_create_if_missing(options_dbid_in_manifest, 1);
rocksdb_options_set_write_dbid_to_manifest(options_dbid_in_manifest, false);
unsigned char write_to_manifest = unsigned char write_to_manifest =
rocksdb_options_get_write_dbid_to_manifest(options_dbid_in_manifest); rocksdb_options_get_write_dbid_to_manifest(options_dbid_in_manifest);
CheckCondition(!write_to_manifest); CheckCondition(!write_to_manifest);
rocksdb_options_set_write_dbid_to_manifest(options_dbid_in_manifest, true); rocksdb_options_set_write_dbid_to_manifest(options_dbid_in_manifest, true);
CheckCondition(!write_to_manifest);
write_to_manifest = write_to_manifest =
rocksdb_options_get_write_dbid_to_manifest(options_dbid_in_manifest); rocksdb_options_get_write_dbid_to_manifest(options_dbid_in_manifest);
CheckCondition(write_to_manifest); CheckCondition(write_to_manifest);
rocksdb_options_set_write_identity_file(options_dbid_in_manifest, true);
unsigned char write_identity_file =
rocksdb_options_get_write_identity_file(options_dbid_in_manifest);
CheckCondition(write_identity_file);
rocksdb_options_set_write_identity_file(options_dbid_in_manifest, false);
write_identity_file =
rocksdb_options_get_write_identity_file(options_dbid_in_manifest);
CheckCondition(!write_identity_file);
db = rocksdb_open(options_dbid_in_manifest, dbbackupname, &err); db = rocksdb_open(options_dbid_in_manifest, dbbackupname, &err);
CheckNoError(err); CheckNoError(err);

View File

@ -12,14 +12,12 @@ namespace ROCKSDB_NAMESPACE {
// EXPERIMENTAL // EXPERIMENTAL
class CoalescingIterator : public Iterator { class CoalescingIterator : public Iterator {
public: public:
CoalescingIterator(const Comparator* comparator, CoalescingIterator(
const std::vector<ColumnFamilyHandle*>& column_families, const ReadOptions& read_options, const Comparator* comparator,
const std::vector<Iterator*>& child_iterators) std::vector<std::pair<ColumnFamilyHandle*, std::unique_ptr<Iterator>>>&&
: impl_( cfh_iter_pairs)
comparator, column_families, child_iterators, [this]() { Reset(); }, : impl_(read_options, comparator, std::move(cfh_iter_pairs),
[this](const autovector<MultiCfIteratorInfo>& items) { ResetFunc(this), PopulateFunc(this)) {}
Coalesce(items);
}) {}
~CoalescingIterator() override {} ~CoalescingIterator() override {}
// No copy allowed // No copy allowed
@ -50,8 +48,36 @@ class CoalescingIterator : public Iterator {
wide_columns_.clear(); wide_columns_.clear();
} }
bool PrepareValue() override { return impl_.PrepareValue(); }
private: private:
MultiCfIteratorImpl impl_; class ResetFunc {
public:
explicit ResetFunc(CoalescingIterator* iter) : iter_(iter) {}
void operator()() const {
assert(iter_);
iter_->Reset();
}
private:
CoalescingIterator* iter_;
};
class PopulateFunc {
public:
explicit PopulateFunc(CoalescingIterator* iter) : iter_(iter) {}
void operator()(const autovector<MultiCfIteratorInfo>& items) const {
assert(iter_);
iter_->Coalesce(items);
}
private:
CoalescingIterator* iter_;
};
MultiCfIteratorImpl<ResetFunc, PopulateFunc> impl_;
Slice value_; Slice value_;
WideColumns wide_columns_; WideColumns wide_columns_;

View File

@ -466,7 +466,7 @@ void SuperVersion::Cleanup() {
// decrement reference to the immutable MemtableList // decrement reference to the immutable MemtableList
// this SV object was pointing to. // this SV object was pointing to.
imm->Unref(&to_delete); imm->Unref(&to_delete);
MemTable* m = mem->Unref(); ReadOnlyMemTable* m = mem->Unref();
if (m != nullptr) { if (m != nullptr) {
auto* memory_usage = current->cfd()->imm()->current_memory_usage(); auto* memory_usage = current->cfd()->imm()->current_memory_usage();
assert(*memory_usage >= m->ApproximateMemoryUsage()); assert(*memory_usage >= m->ApproximateMemoryUsage());
@ -595,8 +595,8 @@ ColumnFamilyData::ColumnFamilyData(
blob_file_cache_.reset( blob_file_cache_.reset(
new BlobFileCache(_table_cache, ioptions(), soptions(), id_, new BlobFileCache(_table_cache, ioptions(), soptions(), id_,
internal_stats_->GetBlobFileReadHist(), io_tracer)); internal_stats_->GetBlobFileReadHist(), io_tracer));
blob_source_.reset(new BlobSource(ioptions(), db_id, db_session_id, blob_source_.reset(new BlobSource(ioptions_, mutable_cf_options_, db_id,
blob_file_cache_.get())); db_session_id, blob_file_cache_.get()));
if (ioptions_.compaction_style == kCompactionStyleLevel) { if (ioptions_.compaction_style == kCompactionStyleLevel) {
compaction_picker_.reset( compaction_picker_.reset(
@ -693,9 +693,9 @@ ColumnFamilyData::~ColumnFamilyData() {
if (mem_ != nullptr) { if (mem_ != nullptr) {
delete mem_->Unref(); delete mem_->Unref();
} }
autovector<MemTable*> to_delete; autovector<ReadOnlyMemTable*> to_delete;
imm_.current()->Unref(&to_delete); imm_.current()->Unref(&to_delete);
for (MemTable* m : to_delete) { for (auto* m : to_delete) {
delete m; delete m;
} }
@ -901,7 +901,11 @@ uint64_t GetPendingCompactionBytesForCompactionSpeedup(
return slowdown_threshold; return slowdown_threshold;
} }
uint64_t size_threshold = bottommost_files_size / kBottommostSizeDivisor; // Prevent a small CF from triggering parallel compactions for other CFs.
// Require compaction debt to be more than a full L0 to Lbase compaction.
const uint64_t kMinDebtSize = 2 * mutable_cf_options.max_bytes_for_level_base;
uint64_t size_threshold =
std::max(bottommost_files_size / kBottommostSizeDivisor, kMinDebtSize);
return std::min(size_threshold, slowdown_threshold); return std::min(size_threshold, slowdown_threshold);
} }
@ -1172,10 +1176,12 @@ bool ColumnFamilyData::NeedsCompaction() const {
Compaction* ColumnFamilyData::PickCompaction( Compaction* ColumnFamilyData::PickCompaction(
const MutableCFOptions& mutable_options, const MutableCFOptions& mutable_options,
const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) { const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer) {
auto* result = compaction_picker_->PickCompaction( auto* result = compaction_picker_->PickCompaction(
GetName(), mutable_options, mutable_db_options, current_->storage_info(), GetName(), mutable_options, mutable_db_options, existing_snapshots,
log_buffer); snapshot_checker, current_->storage_info(), log_buffer);
if (result != nullptr) { if (result != nullptr) {
result->FinalizeInputInfo(current_); result->FinalizeInputInfo(current_);
} }
@ -1201,8 +1207,10 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
read_opts.total_order_seek = true; read_opts.total_order_seek = true;
MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
merge_iter_builder.AddIterator(super_version->mem->NewIterator( merge_iter_builder.AddIterator(super_version->mem->NewIterator(
read_opts, /*seqno_to_time_mapping=*/nullptr, &arena)); read_opts, /*seqno_to_time_mapping=*/nullptr, &arena,
/*prefix_extractor=*/nullptr));
super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr, super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr,
/*prefix_extractor=*/nullptr,
&merge_iter_builder, &merge_iter_builder,
false /* add_range_tombstone_iter */); false /* add_range_tombstone_iter */);
ScopedArenaPtr<InternalIterator> memtable_iter(merge_iter_builder.Finish()); ScopedArenaPtr<InternalIterator> memtable_iter(merge_iter_builder.Finish());
@ -1565,28 +1573,6 @@ Status ColumnFamilyData::SetOptions(
return s; return s;
} }
// REQUIRES: DB mutex held
Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
return Env::WLTH_NOT_SET;
}
if (level == 0) {
return Env::WLTH_MEDIUM;
}
int base_level = current_->storage_info()->base_level();
// L1: medium, L2: long, ...
if (level - base_level >= 2) {
return Env::WLTH_EXTREME;
} else if (level < base_level) {
// There is no restriction which prevents level passed in to be smaller
// than base_level.
return Env::WLTH_MEDIUM;
}
return static_cast<Env::WriteLifeTimeHint>(
level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
}
Status ColumnFamilyData::AddDirectories( Status ColumnFamilyData::AddDirectories(
std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs) { std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs) {
Status s; Status s;
@ -1652,6 +1638,9 @@ bool ColumnFamilyData::ShouldPostponeFlushToRetainUDT(
} }
for (const Slice& table_newest_udt : for (const Slice& table_newest_udt :
imm()->GetTablesNewestUDT(max_memtable_id)) { imm()->GetTablesNewestUDT(max_memtable_id)) {
if (table_newest_udt.empty()) {
continue;
}
assert(table_newest_udt.size() == full_history_ts_low.size()); assert(table_newest_udt.size() == full_history_ts_low.size());
// Checking the newest UDT contained in MemTable with ascending ID up to // Checking the newest UDT contained in MemTable with ascending ID up to
// `max_memtable_id`. Return immediately on finding the first MemTable that // `max_memtable_id`. Return immediately on finding the first MemTable that

View File

@ -16,6 +16,7 @@
#include "cache/cache_reservation_manager.h" #include "cache/cache_reservation_manager.h"
#include "db/memtable_list.h" #include "db/memtable_list.h"
#include "db/snapshot_checker.h"
#include "db/table_cache.h" #include "db/table_cache.h"
#include "db/table_properties_collector.h" #include "db/table_properties_collector.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
@ -206,7 +207,7 @@ struct SuperVersion {
// Accessing members of this class is not thread-safe and requires external // Accessing members of this class is not thread-safe and requires external
// synchronization (ie db mutex held or on write thread). // synchronization (ie db mutex held or on write thread).
ColumnFamilyData* cfd; ColumnFamilyData* cfd;
MemTable* mem; ReadOnlyMemTable* mem;
MemTableListVersion* imm; MemTableListVersion* imm;
Version* current; Version* current;
MutableCFOptions mutable_cf_options; MutableCFOptions mutable_cf_options;
@ -268,7 +269,7 @@ struct SuperVersion {
// We need to_delete because during Cleanup(), imm->Unref() returns // We need to_delete because during Cleanup(), imm->Unref() returns
// all memtables that we need to free through this vector. We then // all memtables that we need to free through this vector. We then
// delete all those memtables outside of mutex, during destruction // delete all those memtables outside of mutex, during destruction
autovector<MemTable*> to_delete; autovector<ReadOnlyMemTable*> to_delete;
}; };
Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options); Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
@ -385,9 +386,9 @@ class ColumnFamilyData {
uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held
uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held
uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held
// REQUIRE: DB mutex held
void SetMemtable(MemTable* new_mem) { void SetMemtable(MemTable* new_mem) {
uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; new_mem->SetID(++last_memtable_id_);
new_mem->SetID(memtable_id);
mem_ = new_mem; mem_ = new_mem;
} }
@ -401,15 +402,18 @@ class ColumnFamilyData {
SequenceNumber earliest_seq); SequenceNumber earliest_seq);
TableCache* table_cache() const { return table_cache_.get(); } TableCache* table_cache() const { return table_cache_.get(); }
BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); }
BlobSource* blob_source() const { return blob_source_.get(); } BlobSource* blob_source() const { return blob_source_.get(); }
// See documentation in compaction_picker.h // See documentation in compaction_picker.h
// REQUIRES: DB mutex held // REQUIRES: DB mutex held
bool NeedsCompaction() const; bool NeedsCompaction() const;
// REQUIRES: DB mutex held // REQUIRES: DB mutex held
Compaction* PickCompaction(const MutableCFOptions& mutable_options, Compaction* PickCompaction(
const MutableDBOptions& mutable_db_options, const MutableCFOptions& mutable_options,
LogBuffer* log_buffer); const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer);
// Check if the passed range overlap with any running compactions. // Check if the passed range overlap with any running compactions.
// REQUIRES: DB mutex held // REQUIRES: DB mutex held
@ -511,8 +515,6 @@ class ColumnFamilyData {
return initial_cf_options_; return initial_cf_options_;
} }
Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
// created_dirs remembers directory created, so that we don't need to call // created_dirs remembers directory created, so that we don't need to call
// the same data creation operation again. // the same data creation operation again.
Status AddDirectories( Status AddDirectories(
@ -671,7 +673,7 @@ class ColumnFamilyData {
bool allow_2pc_; bool allow_2pc_;
// Memtable id to track flush. // Memtable id to track flush.
std::atomic<uint64_t> last_memtable_id_; uint64_t last_memtable_id_;
// Directories corresponding to cf_paths. // Directories corresponding to cf_paths.
std::vector<std::shared_ptr<FSDirectory>> data_dirs_; std::vector<std::shared_ptr<FSDirectory>> data_dirs_;

View File

@ -3012,19 +3012,25 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForCompactionDebt) {
ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_OK(db_->Flush(FlushOptions()));
{ {
// 1MB debt is way bigger than bottommost data so definitely triggers
// speedup.
VersionStorageInfo* vstorage = cfd->current()->storage_info(); VersionStorageInfo* vstorage = cfd->current()->storage_info();
vstorage->TEST_set_estimated_compaction_needed_bytes(1048576 /* 1MB */,
dbmu);
RecalculateWriteStallConditions(cfd, mutable_cf_options);
ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
// Eight bytes is way smaller than bottommost data so definitely does not // Eight bytes is way smaller than bottommost data so definitely does not
// trigger speedup. // trigger speedup.
vstorage->TEST_set_estimated_compaction_needed_bytes(8, dbmu); vstorage->TEST_set_estimated_compaction_needed_bytes(8, dbmu);
RecalculateWriteStallConditions(cfd, mutable_cf_options); RecalculateWriteStallConditions(cfd, mutable_cf_options);
ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
// 1MB is much larger than bottommost level size. However, since it's too
// small in terms of absolute size, it does not trigger parallel compaction
// in this case (see GetPendingCompactionBytesForCompactionSpeedup()).
vstorage->TEST_set_estimated_compaction_needed_bytes(1048576 /* 1MB */,
dbmu);
RecalculateWriteStallConditions(cfd, mutable_cf_options);
ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
vstorage->TEST_set_estimated_compaction_needed_bytes(
2 * mutable_cf_options.max_bytes_for_level_base, dbmu);
RecalculateWriteStallConditions(cfd, mutable_cf_options);
ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
} }
} }
@ -3067,12 +3073,20 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
WaitForCompaction(); WaitForCompaction();
AssertFilesPerLevel("0,1", 0 /* cf */); AssertFilesPerLevel("0,1", 0 /* cf */);
// We should calculate the limit by obtaining the number of env background
// threads, because the current test case will share the same env
// with another case that may have already increased the number of
// background threads which is larger than kParallelismLimit
const auto limit = env_->GetBackgroundThreads(Env::Priority::LOW);
// Block the compaction thread pool so marked files accumulate in L0. // Block the compaction thread pool so marked files accumulate in L0.
test::SleepingBackgroundTask sleeping_tasks[kParallelismLimit]; std::vector<std::shared_ptr<test::SleepingBackgroundTask>> sleeping_tasks;
for (int i = 0; i < kParallelismLimit; i++) { for (int i = 0; i < limit; i++) {
sleeping_tasks.emplace_back(
std::make_shared<test::SleepingBackgroundTask>());
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
&sleeping_tasks[i], Env::Priority::LOW); sleeping_tasks[i].get(), Env::Priority::LOW);
sleeping_tasks[i].WaitUntilSleeping(); sleeping_tasks[i]->WaitUntilSleeping();
} }
// Zero marked upper-level files. No speedup. // Zero marked upper-level files. No speedup.
@ -3091,9 +3105,9 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
ASSERT_EQ(kParallelismLimit, dbfull()->TEST_BGCompactionsAllowed()); ASSERT_EQ(kParallelismLimit, dbfull()->TEST_BGCompactionsAllowed());
AssertFilesPerLevel("2,1", 0 /* cf */); AssertFilesPerLevel("2,1", 0 /* cf */);
for (int i = 0; i < kParallelismLimit; i++) { for (int i = 0; i < limit; i++) {
sleeping_tasks[i].WakeUp(); sleeping_tasks[i]->WakeUp();
sleeping_tasks[i].WaitUntilDone(); sleeping_tasks[i]->WaitUntilDone();
} }
} }
@ -3862,6 +3876,91 @@ TEST_F(ManualFlushSkipRetainUDTTest, ManualFlush) {
Close(); Close();
} }
TEST_F(ManualFlushSkipRetainUDTTest, FlushRemovesStaleEntries) {
column_family_options_.max_write_buffer_number = 4;
Open();
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(0)));
ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
ColumnFamilyData* cfd =
static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
for (int version = 0; version < 100; version++) {
if (version == 50) {
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable(cfd));
}
ASSERT_OK(
Put(0, "foo", EncodeAsUint64(version), "v" + std::to_string(version)));
}
ASSERT_OK(Flush(0));
TablePropertiesCollection tables_properties;
ASSERT_OK(db_->GetPropertiesOfAllTables(&tables_properties));
ASSERT_EQ(1, tables_properties.size());
std::shared_ptr<const TableProperties> table_properties =
tables_properties.begin()->second;
ASSERT_EQ(1, table_properties->num_entries);
ASSERT_EQ(0, table_properties->num_deletions);
ASSERT_EQ(0, table_properties->num_range_deletions);
CheckEffectiveCutoffTime(100);
CheckAutomaticFlushRetainUDT(101);
Close();
}
TEST_F(ManualFlushSkipRetainUDTTest, RangeDeletionFlushRemovesStaleEntries) {
column_family_options_.max_write_buffer_number = 4;
Open();
// TODO(yuzhangyu): a non 0 full history ts low is needed for this garbage
// collection to kick in. This doesn't work well for the very first flush of
// the column family. Not a big issue, but would be nice to improve this.
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(9)));
for (int i = 10; i < 100; i++) {
ASSERT_OK(Put(0, "foo" + std::to_string(i), EncodeAsUint64(i),
"val" + std::to_string(i)));
if (i % 2 == 1) {
ASSERT_OK(db_->DeleteRange(WriteOptions(), "foo" + std::to_string(i - 1),
"foo" + std::to_string(i), EncodeAsUint64(i)));
}
}
ASSERT_OK(Flush(0));
CheckEffectiveCutoffTime(100);
std::string read_ts = EncodeAsUint64(100);
std::string min_ts = EncodeAsUint64(0);
ReadOptions ropts;
Slice read_ts_slice = read_ts;
std::string value;
ropts.timestamp = &read_ts_slice;
{
Iterator* iter = db_->NewIterator(ropts);
iter->SeekToFirst();
int i = 11;
while (iter->Valid()) {
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("foo" + std::to_string(i), iter->key());
ASSERT_EQ("val" + std::to_string(i), iter->value());
ASSERT_EQ(min_ts, iter->timestamp());
iter->Next();
i += 2;
}
ASSERT_OK(iter->status());
delete iter;
}
TablePropertiesCollection tables_properties;
ASSERT_OK(db_->GetPropertiesOfAllTables(&tables_properties));
ASSERT_EQ(1, tables_properties.size());
std::shared_ptr<const TableProperties> table_properties =
tables_properties.begin()->second;
// 45 point data + 45 range deletions. 45 obsolete point data are garbage
// collected.
ASSERT_EQ(90, table_properties->num_entries);
ASSERT_EQ(45, table_properties->num_deletions);
ASSERT_EQ(45, table_properties->num_range_deletions);
Close();
}
TEST_F(ManualFlushSkipRetainUDTTest, ManualCompaction) { TEST_F(ManualFlushSkipRetainUDTTest, ManualCompaction) {
Open(); Open();
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(0))); ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(0)));

View File

@ -283,9 +283,10 @@ Compaction::Compaction(
uint32_t _output_path_id, CompressionType _compression, uint32_t _output_path_id, CompressionType _compression,
CompressionOptions _compression_opts, Temperature _output_temperature, CompressionOptions _compression_opts, Temperature _output_temperature,
uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents, uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
bool _manual_compaction, const std::string& _trim_ts, double _score, std::optional<SequenceNumber> _earliest_snapshot,
bool _deletion_compaction, bool l0_files_might_overlap, const SnapshotChecker* _snapshot_checker, bool _manual_compaction,
CompactionReason _compaction_reason, const std::string& _trim_ts, double _score, bool _deletion_compaction,
bool l0_files_might_overlap, CompactionReason _compaction_reason,
BlobGarbageCollectionPolicy _blob_garbage_collection_policy, BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
double _blob_garbage_collection_age_cutoff) double _blob_garbage_collection_age_cutoff)
: input_vstorage_(vstorage), : input_vstorage_(vstorage),
@ -307,6 +308,8 @@ Compaction::Compaction(
l0_files_might_overlap_(l0_files_might_overlap), l0_files_might_overlap_(l0_files_might_overlap),
inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
grandparents_(std::move(_grandparents)), grandparents_(std::move(_grandparents)),
earliest_snapshot_(_earliest_snapshot),
snapshot_checker_(_snapshot_checker),
score_(_score), score_(_score),
bottommost_level_( bottommost_level_(
// For simplicity, we don't support the concept of "bottommost level" // For simplicity, we don't support the concept of "bottommost level"
@ -342,8 +345,9 @@ Compaction::Compaction(
_compaction_reason == CompactionReason::kExternalSstIngestion || _compaction_reason == CompactionReason::kExternalSstIngestion ||
_compaction_reason == CompactionReason::kRefitLevel _compaction_reason == CompactionReason::kRefitLevel
? Compaction::kInvalidLevel ? Compaction::kInvalidLevel
: EvaluatePenultimateLevel(vstorage, immutable_options_, : EvaluatePenultimateLevel(vstorage, mutable_cf_options_,
start_level_, output_level_)) { immutable_options_, start_level_,
output_level_)) {
MarkFilesBeingCompacted(true); MarkFilesBeingCompacted(true);
if (is_manual_compaction_) { if (is_manual_compaction_) {
compaction_reason_ = CompactionReason::kManualCompaction; compaction_reason_ = CompactionReason::kManualCompaction;
@ -367,9 +371,13 @@ Compaction::Compaction(
// setup input_levels_ // setup input_levels_
{ {
input_levels_.resize(num_input_levels()); input_levels_.resize(num_input_levels());
for (size_t which = 0; which < num_input_levels(); which++) { if (earliest_snapshot_.has_value()) {
DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files, FilterInputsForCompactionIterator();
&arena_); } else {
for (size_t which = 0; which < num_input_levels(); which++) {
DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
&arena_);
}
} }
} }
@ -686,12 +694,11 @@ bool Compaction::KeyRangeNotExistsBeyondOutputLevel(
}; };
// Mark (or clear) each file that is being compacted // Mark (or clear) each file that is being compacted
void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { void Compaction::MarkFilesBeingCompacted(bool being_compacted) const {
for (size_t i = 0; i < num_input_levels(); i++) { for (size_t i = 0; i < num_input_levels(); i++) {
for (size_t j = 0; j < inputs_[i].size(); j++) { for (size_t j = 0; j < inputs_[i].size(); j++) {
assert(mark_as_compacted ? !inputs_[i][j]->being_compacted assert(being_compacted != inputs_[i][j]->being_compacted);
: inputs_[i][j]->being_compacted); inputs_[i][j]->being_compacted = being_compacted;
inputs_[i][j]->being_compacted = mark_as_compacted;
} }
} }
} }
@ -735,7 +742,7 @@ uint64_t Compaction::CalculateTotalInputSize() const {
return size; return size;
} }
void Compaction::ReleaseCompactionFiles(Status status) { void Compaction::ReleaseCompactionFiles(const Status& status) {
MarkFilesBeingCompacted(false); MarkFilesBeingCompacted(false);
cfd_->compaction_picker()->ReleaseCompactionFiles(this, status); cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
} }
@ -746,8 +753,10 @@ void Compaction::ResetNextCompactionIndex() {
} }
namespace { namespace {
int InputSummary(const std::vector<FileMetaData*>& files, char* output, int InputSummary(const std::vector<FileMetaData*>& files,
const std::vector<bool>& files_filtered, char* output,
int len) { int len) {
assert(files_filtered.empty() || (files.size() == files_filtered.size()));
*output = '\0'; *output = '\0';
int write = 0; int write = 0;
for (size_t i = 0; i < files.size(); i++) { for (size_t i = 0; i < files.size(); i++) {
@ -755,8 +764,14 @@ int InputSummary(const std::vector<FileMetaData*>& files, char* output,
int ret; int ret;
char sztxt[16]; char sztxt[16];
AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16); AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", if (files_filtered.empty()) {
files.at(i)->fd.GetNumber(), sztxt); ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
files.at(i)->fd.GetNumber(), sztxt);
} else {
ret = snprintf(output + write, sz, "%" PRIu64 "(%s filtered:%s) ",
files.at(i)->fd.GetNumber(), sztxt,
files_filtered.at(i) ? "true" : "false");
}
if (ret < 0 || ret >= sz) { if (ret < 0 || ret >= sz) {
break; break;
} }
@ -782,8 +797,15 @@ void Compaction::Summary(char* output, int len) {
return; return;
} }
} }
write +=
InputSummary(inputs_[level_iter].files, output + write, len - write); assert(non_start_level_input_files_filtered_.empty() ||
non_start_level_input_files_filtered_.size() == inputs_.size() - 1);
write += InputSummary(
inputs_[level_iter].files,
(level_iter == 0 || non_start_level_input_files_filtered_.empty())
? std::vector<bool>{}
: non_start_level_input_files_filtered_[level_iter - 1],
output + write, len - write);
if (write < 0 || write >= len) { if (write < 0 || write >= len) {
return; return;
} }
@ -866,7 +888,7 @@ bool Compaction::ShouldFormSubcompactions() const {
return false; return false;
} }
if (cfd_->ioptions()->table_factory->Name() == if (mutable_cf_options_.table_factory->Name() ==
TableFactory::kPlainTableName()) { TableFactory::kPlainTableName()) {
return false; return false;
} }
@ -914,6 +936,25 @@ bool Compaction::DoesInputReferenceBlobFiles() const {
return false; return false;
} }
uint64_t Compaction::MaxInputFileNewestKeyTime(const InternalKey* start,
const InternalKey* end) const {
uint64_t newest_key_time = kUnknownNewestKeyTime;
const InternalKeyComparator& icmp =
column_family_data()->internal_comparator();
for (const auto& level_files : inputs_) {
for (const auto& file : level_files.files) {
if (start != nullptr && icmp.Compare(file->largest, *start) < 0) {
continue;
}
if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) {
continue;
}
newest_key_time = std::max(newest_key_time, file->TryGetNewestKeyTime());
}
}
return newest_key_time;
}
uint64_t Compaction::MinInputFileOldestAncesterTime( uint64_t Compaction::MinInputFileOldestAncesterTime(
const InternalKey* start, const InternalKey* end) const { const InternalKey* start, const InternalKey* end) const {
uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max(); uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max();
@ -949,6 +990,7 @@ uint64_t Compaction::MinInputFileEpochNumber() const {
int Compaction::EvaluatePenultimateLevel( int Compaction::EvaluatePenultimateLevel(
const VersionStorageInfo* vstorage, const VersionStorageInfo* vstorage,
const MutableCFOptions& mutable_cf_options,
const ImmutableOptions& immutable_options, const int start_level, const ImmutableOptions& immutable_options, const int start_level,
const int output_level) { const int output_level) {
// TODO: currently per_key_placement feature only support level and universal // TODO: currently per_key_placement feature only support level and universal
@ -980,7 +1022,7 @@ int Compaction::EvaluatePenultimateLevel(
} }
bool supports_per_key_placement = bool supports_per_key_placement =
immutable_options.preclude_last_level_data_seconds > 0; mutable_cf_options.preclude_last_level_data_seconds > 0;
// it could be overridden by unittest // it could be overridden by unittest
TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled", TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
@ -992,4 +1034,69 @@ int Compaction::EvaluatePenultimateLevel(
return penultimate_level; return penultimate_level;
} }
void Compaction::FilterInputsForCompactionIterator() {
assert(earliest_snapshot_.has_value());
// cfd_ is not populated at Compaction construction time, get it from
// VersionStorageInfo instead.
assert(input_vstorage_);
const auto* ucmp = input_vstorage_->user_comparator();
assert(ucmp);
// Simply comparing file boundaries when user-defined timestamp is defined
// is not as safe because we need to also compare timestamp to know for
// sure. Although entries with higher timestamp is also supposed to have
// higher sequence number for the same user key (without timestamp).
assert(ucmp->timestamp_size() == 0);
size_t num_input_levels = inputs_.size();
// TODO(yuzhangyu): filtering of older L0 file by new L0 file is not
// supported yet.
FileMetaData* rangedel_candidate = inputs_[0].level == 0
? inputs_[0].files.back()
: inputs_[0].files.front();
assert(rangedel_candidate);
if (!rangedel_candidate->FileIsStandAloneRangeTombstone() ||
!DataIsDefinitelyInSnapshot(rangedel_candidate->fd.smallest_seqno,
earliest_snapshot_.value(),
snapshot_checker_)) {
for (size_t level = 0; level < num_input_levels; level++) {
DoGenerateLevelFilesBrief(&input_levels_[level], inputs_[level].files,
&arena_);
}
return;
}
Slice rangedel_start_ukey = rangedel_candidate->smallest.user_key();
Slice rangedel_end_ukey = rangedel_candidate->largest.user_key();
SequenceNumber rangedel_seqno = rangedel_candidate->fd.smallest_seqno;
std::vector<std::vector<FileMetaData*>> non_start_level_input_files;
non_start_level_input_files.reserve(num_input_levels - 1);
non_start_level_input_files_filtered_.reserve(num_input_levels - 1);
for (size_t level = 1; level < num_input_levels; level++) {
non_start_level_input_files.emplace_back();
non_start_level_input_files_filtered_.emplace_back();
for (FileMetaData* file : inputs_[level].files) {
non_start_level_input_files_filtered_.back().push_back(false);
// When range data and point data has the same sequence number, point
// data wins. Range deletion end key is exclusive, so check it's bigger
// than file right boundary user key.
if (rangedel_seqno > file->fd.largest_seqno &&
ucmp->CompareWithoutTimestamp(rangedel_start_ukey,
file->smallest.user_key()) <= 0 &&
ucmp->CompareWithoutTimestamp(rangedel_end_ukey,
file->largest.user_key()) > 0) {
non_start_level_input_files_filtered_.back().back() = true;
} else {
non_start_level_input_files.back().push_back(file);
}
}
}
DoGenerateLevelFilesBrief(&input_levels_[0], inputs_[0].files, &arena_);
assert(non_start_level_input_files.size() == num_input_levels - 1);
for (size_t level = 1; level < num_input_levels; level++) {
DoGenerateLevelFilesBrief(&input_levels_[level],
non_start_level_input_files[level - 1], &arena_);
}
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -8,6 +8,8 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once #pragma once
#include "db/snapshot_checker.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "memory/arena.h" #include "memory/arena.h"
#include "options/cf_options.h" #include "options/cf_options.h"
@ -90,6 +92,8 @@ class Compaction {
CompressionOptions compression_opts, CompressionOptions compression_opts,
Temperature output_temperature, uint32_t max_subcompactions, Temperature output_temperature, uint32_t max_subcompactions,
std::vector<FileMetaData*> grandparents, std::vector<FileMetaData*> grandparents,
std::optional<SequenceNumber> earliest_snapshot,
const SnapshotChecker* snapshot_checker,
bool manual_compaction = false, const std::string& trim_ts = "", bool manual_compaction = false, const std::string& trim_ts = "",
double score = -1, bool deletion_compaction = false, double score = -1, bool deletion_compaction = false,
bool l0_files_might_overlap = true, bool l0_files_might_overlap = true,
@ -230,7 +234,7 @@ class Compaction {
// Delete this compaction from the list of running compactions. // Delete this compaction from the list of running compactions.
// //
// Requirement: DB mutex held // Requirement: DB mutex held
void ReleaseCompactionFiles(Status status); void ReleaseCompactionFiles(const Status& status);
// Returns the summary of the compaction in "output" with maximum "len" // Returns the summary of the compaction in "output" with maximum "len"
// in bytes. The caller is responsible for the memory management of // in bytes. The caller is responsible for the memory management of
@ -401,6 +405,12 @@ class Compaction {
return blob_garbage_collection_age_cutoff_; return blob_garbage_collection_age_cutoff_;
} }
// start and end are sub compact range. Null if no boundary.
// This is used to calculate the newest_key_time table property after
// compaction.
uint64_t MaxInputFileNewestKeyTime(const InternalKey* start,
const InternalKey* end) const;
// start and end are sub compact range. Null if no boundary. // start and end are sub compact range. Null if no boundary.
// This is used to filter out some input files' ancester's time range. // This is used to filter out some input files' ancester's time range.
uint64_t MinInputFileOldestAncesterTime(const InternalKey* start, uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
@ -430,18 +440,19 @@ class Compaction {
// penultimate level. The safe key range is populated by // penultimate level. The safe key range is populated by
// `PopulatePenultimateLevelOutputRange()`. // `PopulatePenultimateLevelOutputRange()`.
// Which could potentially disable all penultimate level output. // Which could potentially disable all penultimate level output.
static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage, static int EvaluatePenultimateLevel(
const ImmutableOptions& immutable_options, const VersionStorageInfo* vstorage,
const int start_level, const MutableCFOptions& mutable_cf_options,
const int output_level); const ImmutableOptions& immutable_options, const int start_level,
const int output_level);
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool being_compacted) const;
private: private:
Status InitInputTableProperties(); Status InitInputTableProperties();
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool mark_as_compacted);
// get the smallest and largest key present in files to be compacted // get the smallest and largest key present in files to be compacted
static void GetBoundaryKeys(VersionStorageInfo* vstorage, static void GetBoundaryKeys(VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs, const std::vector<CompactionInputFiles>& inputs,
@ -460,6 +471,13 @@ class Compaction {
// `Compaction::WithinPenultimateLevelOutputRange()`. // `Compaction::WithinPenultimateLevelOutputRange()`.
void PopulatePenultimateLevelOutputRange(); void PopulatePenultimateLevelOutputRange();
// If oldest snapshot is specified at Compaction construction time, we have
// an opportunity to optimize inputs for compaction iterator for this case:
// When a standalone range deletion file on the start level is recognized and
// can be determined to completely shadow some input files on non-start level.
// These files will be filtered out and later not feed to compaction iterator.
void FilterInputsForCompactionIterator();
// Get the atomic file boundaries for all files in the compaction. Necessary // Get the atomic file boundaries for all files in the compaction. Necessary
// in order to avoid the scenario described in // in order to avoid the scenario described in
// https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
@ -510,12 +528,27 @@ class Compaction {
// Compaction input files organized by level. Constant after construction // Compaction input files organized by level. Constant after construction
const std::vector<CompactionInputFiles> inputs_; const std::vector<CompactionInputFiles> inputs_;
// A copy of inputs_, organized more closely in memory // All files from inputs_ that are not filtered and will be fed to compaction
// iterator, organized more closely in memory.
autovector<LevelFilesBrief, 2> input_levels_; autovector<LevelFilesBrief, 2> input_levels_;
// State used to check for number of overlapping grandparent files // State used to check for number of overlapping grandparent files
// (grandparent == "output_level_ + 1") // (grandparent == "output_level_ + 1")
std::vector<FileMetaData*> grandparents_; std::vector<FileMetaData*> grandparents_;
// The earliest snapshot and snapshot checker at compaction picking time.
// These fields are only set for deletion triggered compactions picked in
// universal compaction. And when user-defined timestamp is not enabled.
// It will be used to possibly filter out some non start level input files.
std::optional<SequenceNumber> earliest_snapshot_;
const SnapshotChecker* snapshot_checker_;
// Markers for which non start level input files are filtered out if
// applicable. Only applicable if earliest_snapshot_ is provided and input
// start level has a standalone range deletion file.
std::vector<std::vector<bool>> non_start_level_input_files_filtered_;
// bool standalone_range_tombstones_used_for_filtering_inputs_;
const double score_; // score that was used to pick this compaction. const double score_; // score that was used to pick this compaction.
// Is this compaction creating a file in the bottom most level? // Is this compaction creating a file in the bottom most level?

View File

@ -540,18 +540,12 @@ class CompactionIterator {
inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq, inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq,
SequenceNumber snapshot) { SequenceNumber snapshot) {
return ((seq) <= (snapshot) && return DataIsDefinitelyInSnapshot(seq, snapshot, snapshot_checker_);
(snapshot_checker_ == nullptr ||
LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
SnapshotCheckerResult::kInSnapshot)));
} }
inline bool CompactionIterator::DefinitelyNotInSnapshot( inline bool CompactionIterator::DefinitelyNotInSnapshot(
SequenceNumber seq, SequenceNumber snapshot) { SequenceNumber seq, SequenceNumber snapshot) {
return ((seq) > (snapshot) || return DataIsDefinitelyNotInSnapshot(seq, snapshot, snapshot_checker_);
(snapshot_checker_ != nullptr &&
UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
SnapshotCheckerResult::kNotInSnapshot)));
} }
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -251,12 +251,13 @@ void CompactionJob::Prepare() {
// Generate file_levels_ for compaction before making Iterator // Generate file_levels_ for compaction before making Iterator
auto* c = compact_->compaction; auto* c = compact_->compaction;
ColumnFamilyData* cfd = c->column_family_data(); [[maybe_unused]] ColumnFamilyData* cfd = c->column_family_data();
assert(cfd != nullptr); assert(cfd != nullptr);
assert(cfd->current()->storage_info()->NumLevelFiles( const VersionStorageInfo* storage_info = c->input_version()->storage_info();
compact_->compaction->level()) > 0); assert(storage_info);
assert(storage_info->NumLevelFiles(compact_->compaction->level()) > 0);
write_hint_ = cfd->CalculateSSTWriteHint(c->output_level()); write_hint_ = storage_info->CalculateSSTWriteHint(c->output_level());
bottommost_level_ = c->bottommost_level(); bottommost_level_ = c->bottommost_level();
if (c->ShouldFormSubcompactions()) { if (c->ShouldFormSubcompactions()) {
@ -287,8 +288,8 @@ void CompactionJob::Prepare() {
// to encode seqno->time to the output files. // to encode seqno->time to the output files.
uint64_t preserve_time_duration = uint64_t preserve_time_duration =
std::max(c->immutable_options()->preserve_internal_time_seconds, std::max(c->mutable_cf_options()->preserve_internal_time_seconds,
c->immutable_options()->preclude_last_level_data_seconds); c->mutable_cf_options()->preclude_last_level_data_seconds);
if (preserve_time_duration > 0) { if (preserve_time_duration > 0) {
const ReadOptions read_options(Env::IOActivity::kCompaction); const ReadOptions read_options(Env::IOActivity::kCompaction);
@ -297,8 +298,8 @@ void CompactionJob::Prepare() {
for (const auto& each_level : *c->inputs()) { for (const auto& each_level : *c->inputs()) {
for (const auto& fmd : each_level.files) { for (const auto& fmd : each_level.files) {
std::shared_ptr<const TableProperties> tp; std::shared_ptr<const TableProperties> tp;
Status s = Status s = c->input_version()->GetTableProperties(read_options, &tp,
cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr); fmd, nullptr);
if (s.ok()) { if (s.ok()) {
s = seqno_to_time_mapping_.DecodeFrom(tp->seqno_to_time_mapping); s = seqno_to_time_mapping_.DecodeFrom(tp->seqno_to_time_mapping);
} }
@ -325,8 +326,8 @@ void CompactionJob::Prepare() {
seqno_to_time_mapping_.Enforce(_current_time); seqno_to_time_mapping_.Enforce(_current_time);
seqno_to_time_mapping_.GetCurrentTieringCutoffSeqnos( seqno_to_time_mapping_.GetCurrentTieringCutoffSeqnos(
static_cast<uint64_t>(_current_time), static_cast<uint64_t>(_current_time),
c->immutable_options()->preserve_internal_time_seconds, c->mutable_cf_options()->preserve_internal_time_seconds,
c->immutable_options()->preclude_last_level_data_seconds, c->mutable_cf_options()->preclude_last_level_data_seconds,
&preserve_time_min_seqno_, &preclude_last_level_min_seqno_); &preserve_time_min_seqno_, &preclude_last_level_min_seqno_);
} }
// For accuracy of the GetProximalSeqnoBeforeTime queries above, we only // For accuracy of the GetProximalSeqnoBeforeTime queries above, we only
@ -468,7 +469,7 @@ void CompactionJob::GenSubcompactionBoundaries() {
ReadOptions read_options(Env::IOActivity::kCompaction); ReadOptions read_options(Env::IOActivity::kCompaction);
read_options.rate_limiter_priority = GetRateLimiterPriority(); read_options.rate_limiter_priority = GetRateLimiterPriority();
auto* c = compact_->compaction; auto* c = compact_->compaction;
if (c->immutable_options()->table_factory->Name() == if (c->mutable_cf_options()->table_factory->Name() ==
TableFactory::kPlainTableName()) { TableFactory::kPlainTableName()) {
return; return;
} }
@ -505,9 +506,7 @@ void CompactionJob::GenSubcompactionBoundaries() {
FileMetaData* f = flevel->files[i].file_metadata; FileMetaData* f = flevel->files[i].file_metadata;
std::vector<TableReader::Anchor> my_anchors; std::vector<TableReader::Anchor> my_anchors;
Status s = cfd->table_cache()->ApproximateKeyAnchors( Status s = cfd->table_cache()->ApproximateKeyAnchors(
read_options, icomp, *f, read_options, icomp, *f, *c->mutable_cf_options(), my_anchors);
c->mutable_cf_options()->block_protection_bytes_per_key,
my_anchors);
if (!s.ok() || my_anchors.empty()) { if (!s.ok() || my_anchors.empty()) {
my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize()); my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
} }
@ -710,8 +709,6 @@ Status CompactionJob::Run() {
} }
} }
ColumnFamilyData* cfd = compact_->compaction->column_family_data(); ColumnFamilyData* cfd = compact_->compaction->column_family_data();
auto& prefix_extractor =
compact_->compaction->mutable_cf_options()->prefix_extractor;
std::atomic<size_t> next_file_idx(0); std::atomic<size_t> next_file_idx(0);
auto verify_table = [&](Status& output_status) { auto verify_table = [&](Status& output_status) {
while (true) { while (true) {
@ -732,7 +729,8 @@ Status CompactionJob::Run() {
InternalIterator* iter = cfd->table_cache()->NewIterator( InternalIterator* iter = cfd->table_cache()->NewIterator(
verify_table_read_options, file_options_, verify_table_read_options, file_options_,
cfd->internal_comparator(), files_output[file_idx]->meta, cfd->internal_comparator(), files_output[file_idx]->meta,
/*range_del_agg=*/nullptr, prefix_extractor, /*range_del_agg=*/nullptr,
*compact_->compaction->mutable_cf_options(),
/*table_reader_ptr=*/nullptr, /*table_reader_ptr=*/nullptr,
cfd->internal_stats()->GetFileReadHist( cfd->internal_stats()->GetFileReadHist(
compact_->compaction->output_level()), compact_->compaction->output_level()),
@ -742,9 +740,7 @@ Status CompactionJob::Run() {
*compact_->compaction->mutable_cf_options()), *compact_->compaction->mutable_cf_options()),
/*smallest_compaction_key=*/nullptr, /*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr,
/*allow_unprepared_value=*/false, /*allow_unprepared_value=*/false);
compact_->compaction->mutable_cf_options()
->block_protection_bytes_per_key);
auto s = iter->status(); auto s = iter->status();
if (s.ok() && paranoid_file_checks_) { if (s.ok() && paranoid_file_checks_) {
@ -805,6 +801,12 @@ Status CompactionJob::Run() {
} }
} }
// Before the compaction starts, is_remote_compaction was set to true if
// compaction_service is set. We now know whether each sub_compaction was
// done remotely or not. Reset is_remote_compaction back to false and allow
// AggregateCompactionStats() to set the right value.
compaction_job_stats_->is_remote_compaction = false;
// Finish up all bookkeeping to unify the subcompaction results. // Finish up all bookkeeping to unify the subcompaction results.
compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
uint64_t num_input_range_del = 0; uint64_t num_input_range_del = 0;
@ -1083,6 +1085,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
} }
// fallback to local compaction // fallback to local compaction
assert(comp_status == CompactionServiceJobStatus::kUseLocal); assert(comp_status == CompactionServiceJobStatus::kUseLocal);
sub_compact->compaction_job_stats.is_remote_compaction = false;
} }
uint64_t prev_cpu_micros = db_options_.clock->CPUMicros(); uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
@ -1911,6 +1914,10 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
oldest_ancester_time = current_time; oldest_ancester_time = current_time;
} }
uint64_t newest_key_time = sub_compact->compaction->MaxInputFileNewestKeyTime(
sub_compact->start.has_value() ? &tmp_start : nullptr,
sub_compact->end.has_value() ? &tmp_end : nullptr);
// Initialize a SubcompactionState::Output and add it to sub_compact->outputs // Initialize a SubcompactionState::Output and add it to sub_compact->outputs
uint64_t epoch_number = sub_compact->compaction->MinInputFileEpochNumber(); uint64_t epoch_number = sub_compact->compaction->MinInputFileEpochNumber();
{ {
@ -1960,7 +1967,7 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
cfd->internal_tbl_prop_coll_factories(), cfd->internal_tbl_prop_coll_factories(),
sub_compact->compaction->output_compression(), sub_compact->compaction->output_compression(),
sub_compact->compaction->output_compression_opts(), cfd->GetID(), sub_compact->compaction->output_compression_opts(), cfd->GetID(),
cfd->GetName(), sub_compact->compaction->output_level(), cfd->GetName(), sub_compact->compaction->output_level(), newest_key_time,
bottommost_level_, TableFileCreationReason::kCompaction, bottommost_level_, TableFileCreationReason::kCompaction,
0 /* oldest_key_time */, current_time, db_id_, db_session_id_, 0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
sub_compact->compaction->max_output_file_size(), file_number, sub_compact->compaction->max_output_file_size(), file_number,
@ -2000,10 +2007,12 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
bool has_error = false; bool has_error = false;
const ReadOptions read_options(Env::IOActivity::kCompaction); const ReadOptions read_options(Env::IOActivity::kCompaction);
const auto& input_table_properties = compaction->GetInputTableProperties(); const auto& input_table_properties = compaction->GetInputTableProperties();
// TODO(yuzhangyu): add dedicated stats for filtered files.
for (int input_level = 0; for (int input_level = 0;
input_level < static_cast<int>(compaction->num_input_levels()); input_level < static_cast<int>(compaction->num_input_levels());
++input_level) { ++input_level) {
size_t num_input_files = compaction->num_input_files(input_level); const LevelFilesBrief* flevel = compaction->input_levels(input_level);
size_t num_input_files = flevel->num_files;
uint64_t* bytes_read; uint64_t* bytes_read;
if (compaction->level(input_level) != compaction->output_level()) { if (compaction->level(input_level) != compaction->output_level()) {
compaction_stats_.stats.num_input_files_in_non_output_levels += compaction_stats_.stats.num_input_files_in_non_output_levels +=
@ -2015,7 +2024,7 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
bytes_read = &compaction_stats_.stats.bytes_read_output_level; bytes_read = &compaction_stats_.stats.bytes_read_output_level;
} }
for (size_t i = 0; i < num_input_files; ++i) { for (size_t i = 0; i < num_input_files; ++i) {
const FileMetaData* file_meta = compaction->input(input_level, i); const FileMetaData* file_meta = flevel->files[i].file_metadata;
*bytes_read += file_meta->fd.GetFileSize(); *bytes_read += file_meta->fd.GetFileSize();
uint64_t file_input_entries = file_meta->num_entries; uint64_t file_input_entries = file_meta->num_entries;
uint64_t file_num_range_del = file_meta->num_range_deletions; uint64_t file_num_range_del = file_meta->num_range_deletions;

View File

@ -209,6 +209,8 @@ class CompactionJob {
// Returns true iff compaction_stats_.stats.num_input_records and // Returns true iff compaction_stats_.stats.num_input_records and
// num_input_range_del are calculated successfully. // num_input_range_del are calculated successfully.
bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr); bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr);
virtual void UpdateCompactionJobStats(
const InternalStats::CompactionStats& stats) const;
void LogCompaction(); void LogCompaction();
virtual void RecordCompactionIOStats(); virtual void RecordCompactionIOStats();
void CleanupCompaction(); void CleanupCompaction();
@ -279,8 +281,7 @@ class CompactionJob {
bool* compaction_released); bool* compaction_released);
Status OpenCompactionOutputFile(SubcompactionState* sub_compact, Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
CompactionOutputs& outputs); CompactionOutputs& outputs);
void UpdateCompactionJobStats(
const InternalStats::CompactionStats& stats) const;
void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
CompactionJobStats* compaction_job_stats = nullptr); CompactionJobStats* compaction_job_stats = nullptr);
@ -377,9 +378,7 @@ class CompactionJob {
// doesn't contain the LSM tree information, which is passed though MANIFEST // doesn't contain the LSM tree information, which is passed though MANIFEST
// file. // file.
struct CompactionServiceInput { struct CompactionServiceInput {
ColumnFamilyDescriptor column_family; std::string cf_name;
DBOptions db_options;
std::vector<SequenceNumber> snapshots; std::vector<SequenceNumber> snapshots;
@ -387,7 +386,7 @@ struct CompactionServiceInput {
// files needed for this compaction, for both input level files and output // files needed for this compaction, for both input level files and output
// level files. // level files.
std::vector<std::string> input_files; std::vector<std::string> input_files;
int output_level; int output_level = 0;
// db_id is used to generate unique id of sst on the remote compactor // db_id is used to generate unique id of sst on the remote compactor
std::string db_id; std::string db_id;
@ -398,13 +397,12 @@ struct CompactionServiceInput {
bool has_end = false; bool has_end = false;
std::string end; std::string end;
uint64_t options_file_number = 0;
// serialization interface to read and write the object // serialization interface to read and write the object
static Status Read(const std::string& data_str, CompactionServiceInput* obj); static Status Read(const std::string& data_str, CompactionServiceInput* obj);
Status Write(std::string* output); Status Write(std::string* output);
// Initialize a dummy ColumnFamilyDescriptor
CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {}
#ifndef NDEBUG #ifndef NDEBUG
bool TEST_Equals(CompactionServiceInput* other); bool TEST_Equals(CompactionServiceInput* other);
bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch); bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch);
@ -418,20 +416,25 @@ struct CompactionServiceOutputFile {
SequenceNumber largest_seqno; SequenceNumber largest_seqno;
std::string smallest_internal_key; std::string smallest_internal_key;
std::string largest_internal_key; std::string largest_internal_key;
uint64_t oldest_ancester_time; uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
uint64_t file_creation_time; uint64_t file_creation_time = kUnknownFileCreationTime;
uint64_t epoch_number; uint64_t epoch_number = kUnknownEpochNumber;
std::string file_checksum = kUnknownFileChecksum;
std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
uint64_t paranoid_hash; uint64_t paranoid_hash;
bool marked_for_compaction; bool marked_for_compaction;
UniqueId64x2 unique_id; UniqueId64x2 unique_id{};
TableProperties table_properties;
CompactionServiceOutputFile() = default; CompactionServiceOutputFile() = default;
CompactionServiceOutputFile( CompactionServiceOutputFile(
const std::string& name, SequenceNumber smallest, SequenceNumber largest, const std::string& name, SequenceNumber smallest, SequenceNumber largest,
std::string _smallest_internal_key, std::string _largest_internal_key, std::string _smallest_internal_key, std::string _largest_internal_key,
uint64_t _oldest_ancester_time, uint64_t _file_creation_time, uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
uint64_t _epoch_number, uint64_t _paranoid_hash, uint64_t _epoch_number, const std::string& _file_checksum,
bool _marked_for_compaction, UniqueId64x2 _unique_id) const std::string& _file_checksum_func_name, uint64_t _paranoid_hash,
bool _marked_for_compaction, UniqueId64x2 _unique_id,
const TableProperties& _table_properties)
: file_name(name), : file_name(name),
smallest_seqno(smallest), smallest_seqno(smallest),
largest_seqno(largest), largest_seqno(largest),
@ -440,9 +443,12 @@ struct CompactionServiceOutputFile {
oldest_ancester_time(_oldest_ancester_time), oldest_ancester_time(_oldest_ancester_time),
file_creation_time(_file_creation_time), file_creation_time(_file_creation_time),
epoch_number(_epoch_number), epoch_number(_epoch_number),
file_checksum(_file_checksum),
file_checksum_func_name(_file_checksum_func_name),
paranoid_hash(_paranoid_hash), paranoid_hash(_paranoid_hash),
marked_for_compaction(_marked_for_compaction), marked_for_compaction(_marked_for_compaction),
unique_id(std::move(_unique_id)) {} unique_id(std::move(_unique_id)),
table_properties(_table_properties) {}
}; };
// CompactionServiceResult contains the compaction result from a different db // CompactionServiceResult contains the compaction result from a different db
@ -451,14 +457,11 @@ struct CompactionServiceOutputFile {
struct CompactionServiceResult { struct CompactionServiceResult {
Status status; Status status;
std::vector<CompactionServiceOutputFile> output_files; std::vector<CompactionServiceOutputFile> output_files;
int output_level; int output_level = 0;
// location of the output files // location of the output files
std::string output_path; std::string output_path;
// some statistics about the compaction
uint64_t num_output_records = 0;
uint64_t total_bytes = 0;
uint64_t bytes_read = 0; uint64_t bytes_read = 0;
uint64_t bytes_written = 0; uint64_t bytes_written = 0;
CompactionJobStats stats; CompactionJobStats stats;
@ -504,6 +507,9 @@ class CompactionServiceCompactionJob : private CompactionJob {
protected: protected:
void RecordCompactionIOStats() override; void RecordCompactionIOStats() override;
void UpdateCompactionJobStats(
const InternalStats::CompactionStats& stats) const override;
private: private:
// Get table file name in output_path // Get table file name in output_path
std::string GetTableFileName(uint64_t file_number) override; std::string GetTableFileName(uint64_t file_number) override;

View File

@ -50,7 +50,8 @@ void VerifyInitializationOfCompactionJobStats(
ASSERT_EQ(compaction_job_stats.num_output_records, 0U); ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
ASSERT_EQ(compaction_job_stats.num_output_files, 0U); ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
ASSERT_EQ(compaction_job_stats.is_manual_compaction, true); ASSERT_TRUE(compaction_job_stats.is_manual_compaction);
ASSERT_FALSE(compaction_job_stats.is_remote_compaction);
ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U); ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U); ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
@ -249,6 +250,7 @@ class CompactionJobTestBase : public testing::Test {
} else { } else {
assert(false); assert(false);
} }
mutable_cf_options_.table_factory = cf_options_.table_factory;
} }
std::string GenerateFileName(uint64_t file_number) { std::string GenerateFileName(uint64_t file_number) {
@ -299,13 +301,13 @@ class CompactionJobTestBase : public testing::Test {
const WriteOptions write_options; const WriteOptions write_options;
std::unique_ptr<TableBuilder> table_builder( std::unique_ptr<TableBuilder> table_builder(
cf_options_.table_factory->NewTableBuilder( cf_options_.table_factory->NewTableBuilder(
TableBuilderOptions(*cfd_->ioptions(), mutable_cf_options_, TableBuilderOptions(
read_options, write_options, *cfd_->ioptions(), mutable_cf_options_, read_options,
cfd_->internal_comparator(), write_options, cfd_->internal_comparator(),
cfd_->internal_tbl_prop_coll_factories(), cfd_->internal_tbl_prop_coll_factories(),
CompressionType::kNoCompression, CompressionType::kNoCompression, CompressionOptions(),
CompressionOptions(), 0 /* column_family_id */, 0 /* column_family_id */, kDefaultColumnFamilyName,
kDefaultColumnFamilyName, -1 /* level */), -1 /* level */, kUnknownNewestKeyTime),
file_writer.get())); file_writer.get()));
// Build table. // Build table.
for (const auto& kv : contents) { for (const auto& kv : contents) {
@ -545,14 +547,14 @@ class CompactionJobTestBase : public testing::Test {
ASSERT_OK(s); ASSERT_OK(s);
db_options_.info_log = info_log; db_options_.info_log = info_log;
versions_.reset(new VersionSet( versions_.reset(
dbname_, &db_options_, env_options_, table_cache_.get(), new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
&write_buffer_manager_, &write_controller_, &write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", test::kUnitTestDbId, /*db_session_id=*/"",
/*error_handler=*/nullptr, /*read_only=*/false)); /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*read_only=*/false));
compaction_job_stats_.Reset(); compaction_job_stats_.Reset();
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
VersionEdit new_db; VersionEdit new_db;
new_db.SetLogNumber(0); new_db.SetLogNumber(0);
@ -575,7 +577,8 @@ class CompactionJobTestBase : public testing::Test {
} }
ASSERT_OK(s); ASSERT_OK(s);
// Make "CURRENT" file that points to the new manifest file. // Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr); s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
Temperature::kUnknown, nullptr);
ASSERT_OK(s); ASSERT_OK(s);
@ -649,7 +652,8 @@ class CompactionJobTestBase : public testing::Test {
mutable_cf_options_.target_file_size_base, mutable_cf_options_.target_file_size_base,
mutable_cf_options_.max_compaction_bytes, 0, kNoCompression, mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
cfd->GetLatestMutableCFOptions()->compression_opts, cfd->GetLatestMutableCFOptions()->compression_opts,
Temperature::kUnknown, max_subcompactions, grandparents, true); Temperature::kUnknown, max_subcompactions, grandparents,
/*earliest_snapshot*/ std::nullopt, /*snapshot_checker*/ nullptr, true);
compaction.FinalizeInputInfo(cfd->current()); compaction.FinalizeInputInfo(cfd->current());
assert(db_options_.info_log); assert(db_options_.info_log);
@ -1567,17 +1571,7 @@ TEST_F(CompactionJobTest, InputSerialization) {
const int kStrMaxLen = 1000; const int kStrMaxLen = 1000;
Random rnd(static_cast<uint32_t>(time(nullptr))); Random rnd(static_cast<uint32_t>(time(nullptr)));
Random64 rnd64(time(nullptr)); Random64 rnd64(time(nullptr));
input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen)); input.cf_name = rnd.RandomString(rnd.Uniform(kStrMaxLen));
input.column_family.options.comparator = ReverseBytewiseComparator();
input.column_family.options.max_bytes_for_level_base =
rnd64.Uniform(UINT64_MAX);
input.column_family.options.disable_auto_compactions = rnd.OneIn(2);
input.column_family.options.compression = kZSTD;
input.column_family.options.compression_opts.level = 4;
input.db_options.max_background_flushes = 10;
input.db_options.paranoid_checks = rnd.OneIn(2);
input.db_options.statistics = CreateDBStatistics();
input.db_options.env = env_;
while (!rnd.OneIn(10)) { while (!rnd.OneIn(10)) {
input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX)); input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX));
} }
@ -1605,10 +1599,10 @@ TEST_F(CompactionJobTest, InputSerialization) {
ASSERT_TRUE(deserialized1.TEST_Equals(&input)); ASSERT_TRUE(deserialized1.TEST_Equals(&input));
// Test mismatch // Test mismatch
deserialized1.db_options.max_background_flushes += 10; deserialized1.output_level += 10;
std::string mismatch; std::string mismatch;
ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch)); ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch));
ASSERT_EQ(mismatch, "db_options.max_background_flushes"); ASSERT_EQ(mismatch, "output_level");
// Test unknown field // Test unknown field
CompactionServiceInput deserialized2; CompactionServiceInput deserialized2;
@ -1664,20 +1658,40 @@ TEST_F(CompactionJobTest, ResultSerialization) {
}; };
result.status = result.status =
status_list.at(rnd.Uniform(static_cast<int>(status_list.size()))); status_list.at(rnd.Uniform(static_cast<int>(status_list.size())));
std::string file_checksum = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
std::string file_checksum_func_name = "MyAwesomeChecksumGenerator";
while (!rnd.OneIn(10)) { while (!rnd.OneIn(10)) {
TableProperties tp;
tp.user_collected_properties.emplace(
"UCP_Key1", rnd.RandomString(rnd.Uniform(kStrMaxLen)));
tp.user_collected_properties.emplace(
"UCP_Key2", rnd.RandomString(rnd.Uniform(kStrMaxLen)));
tp.readable_properties.emplace("RP_Key1",
rnd.RandomString(rnd.Uniform(kStrMaxLen)));
tp.readable_properties.emplace("RP_K2y2",
rnd.RandomString(rnd.Uniform(kStrMaxLen)));
UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)}; UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)};
result.output_files.emplace_back( result.output_files.emplace_back(
rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), rnd.RandomString(rnd.Uniform(kStrMaxLen)) /* file_name */,
rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX) /* smallest_seqno */,
rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX) /* largest_seqno */,
rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), rnd.RandomBinaryString(
rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), rnd.Uniform(kStrMaxLen)) /* smallest_internal_key */,
rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id); rnd.RandomBinaryString(
rnd.Uniform(kStrMaxLen)) /* largest_internal_key */,
rnd64.Uniform(UINT64_MAX) /* oldest_ancester_time */,
rnd64.Uniform(UINT64_MAX) /* file_creation_time */,
rnd64.Uniform(UINT64_MAX) /* epoch_number */,
file_checksum /* file_checksum */,
file_checksum_func_name /* file_checksum_func_name */,
rnd64.Uniform(UINT64_MAX) /* paranoid_hash */,
rnd.OneIn(2) /* marked_for_compaction */, id /* unique_id */, tp);
} }
result.output_level = rnd.Uniform(10); result.output_level = rnd.Uniform(10);
result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen)); result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
result.num_output_records = rnd64.Uniform(UINT64_MAX); result.stats.num_output_records = rnd64.Uniform(UINT64_MAX);
result.total_bytes = rnd64.Uniform(UINT64_MAX);
result.bytes_read = 123; result.bytes_read = 123;
result.bytes_written = rnd64.Uniform(UINT64_MAX); result.bytes_written = rnd64.Uniform(UINT64_MAX);
result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX); result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX);
@ -1694,6 +1708,21 @@ TEST_F(CompactionJobTest, ResultSerialization) {
ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1)); ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1));
ASSERT_TRUE(deserialized1.TEST_Equals(&result)); ASSERT_TRUE(deserialized1.TEST_Equals(&result));
for (size_t i = 0; i < result.output_files.size(); i++) {
for (const auto& prop :
result.output_files[i].table_properties.user_collected_properties) {
ASSERT_EQ(deserialized1.output_files[i]
.table_properties.user_collected_properties[prop.first],
prop.second);
}
for (const auto& prop :
result.output_files[i].table_properties.readable_properties) {
ASSERT_EQ(deserialized1.output_files[i]
.table_properties.readable_properties[prop.first],
prop.second);
}
}
// Test mismatch // Test mismatch
deserialized1.stats.num_input_files += 10; deserialized1.stats.num_input_files += 10;
std::string mismatch; std::string mismatch;
@ -1708,6 +1737,10 @@ TEST_F(CompactionJobTest, ResultSerialization) {
ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch)); ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch));
ASSERT_EQ(mismatch, "output_files.unique_id"); ASSERT_EQ(mismatch, "output_files.unique_id");
deserialized_tmp.status.PermitUncheckedError(); deserialized_tmp.status.PermitUncheckedError();
ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum, file_checksum);
ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum_func_name,
file_checksum_func_name);
} }
// Test unknown field // Test unknown field

View File

@ -62,8 +62,9 @@ class CompactionOutputs {
} }
// TODO: Remove it when remote compaction support tiered compaction // TODO: Remove it when remote compaction support tiered compaction
void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; } void AddBytesWritten(uint64_t bytes) { stats_.bytes_written += bytes; }
void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; } void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
void SetNumOutputFiles(uint64_t num) { stats_.num_output_files = num; }
// TODO: Move the BlobDB builder into CompactionOutputs // TODO: Move the BlobDB builder into CompactionOutputs
const std::vector<BlobFileAddition>& GetBlobFileAdditions() const { const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
@ -107,6 +108,12 @@ class CompactionOutputs {
Status Finish(const Status& intput_status, Status Finish(const Status& intput_status,
const SeqnoToTimeMapping& seqno_to_time_mapping); const SeqnoToTimeMapping& seqno_to_time_mapping);
// Update output table properties from already populated TableProperties.
// Used for remote compaction
void UpdateTableProperties(const TableProperties& table_properties) {
current_output().table_properties =
std::make_shared<TableProperties>(table_properties);
}
// Update output table properties from table builder // Update output table properties from table builder
void UpdateTableProperties() { void UpdateTableProperties() {
current_output().table_properties = current_output().table_properties =

View File

@ -133,7 +133,8 @@ CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions,
CompactionPicker::~CompactionPicker() = default; CompactionPicker::~CompactionPicker() = default;
// Delete this compaction from the list of running compactions. // Delete this compaction from the list of running compactions.
void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { void CompactionPicker::ReleaseCompactionFiles(Compaction* c,
const Status& status) {
UnregisterCompaction(c); UnregisterCompaction(c);
if (!status.ok()) { if (!status.ok()) {
c->ResetNextCompactionIndex(); c->ResetNextCompactionIndex();
@ -350,11 +351,11 @@ Compaction* CompactionPicker::CompactFiles(
break; break;
} }
} }
assert(output_level == 0 || assert(output_level == 0 || !FilesRangeOverlapWithCompaction(
!FilesRangeOverlapWithCompaction( input_files, output_level,
input_files, output_level, Compaction::EvaluatePenultimateLevel(
Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, vstorage, mutable_cf_options, ioptions_,
start_level, output_level))); start_level, output_level)));
#endif /* !NDEBUG */ #endif /* !NDEBUG */
CompressionType compression_type; CompressionType compression_type;
@ -379,7 +380,8 @@ Compaction* CompactionPicker::CompactFiles(
GetCompressionOptions(mutable_cf_options, vstorage, output_level), GetCompressionOptions(mutable_cf_options, vstorage, output_level),
mutable_cf_options.default_write_temperature, mutable_cf_options.default_write_temperature,
compact_options.max_subcompactions, compact_options.max_subcompactions,
/* grandparents */ {}, true); /* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr, true);
RegisterCompaction(c); RegisterCompaction(c);
return c; return c;
} }
@ -657,8 +659,9 @@ Compaction* CompactionPicker::CompactRange(
// overlaping outputs in the same level. // overlaping outputs in the same level.
if (FilesRangeOverlapWithCompaction( if (FilesRangeOverlapWithCompaction(
inputs, output_level, inputs, output_level,
Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, Compaction::EvaluatePenultimateLevel(vstorage, mutable_cf_options,
start_level, output_level))) { ioptions_, start_level,
output_level))) {
// This compaction output could potentially conflict with the output // This compaction output could potentially conflict with the output
// of a currently running compaction, we cannot run it. // of a currently running compaction, we cannot run it.
*manual_conflict = true; *manual_conflict = true;
@ -676,7 +679,9 @@ Compaction* CompactionPicker::CompactRange(
GetCompressionOptions(mutable_cf_options, vstorage, output_level), GetCompressionOptions(mutable_cf_options, vstorage, output_level),
mutable_cf_options.default_write_temperature, mutable_cf_options.default_write_temperature,
compact_range_options.max_subcompactions, compact_range_options.max_subcompactions,
/* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1, /* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ true, trim_ts, /* score */ -1,
/* deletion_compaction */ false, /* l0_files_might_overlap */ true, /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
CompactionReason::kUnknown, CompactionReason::kUnknown,
compact_range_options.blob_garbage_collection_policy, compact_range_options.blob_garbage_collection_policy,
@ -842,7 +847,8 @@ Compaction* CompactionPicker::CompactRange(
// overlaping outputs in the same level. // overlaping outputs in the same level.
if (FilesRangeOverlapWithCompaction( if (FilesRangeOverlapWithCompaction(
compaction_inputs, output_level, compaction_inputs, output_level,
Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, input_level, Compaction::EvaluatePenultimateLevel(vstorage, mutable_cf_options,
ioptions_, input_level,
output_level))) { output_level))) {
// This compaction output could potentially conflict with the output // This compaction output could potentially conflict with the output
// of a currently running compaction, we cannot run it. // of a currently running compaction, we cannot run it.
@ -865,6 +871,7 @@ Compaction* CompactionPicker::CompactRange(
GetCompressionOptions(mutable_cf_options, vstorage, output_level), GetCompressionOptions(mutable_cf_options, vstorage, output_level),
mutable_cf_options.default_write_temperature, mutable_cf_options.default_write_temperature,
compact_range_options.max_subcompactions, std::move(grandparents), compact_range_options.max_subcompactions, std::move(grandparents),
/* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
/* is manual */ true, trim_ts, /* score */ -1, /* is manual */ true, trim_ts, /* score */ -1,
/* deletion_compaction */ false, /* l0_files_might_overlap */ true, /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
CompactionReason::kUnknown, CompactionReason::kUnknown,
@ -1044,10 +1051,12 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
} }
Status CompactionPicker::SanitizeAndConvertCompactionInputFiles( Status CompactionPicker::SanitizeAndConvertCompactionInputFiles(
std::unordered_set<uint64_t>* input_files, std::unordered_set<uint64_t>* input_files, const int output_level,
const ColumnFamilyMetaData& cf_meta, const int output_level, Version* version,
const VersionStorageInfo* vstorage,
std::vector<CompactionInputFiles>* converted_input_files) const { std::vector<CompactionInputFiles>* converted_input_files) const {
ColumnFamilyMetaData cf_meta;
version->GetColumnFamilyMetaData(&cf_meta);
assert(static_cast<int>(cf_meta.levels.size()) - 1 == assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
cf_meta.levels[cf_meta.levels.size() - 1].level); cf_meta.levels[cf_meta.levels.size() - 1].level);
assert(converted_input_files); assert(converted_input_files);
@ -1118,7 +1127,8 @@ Status CompactionPicker::SanitizeAndConvertCompactionInputFiles(
} }
s = GetCompactionInputsFromFileNumbers(converted_input_files, input_files, s = GetCompactionInputsFromFileNumbers(converted_input_files, input_files,
vstorage, CompactionOptions()); version->storage_info(),
CompactionOptions());
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -1127,8 +1137,8 @@ Status CompactionPicker::SanitizeAndConvertCompactionInputFiles(
FilesRangeOverlapWithCompaction( FilesRangeOverlapWithCompaction(
*converted_input_files, output_level, *converted_input_files, output_level,
Compaction::EvaluatePenultimateLevel( Compaction::EvaluatePenultimateLevel(
vstorage, ioptions_, (*converted_input_files)[0].level, version->storage_info(), version->GetMutableCFOptions(),
output_level))) { ioptions_, (*converted_input_files)[0].level, output_level))) {
return Status::Aborted( return Status::Aborted(
"A running compaction is writing to the same output level(s) in an " "A running compaction is writing to the same output level(s) in an "
"overlapping key range"); "overlapping key range");
@ -1170,7 +1180,8 @@ void CompactionPicker::UnregisterCompaction(Compaction* c) {
void CompactionPicker::PickFilesMarkedForCompaction( void CompactionPicker::PickFilesMarkedForCompaction(
const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level, const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
int* output_level, CompactionInputFiles* start_level_inputs) { int* output_level, CompactionInputFiles* start_level_inputs,
std::function<bool(const FileMetaData*)> skip_marked_file) {
if (vstorage->FilesMarkedForCompaction().empty()) { if (vstorage->FilesMarkedForCompaction().empty()) {
return; return;
} }
@ -1180,6 +1191,9 @@ void CompactionPicker::PickFilesMarkedForCompaction(
// If this assert() fails that means that some function marked some // If this assert() fails that means that some function marked some
// files as being_compacted, but didn't call ComputeCompactionScore() // files as being_compacted, but didn't call ComputeCompactionScore()
assert(!level_file.second->being_compacted); assert(!level_file.second->being_compacted);
if (skip_marked_file(level_file.second)) {
return false;
}
*start_level = level_file.first; *start_level = level_file.first;
*output_level = *output_level =
(*start_level == 0) ? vstorage->base_level() : *start_level + 1; (*start_level == 0) ? vstorage->base_level() : *start_level + 1;

View File

@ -16,6 +16,7 @@
#include <vector> #include <vector>
#include "db/compaction/compaction.h" #include "db/compaction/compaction.h"
#include "db/snapshot_checker.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "options/cf_options.h" #include "options/cf_options.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
@ -55,17 +56,17 @@ class CompactionPicker {
// Returns nullptr if there is no compaction to be done. // Returns nullptr if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that // Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result. // describes the compaction. Caller should delete the result.
virtual Compaction* PickCompaction(const std::string& cf_name, // Currently, only universal compaction will query existing snapshots and
const MutableCFOptions& mutable_cf_options, // pass it to aid compaction picking. And it's only passed when user-defined
const MutableDBOptions& mutable_db_options, // timestamps is not enabled. The other compaction styles do not pass or use
VersionStorageInfo* vstorage, // `existing_snapshots` or `snapshot_checker`.
LogBuffer* log_buffer) = 0; virtual Compaction* PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) = 0;
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns nullptr if there is nothing in that
// level that overlaps the specified range. Caller should delete
// the result.
//
// The returned Compaction might not include the whole requested range. // The returned Compaction might not include the whole requested range.
// In that case, compaction_end will be set to the next key that needs // In that case, compaction_end will be set to the next key that needs
// compacting. In case the compaction will compact the whole range, // compacting. In case the compaction will compact the whole range,
@ -96,15 +97,14 @@ class CompactionPicker {
// non-ok status with specific reason. // non-ok status with specific reason.
// //
Status SanitizeAndConvertCompactionInputFiles( Status SanitizeAndConvertCompactionInputFiles(
std::unordered_set<uint64_t>* input_files, std::unordered_set<uint64_t>* input_files, const int output_level,
const ColumnFamilyMetaData& cf_meta, const int output_level, Version* version,
const VersionStorageInfo* vstorage,
std::vector<CompactionInputFiles>* converted_input_files) const; std::vector<CompactionInputFiles>* converted_input_files) const;
// Free up the files that participated in a compaction // Free up the files that participated in a compaction
// //
// Requirement: DB mutex held // Requirement: DB mutex held
void ReleaseCompactionFiles(Compaction* c, Status status); void ReleaseCompactionFiles(Compaction* c, const Status& status);
// Returns true if any one of the specified files are being compacted // Returns true if any one of the specified files are being compacted
bool AreFilesInCompaction(const std::vector<FileMetaData*>& files); bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
@ -203,10 +203,11 @@ class CompactionPicker {
const CompactionInputFiles& output_level_inputs, const CompactionInputFiles& output_level_inputs,
std::vector<FileMetaData*>* grandparents); std::vector<FileMetaData*>* grandparents);
void PickFilesMarkedForCompaction(const std::string& cf_name, void PickFilesMarkedForCompaction(
VersionStorageInfo* vstorage, const std::string& cf_name, VersionStorageInfo* vstorage,
int* start_level, int* output_level, int* start_level, int* output_level,
CompactionInputFiles* start_level_inputs); CompactionInputFiles* start_level_inputs,
std::function<bool(const FileMetaData*)> skip_marked_file);
bool GetOverlappingL0Files(VersionStorageInfo* vstorage, bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
CompactionInputFiles* start_level_inputs, CompactionInputFiles* start_level_inputs,
@ -257,11 +258,13 @@ class NullCompactionPicker : public CompactionPicker {
virtual ~NullCompactionPicker() {} virtual ~NullCompactionPicker() {}
// Always return "nullptr" // Always return "nullptr"
Compaction* PickCompaction(const std::string& /*cf_name*/, Compaction* PickCompaction(
const MutableCFOptions& /*mutable_cf_options*/, const std::string& /*cf_name*/,
const MutableDBOptions& /*mutable_db_options*/, const MutableCFOptions& /*mutable_cf_options*/,
VersionStorageInfo* /*vstorage*/, const MutableDBOptions& /*mutable_db_options*/,
LogBuffer* /* log_buffer */) override { const std::vector<SequenceNumber>& /*existing_snapshots*/,
const SnapshotChecker* /*snapshot_checker*/,
VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */) override {
return nullptr; return nullptr;
} }

View File

@ -79,10 +79,14 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
FileMetaData* f = *ritr; FileMetaData* f = *ritr;
assert(f); assert(f);
if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
uint64_t newest_key_time = f->TryGetNewestKeyTime();
uint64_t creation_time = uint64_t creation_time =
f->fd.table_reader->GetTableProperties()->creation_time; f->fd.table_reader->GetTableProperties()->creation_time;
if (creation_time == 0 || uint64_t est_newest_key_time = newest_key_time == kUnknownNewestKeyTime
creation_time >= (current_time - mutable_cf_options.ttl)) { ? creation_time
: newest_key_time;
if (est_newest_key_time == kUnknownNewestKeyTime ||
est_newest_key_time >= (current_time - mutable_cf_options.ttl)) {
break; break;
} }
} }
@ -102,15 +106,19 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
} }
for (const auto& f : inputs[0].files) { for (const auto& f : inputs[0].files) {
uint64_t creation_time = 0;
assert(f); assert(f);
uint64_t newest_key_time = f->TryGetNewestKeyTime();
uint64_t creation_time = 0;
if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) { if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
creation_time = f->fd.table_reader->GetTableProperties()->creation_time; creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
} }
uint64_t est_newest_key_time = newest_key_time == kUnknownNewestKeyTime
? creation_time
: newest_key_time;
ROCKS_LOG_BUFFER(log_buffer, ROCKS_LOG_BUFFER(log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64 "[%s] FIFO compaction: picking file %" PRIu64
" with creation time %" PRIu64 " for deletion", " with estimated newest key time %" PRIu64 " for deletion",
cf_name.c_str(), f->fd.GetNumber(), creation_time); cf_name.c_str(), f->fd.GetNumber(), est_newest_key_time);
} }
Compaction* c = new Compaction( Compaction* c = new Compaction(
@ -118,7 +126,9 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
std::move(inputs), 0, 0, 0, 0, kNoCompression, std::move(inputs), 0, 0, 0, 0, kNoCompression,
mutable_cf_options.compression_opts, mutable_cf_options.compression_opts,
mutable_cf_options.default_write_temperature, mutable_cf_options.default_write_temperature,
/* max_subcompactions */ 0, {}, /* is manual */ false, /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false,
/* trim_ts */ "", vstorage->CompactionScore(0), /* trim_ts */ "", vstorage->CompactionScore(0),
/* is deletion compaction */ true, /* l0_files_might_overlap */ true, /* is deletion compaction */ true, /* l0_files_might_overlap */ true,
CompactionReason::kFIFOTtl); CompactionReason::kFIFOTtl);
@ -188,7 +198,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
0 /* output path ID */, mutable_cf_options.compression, 0 /* output path ID */, mutable_cf_options.compression,
mutable_cf_options.compression_opts, mutable_cf_options.compression_opts,
mutable_cf_options.default_write_temperature, mutable_cf_options.default_write_temperature,
0 /* max_subcompactions */, {}, /* is manual */ false, 0 /* max_subcompactions */, {},
/* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr, /* is manual */ false,
/* trim_ts */ "", vstorage->CompactionScore(0), /* trim_ts */ "", vstorage->CompactionScore(0),
/* is deletion compaction */ false, /* is deletion compaction */ false,
/* l0_files_might_overlap */ true, /* l0_files_might_overlap */ true,
@ -284,7 +296,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
/* output_path_id */ 0, kNoCompression, /* output_path_id */ 0, kNoCompression,
mutable_cf_options.compression_opts, mutable_cf_options.compression_opts,
mutable_cf_options.default_write_temperature, mutable_cf_options.default_write_temperature,
/* max_subcompactions */ 0, {}, /* is manual */ false, /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false,
/* trim_ts */ "", vstorage->CompactionScore(0), /* trim_ts */ "", vstorage->CompactionScore(0),
/* is deletion compaction */ true, /* is deletion compaction */ true,
/* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize); /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize);
@ -294,7 +308,7 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) { LogBuffer* log_buffer) const {
const std::vector<FileTemperatureAge>& ages = const std::vector<FileTemperatureAge>& ages =
mutable_cf_options.compaction_options_fifo mutable_cf_options.compaction_options_fifo
.file_temperature_age_thresholds; .file_temperature_age_thresholds;
@ -344,73 +358,47 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
Temperature compaction_target_temp = Temperature::kLastTemperature; Temperature compaction_target_temp = Temperature::kLastTemperature;
if (current_time > min_age) { if (current_time > min_age) {
uint64_t create_time_threshold = current_time - min_age; uint64_t create_time_threshold = current_time - min_age;
uint64_t compaction_size = 0;
// We will ideally identify a file qualifying for temperature change by
// knowing the timestamp for the youngest entry in the file. However, right
// now we don't have the information. We infer it by looking at timestamp of
// the previous file's (which is just younger) oldest entry's timestamp.
Temperature cur_target_temp;
// avoid index underflow
assert(level_files.size() >= 1); assert(level_files.size() >= 1);
for (size_t index = level_files.size() - 1; index >= 1; --index) { for (size_t index = level_files.size(); index >= 1; --index) {
// Try to add cur_file to compaction inputs. // Try to add cur_file to compaction inputs.
FileMetaData* cur_file = level_files[index]; FileMetaData* cur_file = level_files[index - 1];
// prev_file is just younger than cur_file FileMetaData* prev_file = index < 2 ? nullptr : level_files[index - 2];
FileMetaData* prev_file = level_files[index - 1];
if (cur_file->being_compacted) { if (cur_file->being_compacted) {
// Should not happen since we check for // Should not happen since we check for
// `level0_compactions_in_progress_` above. Here we simply just don't // `level0_compactions_in_progress_` above. Here we simply just don't
// schedule anything. // schedule anything.
return nullptr; return nullptr;
} }
uint64_t oldest_ancestor_time = prev_file->TryGetOldestAncesterTime(); uint64_t est_newest_key_time = cur_file->TryGetNewestKeyTime(prev_file);
if (oldest_ancestor_time == kUnknownOldestAncesterTime) { // Newer file could have newest_key_time populated
// Older files might not have enough information. It is possible to if (est_newest_key_time == kUnknownNewestKeyTime) {
// handle these files by looking at newer files, but maintaining the continue;
// logic isn't worth it. }
if (est_newest_key_time > create_time_threshold) {
break; break;
} }
if (oldest_ancestor_time > create_time_threshold) { Temperature cur_target_temp = ages[0].temperature;
// cur_file is too fresh
break;
}
cur_target_temp = ages[0].temperature;
for (size_t i = 1; i < ages.size(); ++i) { for (size_t i = 1; i < ages.size(); ++i) {
if (current_time >= ages[i].age && if (current_time >= ages[i].age &&
oldest_ancestor_time <= current_time - ages[i].age) { est_newest_key_time <= current_time - ages[i].age) {
cur_target_temp = ages[i].temperature; cur_target_temp = ages[i].temperature;
} }
} }
if (cur_file->temperature == cur_target_temp) { if (cur_file->temperature == cur_target_temp) {
if (inputs[0].empty()) { continue;
continue;
} else {
break;
}
} }
// cur_file needs to change temperature // cur_file needs to change temperature
if (compaction_target_temp == Temperature::kLastTemperature) { assert(compaction_target_temp == Temperature::kLastTemperature);
assert(inputs[0].empty()); compaction_target_temp = cur_target_temp;
compaction_target_temp = cur_target_temp; inputs[0].files.push_back(cur_file);
} else if (cur_target_temp != compaction_target_temp) { ROCKS_LOG_BUFFER(
assert(!inputs[0].empty()); log_buffer,
break; "[%s] FIFO compaction: picking file %" PRIu64
} " with estimated newest key time %" PRIu64 " for temperature %s.",
if (inputs[0].empty() || compaction_size + cur_file->fd.GetFileSize() <= cf_name.c_str(), cur_file->fd.GetNumber(), est_newest_key_time,
mutable_cf_options.max_compaction_bytes) { temperature_to_string[cur_target_temp].c_str());
inputs[0].files.push_back(cur_file); break;
compaction_size += cur_file->fd.GetFileSize();
ROCKS_LOG_BUFFER(
log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64
" with next file's oldest time %" PRIu64 " for temperature %s.",
cf_name.c_str(), cur_file->fd.GetNumber(), oldest_ancestor_time,
temperature_to_string[cur_target_temp].c_str());
}
if (compaction_size > mutable_cf_options.max_compaction_bytes) {
break;
}
} }
} }
@ -418,15 +406,18 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
return nullptr; return nullptr;
} }
assert(compaction_target_temp != Temperature::kLastTemperature); assert(compaction_target_temp != Temperature::kLastTemperature);
// Only compact one file at a time.
assert(inputs.size() == 1);
assert(inputs[0].size() == 1);
Compaction* c = new Compaction( Compaction* c = new Compaction(
vstorage, ioptions_, mutable_cf_options, mutable_db_options, vstorage, ioptions_, mutable_cf_options, mutable_db_options,
std::move(inputs), 0, 0 /* output file size limit */, std::move(inputs), 0, 0 /* output file size limit */,
0 /* max compaction bytes, not applicable */, 0 /* output path ID */, 0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
mutable_cf_options.compression, mutable_cf_options.compression_opts, mutable_cf_options.compression, mutable_cf_options.compression_opts,
compaction_target_temp, compaction_target_temp,
/* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "", /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
vstorage->CompactionScore(0), /* snapshot_checker */ nullptr,
/* is manual */ false, /* trim_ts */ "", vstorage->CompactionScore(0),
/* is deletion compaction */ false, /* l0_files_might_overlap */ true, /* is deletion compaction */ false, /* l0_files_might_overlap */ true,
CompactionReason::kChangeTemperature); CompactionReason::kChangeTemperature);
return c; return c;
@ -434,7 +425,9 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
Compaction* FIFOCompactionPicker::PickCompaction( Compaction* FIFOCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& /* existing_snapshots */,
const SnapshotChecker* /* snapshot_checker */, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) { LogBuffer* log_buffer) {
Compaction* c = nullptr; Compaction* c = nullptr;
if (mutable_cf_options.ttl > 0) { if (mutable_cf_options.ttl > 0) {
@ -469,8 +462,10 @@ Compaction* FIFOCompactionPicker::CompactRange(
assert(output_level == 0); assert(output_level == 0);
*compaction_end = nullptr; *compaction_end = nullptr;
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger);
Compaction* c = PickCompaction(cf_name, mutable_cf_options, Compaction* c =
mutable_db_options, vstorage, &log_buffer); PickCompaction(cf_name, mutable_cf_options, mutable_db_options,
/*existing_snapshots*/ {}, /*snapshot_checker*/ nullptr,
vstorage, &log_buffer);
log_buffer.FlushBufferToLog(); log_buffer.FlushBufferToLog();
return c; return c;
} }

View File

@ -18,11 +18,12 @@ class FIFOCompactionPicker : public CompactionPicker {
const InternalKeyComparator* icmp) const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {} : CompactionPicker(ioptions, icmp) {}
Compaction* PickCompaction(const std::string& cf_name, Compaction* PickCompaction(
const MutableCFOptions& mutable_cf_options, const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, const MutableDBOptions& mutable_db_options,
VersionStorageInfo* version, const std::vector<SequenceNumber>& /* existing_snapshots */,
LogBuffer* log_buffer) override; const SnapshotChecker* /* snapshot_checker */,
VersionStorageInfo* version, LogBuffer* log_buffer) override;
Compaction* CompactRange(const std::string& cf_name, Compaction* CompactRange(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options, const MutableCFOptions& mutable_cf_options,
@ -53,9 +54,10 @@ class FIFOCompactionPicker : public CompactionPicker {
VersionStorageInfo* version, VersionStorageInfo* version,
LogBuffer* log_buffer); LogBuffer* log_buffer);
// Will pick one file to compact at a time, starting from the oldest file.
Compaction* PickTemperatureChangeCompaction( Compaction* PickTemperatureChangeCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
LogBuffer* log_buffer); LogBuffer* log_buffer) const;
}; };
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -262,7 +262,10 @@ void LevelCompactionBuilder::SetupInitialFiles() {
parent_index_ = base_index_ = -1; parent_index_ = base_index_ = -1;
compaction_picker_->PickFilesMarkedForCompaction( compaction_picker_->PickFilesMarkedForCompaction(
cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_); cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_,
/*skip_marked_file*/ [](const FileMetaData* /* file */) {
return false;
});
if (!start_level_inputs_.empty()) { if (!start_level_inputs_.empty()) {
compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
return; return;
@ -411,8 +414,9 @@ void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
&tmp_start_level_inputs) || &tmp_start_level_inputs) ||
compaction_picker_->FilesRangeOverlapWithCompaction( compaction_picker_->FilesRangeOverlapWithCompaction(
{tmp_start_level_inputs}, output_level_, {tmp_start_level_inputs}, output_level_,
Compaction::EvaluatePenultimateLevel( Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
vstorage_, ioptions_, start_level_, output_level_))) { ioptions_, start_level_,
output_level_))) {
// Constraint 1a // Constraint 1a
tmp_start_level_inputs.clear(); tmp_start_level_inputs.clear();
return; return;
@ -486,8 +490,9 @@ bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
// We need to disallow this from happening. // We need to disallow this from happening.
if (compaction_picker_->FilesRangeOverlapWithCompaction( if (compaction_picker_->FilesRangeOverlapWithCompaction(
compaction_inputs_, output_level_, compaction_inputs_, output_level_,
Compaction::EvaluatePenultimateLevel( Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
vstorage_, ioptions_, start_level_, output_level_))) { ioptions_, start_level_,
output_level_))) {
// This compaction output could potentially conflict with the output // This compaction output could potentially conflict with the output
// of a currently running compaction, we cannot run it. // of a currently running compaction, we cannot run it.
return false; return false;
@ -554,7 +559,9 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
vstorage_->base_level()), vstorage_->base_level()),
GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
mutable_cf_options_.default_write_temperature, mutable_cf_options_.default_write_temperature,
/* max_subcompactions */ 0, std::move(grandparents_), is_manual_, /* max_subcompactions */ 0, std::move(grandparents_),
/* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
is_manual_,
/* trim_ts */ "", start_level_score_, false /* deletion_compaction */, /* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
l0_files_might_overlap, compaction_reason_); l0_files_might_overlap, compaction_reason_);
@ -839,8 +846,9 @@ bool LevelCompactionBuilder::PickFileToCompact() {
&start_level_inputs_) || &start_level_inputs_) ||
compaction_picker_->FilesRangeOverlapWithCompaction( compaction_picker_->FilesRangeOverlapWithCompaction(
{start_level_inputs_}, output_level_, {start_level_inputs_}, output_level_,
Compaction::EvaluatePenultimateLevel( Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
vstorage_, ioptions_, start_level_, output_level_))) { ioptions_, start_level_,
output_level_))) {
// A locked (pending compaction) input-level file was pulled in due to // A locked (pending compaction) input-level file was pulled in due to
// user-key overlap. // user-key overlap.
start_level_inputs_.clear(); start_level_inputs_.clear();
@ -925,11 +933,15 @@ bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
} }
uint64_t l0_size = 0; uint64_t l0_size = 0;
for (const auto& file : l0_files) { for (const auto& file : l0_files) {
l0_size += file->fd.GetFileSize(); assert(file->compensated_file_size >= file->fd.GetFileSize());
// Compact down L0s with more deletions.
l0_size += file->compensated_file_size;
} }
const uint64_t min_lbase_size =
l0_size * static_cast<uint64_t>(std::max( // Avoid L0->Lbase compactions that are inefficient for write-amp.
10.0, mutable_cf_options_.max_bytes_for_level_multiplier)); const double kMultiplier =
std::max(10.0, mutable_cf_options_.max_bytes_for_level_multiplier) * 2;
const uint64_t min_lbase_size = MultiplyCheckOverflow(l0_size, kMultiplier);
assert(min_lbase_size >= l0_size); assert(min_lbase_size >= l0_size);
const std::vector<FileMetaData*>& lbase_files = const std::vector<FileMetaData*>& lbase_files =
vstorage_->LevelFiles(/*level=*/base_level); vstorage_->LevelFiles(/*level=*/base_level);
@ -963,7 +975,9 @@ bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
Compaction* LevelCompactionPicker::PickCompaction( Compaction* LevelCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& /*existing_snapshots */,
const SnapshotChecker* /*snapshot_checker*/, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) { LogBuffer* log_buffer) {
LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer, LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
mutable_cf_options, ioptions_, mutable_cf_options, ioptions_,

View File

@ -20,11 +20,12 @@ class LevelCompactionPicker : public CompactionPicker {
LevelCompactionPicker(const ImmutableOptions& ioptions, LevelCompactionPicker(const ImmutableOptions& ioptions,
const InternalKeyComparator* icmp) const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {} : CompactionPicker(ioptions, icmp) {}
Compaction* PickCompaction(const std::string& cf_name, Compaction* PickCompaction(
const MutableCFOptions& mutable_cf_options, const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, const MutableDBOptions& mutable_db_options,
VersionStorageInfo* vstorage, const std::vector<SequenceNumber>& /* existing_snapshots */,
LogBuffer* log_buffer) override; const SnapshotChecker* /* snapshot_checker */,
VersionStorageInfo* vstorage, LogBuffer* log_buffer) override;
bool NeedsCompaction(const VersionStorageInfo* vstorage) const override; bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;
}; };

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,9 @@ class UniversalCompactionBuilder {
UniversalCompactionBuilder( UniversalCompactionBuilder(
const ImmutableOptions& ioptions, const InternalKeyComparator* icmp, const ImmutableOptions& ioptions, const InternalKeyComparator* icmp,
const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
UniversalCompactionPicker* picker, LogBuffer* log_buffer) UniversalCompactionPicker* picker, LogBuffer* log_buffer)
: ioptions_(ioptions), : ioptions_(ioptions),
icmp_(icmp), icmp_(icmp),
@ -44,7 +46,19 @@ class UniversalCompactionBuilder {
mutable_db_options_(mutable_db_options), mutable_db_options_(mutable_db_options),
vstorage_(vstorage), vstorage_(vstorage),
picker_(picker), picker_(picker),
log_buffer_(log_buffer) {} log_buffer_(log_buffer) {
assert(icmp_);
const auto* ucmp = icmp_->user_comparator();
assert(ucmp);
// These parameters are only passed when user-defined timestamp is not
// enabled.
if (ucmp->timestamp_size() == 0) {
earliest_snapshot_ = existing_snapshots.empty()
? kMaxSequenceNumber
: existing_snapshots.at(0);
snapshot_checker_ = snapshot_checker;
}
}
// Form and return the compaction object. The caller owns return object. // Form and return the compaction object. The caller owns return object.
Compaction* PickCompaction(); Compaction* PickCompaction();
@ -52,12 +66,15 @@ class UniversalCompactionBuilder {
private: private:
struct SortedRun { struct SortedRun {
SortedRun(int _level, FileMetaData* _file, uint64_t _size, SortedRun(int _level, FileMetaData* _file, uint64_t _size,
uint64_t _compensated_file_size, bool _being_compacted) uint64_t _compensated_file_size, bool _being_compacted,
bool _level_has_marked_standalone_rangedel)
: level(_level), : level(_level),
file(_file), file(_file),
size(_size), size(_size),
compensated_file_size(_compensated_file_size), compensated_file_size(_compensated_file_size),
being_compacted(_being_compacted) { being_compacted(_being_compacted),
level_has_marked_standalone_rangedel(
_level_has_marked_standalone_rangedel) {
assert(compensated_file_size > 0); assert(compensated_file_size > 0);
assert(level != 0 || file != nullptr); assert(level != 0 || file != nullptr);
} }
@ -79,6 +96,10 @@ class UniversalCompactionBuilder {
uint64_t size; uint64_t size;
uint64_t compensated_file_size; uint64_t compensated_file_size;
bool being_compacted; bool being_compacted;
// True if this level has any file that is a standalone range deletion file
// marked for compaction. Best effort is made to make only deletion
// triggered compaction pick this type of file.
bool level_has_marked_standalone_rangedel;
}; };
// Pick Universal compaction to limit read amplification // Pick Universal compaction to limit read amplification
@ -98,6 +119,11 @@ class UniversalCompactionBuilder {
Compaction* PickDeleteTriggeredCompaction(); Compaction* PickDeleteTriggeredCompaction();
// Returns true if this given file (that is marked be compaction) should be
// skipped from being picked for now. We do this to best use standalone range
// tombstone files.
bool ShouldSkipMarkedFile(const FileMetaData* file) const;
// Form a compaction from the sorted run indicated by start_index to the // Form a compaction from the sorted run indicated by start_index to the
// oldest sorted run. // oldest sorted run.
// The caller is responsible for making sure that those files are not in // The caller is responsible for making sure that those files are not in
@ -116,7 +142,7 @@ class UniversalCompactionBuilder {
bool ShouldSkipLastSortedRunForSizeAmpCompaction() const { bool ShouldSkipLastSortedRunForSizeAmpCompaction() const {
assert(!sorted_runs_.empty()); assert(!sorted_runs_.empty());
return ioptions_.preclude_last_level_data_seconds > 0 && return mutable_cf_options_.preclude_last_level_data_seconds > 0 &&
ioptions_.num_levels > 2 && ioptions_.num_levels > 2 &&
sorted_runs_.back().level == ioptions_.num_levels - 1 && sorted_runs_.back().level == ioptions_.num_levels - 1 &&
sorted_runs_.size() > 1; sorted_runs_.size() > 1;
@ -234,8 +260,18 @@ class UniversalCompactionBuilder {
VersionStorageInfo* vstorage_; VersionStorageInfo* vstorage_;
UniversalCompactionPicker* picker_; UniversalCompactionPicker* picker_;
LogBuffer* log_buffer_; LogBuffer* log_buffer_;
// Optional earliest snapshot at time of compaction picking. This is only
// provided if the column family doesn't enable user-defined timestamps.
// And this information is only passed to `Compaction` picked by deletion
// triggered compaction for possible optimizations.
std::optional<SequenceNumber> earliest_snapshot_;
const SnapshotChecker* snapshot_checker_;
// Mapping from file id to its index in the sorted run for the files that are
// marked for compaction. This is only populated when snapshot info is
// populated.
std::map<uint64_t, size_t> file_marked_for_compaction_to_sorted_run_index_;
static std::vector<UniversalCompactionBuilder::SortedRun> CalculateSortedRuns( std::vector<UniversalCompactionBuilder::SortedRun> CalculateSortedRuns(
const VersionStorageInfo& vstorage, int last_level, const VersionStorageInfo& vstorage, int last_level,
uint64_t* max_run_size); uint64_t* max_run_size);
@ -394,11 +430,13 @@ bool UniversalCompactionPicker::NeedsCompaction(
Compaction* UniversalCompactionPicker::PickCompaction( Compaction* UniversalCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) { LogBuffer* log_buffer) {
UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name, UniversalCompactionBuilder builder(
mutable_cf_options, mutable_db_options, ioptions_, icmp_, cf_name, mutable_cf_options, mutable_db_options,
vstorage, this, log_buffer); existing_snapshots, snapshot_checker, vstorage, this, log_buffer);
return builder.PickCompaction(); return builder.PickCompaction();
} }
@ -448,14 +486,20 @@ UniversalCompactionBuilder::CalculateSortedRuns(
*max_run_size = 0; *max_run_size = 0;
std::vector<UniversalCompactionBuilder::SortedRun> ret; std::vector<UniversalCompactionBuilder::SortedRun> ret;
for (FileMetaData* f : vstorage.LevelFiles(0)) { for (FileMetaData* f : vstorage.LevelFiles(0)) {
ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size, if (earliest_snapshot_.has_value() && f->marked_for_compaction) {
f->being_compacted); file_marked_for_compaction_to_sorted_run_index_.emplace(f->fd.GetNumber(),
ret.size());
}
ret.emplace_back(
0, f, f->fd.GetFileSize(), f->compensated_file_size, f->being_compacted,
f->marked_for_compaction && f->FileIsStandAloneRangeTombstone());
*max_run_size = std::max(*max_run_size, f->fd.GetFileSize()); *max_run_size = std::max(*max_run_size, f->fd.GetFileSize());
} }
for (int level = 1; level <= last_level; level++) { for (int level = 1; level <= last_level; level++) {
uint64_t total_compensated_size = 0U; uint64_t total_compensated_size = 0U;
uint64_t total_size = 0U; uint64_t total_size = 0U;
bool being_compacted = false; bool being_compacted = false;
bool level_has_marked_standalone_rangedel = false;
for (FileMetaData* f : vstorage.LevelFiles(level)) { for (FileMetaData* f : vstorage.LevelFiles(level)) {
total_compensated_size += f->compensated_file_size; total_compensated_size += f->compensated_file_size;
total_size += f->fd.GetFileSize(); total_size += f->fd.GetFileSize();
@ -467,16 +511,57 @@ UniversalCompactionBuilder::CalculateSortedRuns(
if (f->being_compacted) { if (f->being_compacted) {
being_compacted = f->being_compacted; being_compacted = f->being_compacted;
} }
level_has_marked_standalone_rangedel =
level_has_marked_standalone_rangedel ||
(f->marked_for_compaction && f->FileIsStandAloneRangeTombstone());
if (earliest_snapshot_.has_value() && f->marked_for_compaction) {
file_marked_for_compaction_to_sorted_run_index_.emplace(
f->fd.GetNumber(), ret.size());
}
} }
if (total_compensated_size > 0) { if (total_compensated_size > 0) {
ret.emplace_back(level, nullptr, total_size, total_compensated_size, ret.emplace_back(level, nullptr, total_size, total_compensated_size,
being_compacted); being_compacted, level_has_marked_standalone_rangedel);
} }
*max_run_size = std::max(*max_run_size, total_size); *max_run_size = std::max(*max_run_size, total_size);
} }
return ret; return ret;
} }
bool UniversalCompactionBuilder::ShouldSkipMarkedFile(
const FileMetaData* file) const {
assert(file->marked_for_compaction);
if (!earliest_snapshot_.has_value()) {
return false;
}
if (!file->FileIsStandAloneRangeTombstone()) {
return false;
}
// Skip until earliest snapshot advances at or above this standalone range
// tombstone file. `DB::ReleaseSnapshot` will re-examine and schedule
// compaction for it.
if (!DataIsDefinitelyInSnapshot(file->fd.largest_seqno,
earliest_snapshot_.value(),
snapshot_checker_)) {
return true;
}
auto iter = file_marked_for_compaction_to_sorted_run_index_.find(
file->fd.GetNumber());
assert(iter != file_marked_for_compaction_to_sorted_run_index_.end());
size_t idx = iter->second;
const SortedRun* succeeding_sorted_run =
idx < sorted_runs_.size() - 1 ? &sorted_runs_[idx + 1] : nullptr;
// Marked standalone range tombstone file is best used if it's in the start
// input level. Skip to let that compaction happen first.
if (succeeding_sorted_run &&
succeeding_sorted_run->level_has_marked_standalone_rangedel) {
return true;
}
return false;
}
// Universal style of compaction. Pick files that are contiguous in // Universal style of compaction. Pick files that are contiguous in
// time-range to compact. // time-range to compact.
Compaction* UniversalCompactionBuilder::PickCompaction() { Compaction* UniversalCompactionBuilder::PickCompaction() {
@ -580,7 +665,8 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
// Get the total number of sorted runs that are not being compacted // Get the total number of sorted runs that are not being compacted
int num_sr_not_compacted = 0; int num_sr_not_compacted = 0;
for (size_t i = 0; i < sorted_runs_.size(); i++) { for (size_t i = 0; i < sorted_runs_.size(); i++) {
if (sorted_runs_[i].being_compacted == false) { if (sorted_runs_[i].being_compacted == false &&
!sorted_runs_[i].level_has_marked_standalone_rangedel) {
num_sr_not_compacted++; num_sr_not_compacted++;
} }
} }
@ -743,16 +829,24 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
for (sr = nullptr; loop < sorted_runs_.size(); loop++) { for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
sr = &sorted_runs_[loop]; sr = &sorted_runs_[loop];
if (!sr->being_compacted) { if (!sr->being_compacted && !sr->level_has_marked_standalone_rangedel) {
candidate_count = 1; candidate_count = 1;
break; break;
} }
char file_num_buf[kFormatFileNumberBufSize]; char file_num_buf[kFormatFileNumberBufSize];
sr->Dump(file_num_buf, sizeof(file_num_buf)); sr->Dump(file_num_buf, sizeof(file_num_buf));
ROCKS_LOG_BUFFER(log_buffer_, if (sr->being_compacted) {
"[%s] Universal: %s" ROCKS_LOG_BUFFER(log_buffer_,
"[%d] being compacted, skipping", "[%s] Universal: %s"
cf_name_.c_str(), file_num_buf, loop); "[%d] being compacted, skipping",
cf_name_.c_str(), file_num_buf, loop);
} else if (sr->level_has_marked_standalone_rangedel) {
ROCKS_LOG_BUFFER(log_buffer_,
"[%s] Universal: %s"
"[%d] has standalone range tombstone files marked for "
"compaction, skipping",
cf_name_.c_str(), file_num_buf, loop);
}
sr = nullptr; sr = nullptr;
} }
@ -773,7 +867,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
candidate_count < max_files_to_compact && i < sorted_runs_.size(); candidate_count < max_files_to_compact && i < sorted_runs_.size();
i++) { i++) {
const SortedRun* succeeding_sr = &sorted_runs_[i]; const SortedRun* succeeding_sr = &sorted_runs_[i];
if (succeeding_sr->being_compacted) { if (succeeding_sr->being_compacted ||
succeeding_sr->level_has_marked_standalone_rangedel) {
break; break;
} }
// Pick files if the total/last candidate file size (increased by the // Pick files if the total/last candidate file size (increased by the
@ -899,11 +994,11 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level); grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level);
} }
if (output_level != 0 && if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
picker_->FilesRangeOverlapWithCompaction( inputs, output_level,
inputs, output_level, Compaction::EvaluatePenultimateLevel(
Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_, vstorage_, mutable_cf_options_, ioptions_,
start_level, output_level))) { start_level, output_level))) {
return nullptr; return nullptr;
} }
CompactionReason compaction_reason; CompactionReason compaction_reason;
@ -923,6 +1018,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
output_level, enable_compression), output_level, enable_compression),
mutable_cf_options_.default_write_temperature, mutable_cf_options_.default_write_temperature,
/* max_subcompactions */ 0, grandparents, /* max_subcompactions */ 0, grandparents,
/* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false, /* trim_ts */ "", score_, /* is manual */ false, /* trim_ts */ "", score_,
false /* deletion_compaction */, false /* deletion_compaction */,
/* l0_files_might_overlap */ true, compaction_reason); /* l0_files_might_overlap */ true, compaction_reason);
@ -939,7 +1036,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
const size_t end_index = ShouldSkipLastSortedRunForSizeAmpCompaction() const size_t end_index = ShouldSkipLastSortedRunForSizeAmpCompaction()
? sorted_runs_.size() - 2 ? sorted_runs_.size() - 2
: sorted_runs_.size() - 1; : sorted_runs_.size() - 1;
if (sorted_runs_[end_index].being_compacted) { if (sorted_runs_[end_index].being_compacted ||
sorted_runs_[end_index].level_has_marked_standalone_rangedel) {
return nullptr; return nullptr;
} }
const uint64_t base_sr_size = sorted_runs_[end_index].size; const uint64_t base_sr_size = sorted_runs_[end_index].size;
@ -950,14 +1048,23 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
// Get longest span (i.e, [start_index, end_index]) of available sorted runs // Get longest span (i.e, [start_index, end_index]) of available sorted runs
while (start_index > 0) { while (start_index > 0) {
const SortedRun* sr = &sorted_runs_[start_index - 1]; const SortedRun* sr = &sorted_runs_[start_index - 1];
if (sr->being_compacted) { if (sr->being_compacted || sr->level_has_marked_standalone_rangedel) {
char file_num_buf[kFormatFileNumberBufSize]; char file_num_buf[kFormatFileNumberBufSize];
sr->Dump(file_num_buf, sizeof(file_num_buf), true); sr->Dump(file_num_buf, sizeof(file_num_buf), true);
ROCKS_LOG_BUFFER( if (sr->being_compacted) {
log_buffer_, ROCKS_LOG_BUFFER(
"[%s] Universal: stopping at sorted run undergoing compaction: " log_buffer_,
"%s[%" ROCKSDB_PRIszt "]", "[%s] Universal: stopping at sorted run undergoing compaction: "
cf_name_.c_str(), file_num_buf, start_index - 1); "%s[%" ROCKSDB_PRIszt "]",
cf_name_.c_str(), file_num_buf, start_index - 1);
} else if (sr->level_has_marked_standalone_rangedel) {
ROCKS_LOG_BUFFER(
log_buffer_,
"[%s] Universal: stopping at sorted run that has standalone range "
"tombstone files marked for compaction: "
"%s[%" ROCKSDB_PRIszt "]",
cf_name_.c_str(), file_num_buf, start_index - 1);
}
break; break;
} }
candidate_size += sr->compensated_file_size; candidate_size += sr->compensated_file_size;
@ -1236,11 +1343,11 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
} }
// intra L0 compactions outputs could have overlap // intra L0 compactions outputs could have overlap
if (output_level != 0 && if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
picker_->FilesRangeOverlapWithCompaction( inputs, output_level,
inputs, output_level, Compaction::EvaluatePenultimateLevel(
Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_, vstorage_, mutable_cf_options_, ioptions_,
start_level, output_level))) { start_level, output_level))) {
return nullptr; return nullptr;
} }
@ -1257,7 +1364,10 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
true /* enable_compression */), true /* enable_compression */),
mutable_cf_options_.default_write_temperature, mutable_cf_options_.default_write_temperature,
/* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, /* max_subcompactions */ 0, /* grandparents */ {},
/* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false,
/* trim_ts */ "", score_, false /* deletion_compaction */, /* trim_ts */ "", score_, false /* deletion_compaction */,
/* l0_files_might_overlap */ true, /* l0_files_might_overlap */ true,
CompactionReason::kUniversalSizeAmplification); CompactionReason::kUniversalSizeAmplification);
@ -1288,7 +1398,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
continue; continue;
} }
FileMetaData* f = vstorage_->LevelFiles(0)[loop]; FileMetaData* f = vstorage_->LevelFiles(0)[loop];
if (f->marked_for_compaction) { if (f->marked_for_compaction && !ShouldSkipMarkedFile(f)) {
start_level_inputs.files.push_back(f); start_level_inputs.files.push_back(f);
start_index = start_index =
static_cast<int>(loop); // Consider this as the first candidate. static_cast<int>(loop); // Consider this as the first candidate.
@ -1302,7 +1412,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) { for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) {
SortedRun* sr = &sorted_runs_[loop]; SortedRun* sr = &sorted_runs_[loop];
if (sr->being_compacted) { if (sr->being_compacted || sr->level_has_marked_standalone_rangedel) {
break; break;
} }
@ -1321,7 +1431,10 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
// leveled. We pick one of the files marked for compaction and compact with // leveled. We pick one of the files marked for compaction and compact with
// overlapping files in the adjacent level. // overlapping files in the adjacent level.
picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level, picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
&output_level, &start_level_inputs); &output_level, &start_level_inputs,
[this](const FileMetaData* file) {
return ShouldSkipMarkedFile(file);
});
if (start_level_inputs.empty()) { if (start_level_inputs.empty()) {
return nullptr; return nullptr;
} }
@ -1374,7 +1487,8 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
if (picker_->FilesRangeOverlapWithCompaction( if (picker_->FilesRangeOverlapWithCompaction(
inputs, output_level, inputs, output_level,
Compaction::EvaluatePenultimateLevel( Compaction::EvaluatePenultimateLevel(
vstorage_, ioptions_, start_level, output_level))) { vstorage_, mutable_cf_options_, ioptions_, start_level,
output_level))) {
return nullptr; return nullptr;
} }
@ -1401,7 +1515,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1), GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1),
GetCompressionOptions(mutable_cf_options_, vstorage_, output_level), GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
mutable_cf_options_.default_write_temperature, mutable_cf_options_.default_write_temperature,
/* max_subcompactions */ 0, grandparents, /* is manual */ false, /* max_subcompactions */ 0, grandparents, earliest_snapshot_,
snapshot_checker_,
/* is manual */ false,
/* trim_ts */ "", score_, false /* deletion_compaction */, /* trim_ts */ "", score_, false /* deletion_compaction */,
/* l0_files_might_overlap */ true, /* l0_files_might_overlap */ true,
CompactionReason::kFilesMarkedForCompaction); CompactionReason::kFilesMarkedForCompaction);
@ -1472,11 +1588,11 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
} }
// intra L0 compactions outputs could have overlap // intra L0 compactions outputs could have overlap
if (output_level != 0 && if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
picker_->FilesRangeOverlapWithCompaction( inputs, output_level,
inputs, output_level, Compaction::EvaluatePenultimateLevel(
Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_, vstorage_, mutable_cf_options_, ioptions_,
start_level, output_level))) { start_level, output_level))) {
return nullptr; return nullptr;
} }
@ -1494,7 +1610,10 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
GetCompressionOptions(mutable_cf_options_, vstorage_, output_level, GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
true /* enable_compression */), true /* enable_compression */),
mutable_cf_options_.default_write_temperature, mutable_cf_options_.default_write_temperature,
/* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, /* max_subcompactions */ 0, /* grandparents */ {},
/* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false,
/* trim_ts */ "", score_, false /* deletion_compaction */, /* trim_ts */ "", score_, false /* deletion_compaction */,
/* l0_files_might_overlap */ true, compaction_reason); /* l0_files_might_overlap */ true, compaction_reason);
} }
@ -1515,7 +1634,8 @@ Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
// included in the compaction. // included in the compaction.
size_t start_index = sorted_runs_.size(); size_t start_index = sorted_runs_.size();
while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) { while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted &&
!sorted_runs_[start_index - 1].level_has_marked_standalone_rangedel) {
start_index--; start_index--;
} }
if (start_index == sorted_runs_.size()) { if (start_index == sorted_runs_.size()) {

View File

@ -10,6 +10,7 @@
#pragma once #pragma once
#include "db/compaction/compaction_picker.h" #include "db/compaction/compaction_picker.h"
#include "db/snapshot_checker.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
class UniversalCompactionPicker : public CompactionPicker { class UniversalCompactionPicker : public CompactionPicker {
@ -17,11 +18,12 @@ class UniversalCompactionPicker : public CompactionPicker {
UniversalCompactionPicker(const ImmutableOptions& ioptions, UniversalCompactionPicker(const ImmutableOptions& ioptions,
const InternalKeyComparator* icmp) const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {} : CompactionPicker(ioptions, icmp) {}
Compaction* PickCompaction(const std::string& cf_name, Compaction* PickCompaction(
const MutableCFOptions& mutable_cf_options, const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, const MutableDBOptions& mutable_db_options,
VersionStorageInfo* vstorage, const std::vector<SequenceNumber>& existing_snapshots,
LogBuffer* log_buffer) override; const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) override;
int MaxOutputLevel() const override { return NumberLevels() - 1; } int MaxOutputLevel() const override { return NumberLevels() - 1; }
bool NeedsCompaction(const VersionStorageInfo* vstorage) const override; bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;

View File

@ -39,12 +39,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
MakeTableFileName(file->fd.GetNumber())); MakeTableFileName(file->fd.GetNumber()));
} }
} }
compaction_input.column_family.name =
compaction->column_family_data()->GetName(); compaction_input.cf_name = compaction->column_family_data()->GetName();
compaction_input.column_family.options =
compaction->column_family_data()->GetLatestCFOptions();
compaction_input.db_options =
BuildDBOptions(db_options_, mutable_db_options_copy_);
compaction_input.snapshots = existing_snapshots_; compaction_input.snapshots = existing_snapshots_;
compaction_input.has_begin = sub_compact->start.has_value(); compaction_input.has_begin = sub_compact->start.has_value();
compaction_input.begin = compaction_input.begin =
@ -52,6 +48,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
compaction_input.has_end = sub_compact->end.has_value(); compaction_input.has_end = sub_compact->end.has_value();
compaction_input.end = compaction_input.end =
compaction_input.has_end ? sub_compact->end->ToString() : ""; compaction_input.has_end ? sub_compact->end->ToString() : "";
compaction_input.options_file_number =
sub_compact->compaction->input_version()
->version_set()
->options_file_number();
TEST_SYNC_POINT_CALLBACK(
"CompactionServiceJob::ProcessKeyValueCompactionWithCompactionService",
&compaction_input);
std::string compaction_input_binary; std::string compaction_input_binary;
Status s = compaction_input.Write(&compaction_input_binary); Status s = compaction_input.Write(&compaction_input_binary);
@ -70,10 +74,13 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
ROCKS_LOG_INFO( ROCKS_LOG_INFO(
db_options_.info_log, db_options_.info_log,
"[%s] [JOB %d] Starting remote compaction (output level: %d): %s", "[%s] [JOB %d] Starting remote compaction (output level: %d): %s",
compaction_input.column_family.name.c_str(), job_id_, compaction->column_family_data()->GetName().c_str(), job_id_,
compaction_input.output_level, input_files_oss.str().c_str()); compaction_input.output_level, input_files_oss.str().c_str());
CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, CompactionServiceJobInfo info(
GetCompactionId(sub_compact), thread_pri_); dbname_, db_id_, db_session_id_, GetCompactionId(sub_compact),
thread_pri_, compaction->compaction_reason(),
compaction->is_full_compaction(), compaction->is_manual_compaction(),
compaction->bottommost_level());
CompactionServiceScheduleResponse response = CompactionServiceScheduleResponse response =
db_options_.compaction_service->Schedule(info, compaction_input_binary); db_options_.compaction_service->Schedule(info, compaction_input_binary);
switch (response.status) { switch (response.status) {
@ -84,13 +91,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
"CompactionService failed to schedule a remote compaction job."); "CompactionService failed to schedule a remote compaction job.");
ROCKS_LOG_WARN(db_options_.info_log, ROCKS_LOG_WARN(db_options_.info_log,
"[%s] [JOB %d] Remote compaction failed to start.", "[%s] [JOB %d] Remote compaction failed to start.",
compaction_input.column_family.name.c_str(), job_id_); compaction->column_family_data()->GetName().c_str(),
job_id_);
return response.status; return response.status;
case CompactionServiceJobStatus::kUseLocal: case CompactionServiceJobStatus::kUseLocal:
ROCKS_LOG_INFO( ROCKS_LOG_INFO(
db_options_.info_log, db_options_.info_log,
"[%s] [JOB %d] Remote compaction fallback to local by API (Schedule)", "[%s] [JOB %d] Remote compaction fallback to local by API (Schedule)",
compaction_input.column_family.name.c_str(), job_id_); compaction->column_family_data()->GetName().c_str(), job_id_);
return response.status; return response.status;
default: default:
assert(false); // unknown status assert(false); // unknown status
@ -99,7 +107,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
ROCKS_LOG_INFO(db_options_.info_log, ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Waiting for remote compaction...", "[%s] [JOB %d] Waiting for remote compaction...",
compaction_input.column_family.name.c_str(), job_id_); compaction->column_family_data()->GetName().c_str(), job_id_);
std::string compaction_result_binary; std::string compaction_result_binary;
CompactionServiceJobStatus compaction_status = CompactionServiceJobStatus compaction_status =
db_options_.compaction_service->Wait(response.scheduled_job_id, db_options_.compaction_service->Wait(response.scheduled_job_id,
@ -109,7 +117,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
ROCKS_LOG_INFO( ROCKS_LOG_INFO(
db_options_.info_log, db_options_.info_log,
"[%s] [JOB %d] Remote compaction fallback to local by API (Wait)", "[%s] [JOB %d] Remote compaction fallback to local by API (Wait)",
compaction_input.column_family.name.c_str(), job_id_); compaction->column_family_data()->GetName().c_str(), job_id_);
return compaction_status; return compaction_status;
} }
@ -134,15 +142,19 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
"result is returned)."); "result is returned).");
compaction_result.status.PermitUncheckedError(); compaction_result.status.PermitUncheckedError();
} }
ROCKS_LOG_WARN(db_options_.info_log, ROCKS_LOG_WARN(
"[%s] [JOB %d] Remote compaction failed.", db_options_.info_log, "[%s] [JOB %d] Remote compaction failed.",
compaction_input.column_family.name.c_str(), job_id_); compaction->column_family_data()->GetName().c_str(), job_id_);
return compaction_status; return compaction_status;
} }
// CompactionServiceJobStatus::kSuccess was returned, but somehow we failed to
// read the result. Consider this as an installation failure
if (!s.ok()) { if (!s.ok()) {
sub_compact->status = s; sub_compact->status = s;
compaction_result.status.PermitUncheckedError(); compaction_result.status.PermitUncheckedError();
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
return CompactionServiceJobStatus::kFailure; return CompactionServiceJobStatus::kFailure;
} }
sub_compact->status = compaction_result.status; sub_compact->status = compaction_result.status;
@ -154,18 +166,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
is_first_one = false; is_first_one = false;
} }
ROCKS_LOG_INFO(db_options_.info_log, ROCKS_LOG_INFO(
"[%s] [JOB %d] Receive remote compaction result, output path: " db_options_.info_log,
"%s, files: %s", "[%s] [JOB %d] Received remote compaction result, output path: "
compaction_input.column_family.name.c_str(), job_id_, "%s, files: %s",
compaction_result.output_path.c_str(), compaction->column_family_data()->GetName().c_str(), job_id_,
output_files_oss.str().c_str()); compaction_result.output_path.c_str(), output_files_oss.str().c_str());
if (!s.ok()) {
sub_compact->status = s;
return CompactionServiceJobStatus::kFailure;
}
// Installation Starts
for (const auto& file : compaction_result.output_files) { for (const auto& file : compaction_result.output_files) {
uint64_t file_num = versions_->NewFileNumber(); uint64_t file_num = versions_->NewFileNumber();
auto src_file = compaction_result.output_path + "/" + file.file_name; auto src_file = compaction_result.output_path + "/" + file.file_name;
@ -174,6 +182,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr); s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
if (!s.ok()) { if (!s.ok()) {
sub_compact->status = s; sub_compact->status = s;
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
return CompactionServiceJobStatus::kFailure; return CompactionServiceJobStatus::kFailure;
} }
@ -182,6 +192,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
if (!s.ok()) { if (!s.ok()) {
sub_compact->status = s; sub_compact->status = s;
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
return CompactionServiceJobStatus::kFailure; return CompactionServiceJobStatus::kFailure;
} }
meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
@ -191,6 +203,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
meta.oldest_ancester_time = file.oldest_ancester_time; meta.oldest_ancester_time = file.oldest_ancester_time;
meta.file_creation_time = file.file_creation_time; meta.file_creation_time = file.file_creation_time;
meta.epoch_number = file.epoch_number; meta.epoch_number = file.epoch_number;
meta.file_checksum = file.file_checksum;
meta.file_checksum_func_name = file.file_checksum_func_name;
meta.marked_for_compaction = file.marked_for_compaction; meta.marked_for_compaction = file.marked_for_compaction;
meta.unique_id = file.unique_id; meta.unique_id = file.unique_id;
@ -198,14 +212,19 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
sub_compact->Current().AddOutput(std::move(meta), sub_compact->Current().AddOutput(std::move(meta),
cfd->internal_comparator(), false, true, cfd->internal_comparator(), false, true,
file.paranoid_hash); file.paranoid_hash);
sub_compact->Current().UpdateTableProperties(file.table_properties);
} }
sub_compact->compaction_job_stats = compaction_result.stats; sub_compact->compaction_job_stats = compaction_result.stats;
sub_compact->Current().SetNumOutputRecords( sub_compact->Current().SetNumOutputRecords(
compaction_result.num_output_records); compaction_result.stats.num_output_records);
sub_compact->Current().SetTotalBytes(compaction_result.total_bytes); sub_compact->Current().SetNumOutputFiles(
compaction_result.stats.num_output_files);
sub_compact->Current().AddBytesWritten(compaction_result.bytes_written);
RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read); RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES, RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
compaction_result.bytes_written); compaction_result.bytes_written);
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kSuccess);
return CompactionServiceJobStatus::kSuccess; return CompactionServiceJobStatus::kSuccess;
} }
@ -220,6 +239,18 @@ void CompactionServiceCompactionJob::RecordCompactionIOStats() {
CompactionJob::RecordCompactionIOStats(); CompactionJob::RecordCompactionIOStats();
} }
void CompactionServiceCompactionJob::UpdateCompactionJobStats(
const InternalStats::CompactionStats& stats) const {
compaction_job_stats_->elapsed_micros = stats.micros;
// output information only in remote compaction
compaction_job_stats_->total_output_bytes = stats.bytes_written;
compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
compaction_job_stats_->num_output_records = stats.num_output_records;
compaction_job_stats_->num_output_files = stats.num_output_files;
compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
}
CompactionServiceCompactionJob::CompactionServiceCompactionJob( CompactionServiceCompactionJob::CompactionServiceCompactionJob(
int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
const MutableDBOptions& mutable_db_options, const FileOptions& file_options, const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
@ -255,11 +286,11 @@ Status CompactionServiceCompactionJob::Run() {
auto* c = compact_->compaction; auto* c = compact_->compaction;
assert(c->column_family_data() != nullptr); assert(c->column_family_data() != nullptr);
assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( const VersionStorageInfo* storage_info = c->input_version()->storage_info();
compact_->compaction->level()) > 0); assert(storage_info);
assert(storage_info->NumLevelFiles(compact_->compaction->level()) > 0);
write_hint_ = storage_info->CalculateSSTWriteHint(c->output_level());
write_hint_ =
c->column_family_data()->CalculateSSTWriteHint(c->output_level());
bottommost_level_ = c->bottommost_level(); bottommost_level_ = c->bottommost_level();
Slice begin = compaction_input_.begin; Slice begin = compaction_input_.begin;
@ -274,6 +305,9 @@ Status CompactionServiceCompactionJob::Run() {
log_buffer_->FlushBufferToLog(); log_buffer_->FlushBufferToLog();
LogCompaction(); LogCompaction();
compaction_result_->stats.Reset();
const uint64_t start_micros = db_options_.clock->NowMicros(); const uint64_t start_micros = db_options_.clock->NowMicros();
c->GetOrInitInputTableProperties(); c->GetOrInitInputTableProperties();
@ -314,20 +348,32 @@ Status CompactionServiceCompactionJob::Run() {
if (status.ok()) { if (status.ok()) {
status = io_s; status = io_s;
} }
if (status.ok()) {
// TODO: Add verify_table()
}
// Finish up all book-keeping to unify the subcompaction results
compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
UpdateCompactionStats();
RecordCompactionIOStats();
LogFlush(db_options_.info_log); LogFlush(db_options_.info_log);
compact_->status = status; compact_->status = status;
compact_->status.PermitUncheckedError(); compact_->status.PermitUncheckedError();
// Build compaction result // Build Compaction Job Stats
// 1. Aggregate CompactionOutputStats into Internal Compaction Stats
// (compaction_stats_) and aggregate Compaction Job Stats
// (compaction_job_stats_) from the sub compactions
compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
// 2. Update the Output information in the Compaction Job Stats with
// aggregated Internal Compaction Stats.
UpdateCompactionJobStats(compaction_stats_.stats);
// 3. Set fields that are not propagated as part of aggregations above
compaction_result_->stats.is_manual_compaction = c->is_manual_compaction();
compaction_result_->stats.is_full_compaction = c->is_full_compaction();
compaction_result_->stats.is_remote_compaction = true;
// 4. Update IO Stats that are not part of the aggregations above (bytes_read,
// bytes_written)
RecordCompactionIOStats();
// Build Output
compaction_result_->output_level = compact_->compaction->output_level(); compaction_result_->output_level = compact_->compaction->output_level();
compaction_result_->output_path = output_path_; compaction_result_->output_path = output_path_;
for (const auto& output_file : sub_compact->GetOutputs()) { for (const auto& output_file : sub_compact->GetOutputs()) {
@ -336,16 +382,14 @@ Status CompactionServiceCompactionJob::Run() {
MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
meta.fd.largest_seqno, meta.smallest.Encode().ToString(), meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
meta.largest.Encode().ToString(), meta.oldest_ancester_time, meta.largest.Encode().ToString(), meta.oldest_ancester_time,
meta.file_creation_time, meta.epoch_number, meta.file_creation_time, meta.epoch_number, meta.file_checksum,
output_file.validator.GetHash(), meta.marked_for_compaction, meta.file_checksum_func_name, output_file.validator.GetHash(),
meta.unique_id); meta.marked_for_compaction, meta.unique_id,
*output_file.table_properties);
} }
InternalStats::CompactionStatsFull compaction_stats;
sub_compact->AggregateCompactionStats(compaction_stats);
compaction_result_->num_output_records =
compaction_stats.stats.num_output_records;
compaction_result_->total_bytes = compaction_stats.TotalBytesWritten();
TEST_SYNC_POINT_CALLBACK("CompactionServiceCompactionJob::Run:0",
&compaction_result_);
return status; return status;
} }
@ -398,42 +442,9 @@ static std::unordered_map<std::string, OptionTypeInfo> cfd_type_info = {
}; };
static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = { static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
{"column_family", {"cf_name",
OptionTypeInfo::Struct( {offsetof(struct CompactionServiceInput, cf_name),
"column_family", &cfd_type_info, OptionType::kEncodedString}},
offsetof(struct CompactionServiceInput, column_family),
OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
{"db_options",
{offsetof(struct CompactionServiceInput, db_options),
OptionType::kConfigurable, OptionVerificationType::kNormal,
OptionTypeFlags::kNone,
[](const ConfigOptions& opts, const std::string& /*name*/,
const std::string& value, void* addr) {
auto options = static_cast<DBOptions*>(addr);
return GetDBOptionsFromString(opts, DBOptions(), value, options);
},
[](const ConfigOptions& opts, const std::string& /*name*/,
const void* addr, std::string* value) {
const auto options = static_cast<const DBOptions*>(addr);
std::string result;
auto status = GetStringFromDBOptions(opts, *options, &result);
*value = "{" + result + "}";
return status;
},
[](const ConfigOptions& opts, const std::string& name, const void* addr1,
const void* addr2, std::string* mismatch) {
const auto this_one = static_cast<const DBOptions*>(addr1);
const auto that_one = static_cast<const DBOptions*>(addr2);
auto this_conf = DBOptionsAsConfigurable(*this_one);
auto that_conf = DBOptionsAsConfigurable(*that_one);
std::string mismatch_opt;
bool result =
this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
if (!result) {
*mismatch = name + "." + mismatch_opt;
}
return result;
}}},
{"snapshots", OptionTypeInfo::Vector<uint64_t>( {"snapshots", OptionTypeInfo::Vector<uint64_t>(
offsetof(struct CompactionServiceInput, snapshots), offsetof(struct CompactionServiceInput, snapshots),
OptionVerificationType::kNormal, OptionTypeFlags::kNone, OptionVerificationType::kNormal, OptionTypeFlags::kNone,
@ -461,6 +472,10 @@ static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
{"end", {"end",
{offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString, {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString,
OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
{"options_file_number",
{offsetof(struct CompactionServiceInput, options_file_number),
OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
}; };
static std::unordered_map<std::string, OptionTypeInfo> static std::unordered_map<std::string, OptionTypeInfo>
@ -497,6 +512,14 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct CompactionServiceOutputFile, epoch_number), {offsetof(struct CompactionServiceOutputFile, epoch_number),
OptionType::kUInt64T, OptionVerificationType::kNormal, OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}}, OptionTypeFlags::kNone}},
{"file_checksum",
{offsetof(struct CompactionServiceOutputFile, file_checksum),
OptionType::kEncodedString, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"file_checksum_func_name",
{offsetof(struct CompactionServiceOutputFile, file_checksum_func_name),
OptionType::kEncodedString, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"paranoid_hash", {"paranoid_hash",
{offsetof(struct CompactionServiceOutputFile, paranoid_hash), {offsetof(struct CompactionServiceOutputFile, paranoid_hash),
OptionType::kUInt64T, OptionVerificationType::kNormal, OptionType::kUInt64T, OptionVerificationType::kNormal,
@ -510,7 +533,30 @@ static std::unordered_map<std::string, OptionTypeInfo>
offsetof(struct CompactionServiceOutputFile, unique_id), offsetof(struct CompactionServiceOutputFile, unique_id),
OptionVerificationType::kNormal, OptionTypeFlags::kNone, OptionVerificationType::kNormal, OptionTypeFlags::kNone,
{0, OptionType::kUInt64T})}, {0, OptionType::kUInt64T})},
}; {"table_properties",
{offsetof(struct CompactionServiceOutputFile, table_properties),
OptionType::kStruct, OptionVerificationType::kNormal,
OptionTypeFlags::kNone,
[](const ConfigOptions& opts, const std::string& /*name*/,
const std::string& value, void* addr) {
auto table_properties = static_cast<TableProperties*>(addr);
return TableProperties::Parse(opts, value, table_properties);
},
[](const ConfigOptions& opts, const std::string& /*name*/,
const void* addr, std::string* value) {
const auto table_properties =
static_cast<const TableProperties*>(addr);
std::string result;
auto status = table_properties->Serialize(opts, &result);
*value = "{" + result + "}";
return status;
},
[](const ConfigOptions& opts, const std::string& /*name*/,
const void* addr1, const void* addr2, std::string* mismatch) {
const auto this_one = static_cast<const TableProperties*>(addr1);
const auto that_one = static_cast<const TableProperties*>(addr2);
return this_one->AreEqual(opts, that_one, mismatch);
}}}};
static std::unordered_map<std::string, OptionTypeInfo> static std::unordered_map<std::string, OptionTypeInfo>
compaction_job_stats_type_info = { compaction_job_stats_type_info = {
@ -557,6 +603,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct CompactionJobStats, is_manual_compaction), {offsetof(struct CompactionJobStats, is_manual_compaction),
OptionType::kBoolean, OptionVerificationType::kNormal, OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}}, OptionTypeFlags::kNone}},
{"is_remote_compaction",
{offsetof(struct CompactionJobStats, is_remote_compaction),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"total_input_bytes", {"total_input_bytes",
{offsetof(struct CompactionJobStats, total_input_bytes), {offsetof(struct CompactionJobStats, total_input_bytes),
OptionType::kUInt64T, OptionVerificationType::kNormal, OptionType::kUInt64T, OptionVerificationType::kNormal,
@ -725,14 +775,6 @@ static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
{offsetof(struct CompactionServiceResult, output_path), {offsetof(struct CompactionServiceResult, output_path),
OptionType::kEncodedString, OptionVerificationType::kNormal, OptionType::kEncodedString, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}}, OptionTypeFlags::kNone}},
{"num_output_records",
{offsetof(struct CompactionServiceResult, num_output_records),
OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"total_bytes",
{offsetof(struct CompactionServiceResult, total_bytes),
OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"bytes_read", {"bytes_read",
{offsetof(struct CompactionServiceResult, bytes_read), {offsetof(struct CompactionServiceResult, bytes_read),
OptionType::kUInt64T, OptionVerificationType::kNormal, OptionType::kUInt64T, OptionVerificationType::kNormal,

View File

@ -3,9 +3,9 @@
// COPYING file in the root directory) and Apache 2.0 License // COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory). // (found in the LICENSE.Apache file in the root directory).
#include "db/db_test_util.h" #include "db/db_test_util.h"
#include "port/stack_trace.h" #include "port/stack_trace.h"
#include "rocksdb/utilities/options_util.h"
#include "table/unique_id_impl.h" #include "table/unique_id_impl.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
@ -21,8 +21,10 @@ class MyTestCompactionService : public CompactionService {
: db_path_(std::move(db_path)), : db_path_(std::move(db_path)),
options_(options), options_(options),
statistics_(statistics), statistics_(statistics),
start_info_("na", "na", "na", 0, Env::TOTAL), start_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown,
wait_info_("na", "na", "na", 0, Env::TOTAL), false, false, false),
wait_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown,
false, false, false),
listeners_(listeners), listeners_(listeners),
table_properties_collector_factories_( table_properties_collector_factories_(
std::move(table_properties_collector_factories)) {} std::move(table_properties_collector_factories)) {}
@ -97,8 +99,12 @@ class MyTestCompactionService : public CompactionService {
Status s = Status s =
DB::OpenAndCompact(options, db_path_, db_path_ + "/" + scheduled_job_id, DB::OpenAndCompact(options, db_path_, db_path_ + "/" + scheduled_job_id,
compaction_input, result, options_override); compaction_input, result, options_override);
if (is_override_wait_result_) { {
*result = override_wait_result_; InstrumentedMutexLock l(&mutex_);
if (is_override_wait_result_) {
*result = override_wait_result_;
}
result_ = *result;
} }
compaction_num_.fetch_add(1); compaction_num_.fetch_add(1);
if (s.ok()) { if (s.ok()) {
@ -108,6 +114,11 @@ class MyTestCompactionService : public CompactionService {
} }
} }
void OnInstallation(const std::string& /*scheduled_job_id*/,
CompactionServiceJobStatus status) override {
final_updated_status_ = status;
}
int GetCompactionNum() { return compaction_num_.load(); } int GetCompactionNum() { return compaction_num_.load(); }
CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; } CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
@ -136,6 +147,14 @@ class MyTestCompactionService : public CompactionService {
void SetCanceled(bool canceled) { canceled_ = canceled; } void SetCanceled(bool canceled) { canceled_ = canceled; }
void GetResult(CompactionServiceResult* deserialized) {
CompactionServiceResult::Read(result_, deserialized).PermitUncheckedError();
}
CompactionServiceJobStatus GetFinalCompactionServiceJobStatus() {
return final_updated_status_.load();
}
private: private:
InstrumentedMutex mutex_; InstrumentedMutex mutex_;
std::atomic_int compaction_num_{0}; std::atomic_int compaction_num_{0};
@ -153,11 +172,14 @@ class MyTestCompactionService : public CompactionService {
CompactionServiceJobStatus override_wait_status_ = CompactionServiceJobStatus override_wait_status_ =
CompactionServiceJobStatus::kFailure; CompactionServiceJobStatus::kFailure;
bool is_override_wait_result_ = false; bool is_override_wait_result_ = false;
std::string result_;
std::string override_wait_result_; std::string override_wait_result_;
std::vector<std::shared_ptr<EventListener>> listeners_; std::vector<std::shared_ptr<EventListener>> listeners_;
std::vector<std::shared_ptr<TablePropertiesCollectorFactory>> std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
table_properties_collector_factories_; table_properties_collector_factories_;
std::atomic_bool canceled_{false}; std::atomic_bool canceled_{false};
std::atomic<CompactionServiceJobStatus> final_updated_status_{
CompactionServiceJobStatus::kUseLocal};
}; };
class CompactionServiceTest : public DBTestBase { class CompactionServiceTest : public DBTestBase {
@ -255,6 +277,8 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
auto my_cs = GetCompactionService(); auto my_cs = GetCompactionService();
ASSERT_GE(my_cs->GetCompactionNum(), 1); ASSERT_GE(my_cs->GetCompactionNum(), 1);
ASSERT_EQ(CompactionServiceJobStatus::kSuccess,
my_cs->GetFinalCompactionServiceJobStatus());
// make sure the compaction statistics is only recorded on the remote side // make sure the compaction statistics is only recorded on the remote side
ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1); ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
@ -318,6 +342,34 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "cf_1", "cf_2", "cf_3"}, ReopenWithColumnFamilies({kDefaultColumnFamilyName, "cf_1", "cf_2", "cf_3"},
options); options);
ASSERT_GT(verify_passed, 0); ASSERT_GT(verify_passed, 0);
CompactionServiceResult result;
my_cs->GetResult(&result);
if (s.IsAborted()) {
ASSERT_NOK(result.status);
} else {
ASSERT_OK(result.status);
}
ASSERT_GE(result.stats.elapsed_micros, 1);
ASSERT_GE(result.stats.cpu_micros, 1);
ASSERT_EQ(20, result.stats.num_output_records);
ASSERT_EQ(result.output_files.size(), result.stats.num_output_files);
uint64_t total_size = 0;
for (auto output_file : result.output_files) {
std::string file_name = result.output_path + "/" + output_file.file_name;
uint64_t file_size = 0;
ASSERT_OK(options.env->GetFileSize(file_name, &file_size));
ASSERT_GT(file_size, 0);
total_size += file_size;
}
ASSERT_EQ(total_size, result.stats.total_output_bytes);
ASSERT_TRUE(result.stats.is_remote_compaction);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_FALSE(result.stats.is_full_compaction);
Close(); Close();
} }
@ -356,6 +408,507 @@ TEST_F(CompactionServiceTest, ManualCompaction) {
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
VerifyTestData(); VerifyTestData();
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
TEST_F(CompactionServiceTest, PreservedOptionsLocalCompaction) {
Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = 2;
options.disable_auto_compactions = true;
DestroyAndReopen(options);
Random rnd(301);
for (auto i = 0; i < 2; ++i) {
for (auto j = 0; j < 10; ++j) {
ASSERT_OK(
Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
}
ASSERT_OK(Flush());
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
auto compaction = static_cast<Compaction*>(arg);
std::string options_file_name = OptionsFileName(
dbname_,
compaction->input_version()->version_set()->options_file_number());
// Change option twice to make sure the very first OPTIONS file gets
// purged
ASSERT_OK(dbfull()->SetOptions(
{{"level0_file_num_compaction_trigger", "4"}}));
ASSERT_EQ(4, dbfull()->GetOptions().level0_file_num_compaction_trigger);
ASSERT_OK(dbfull()->SetOptions(
{{"level0_file_num_compaction_trigger", "6"}}));
ASSERT_EQ(6, dbfull()->GetOptions().level0_file_num_compaction_trigger);
dbfull()->TEST_DeleteObsoleteFiles();
// For non-remote compactions, OPTIONS file can be deleted while
// using option at the start of the compaction
Status s = env_->FileExists(options_file_name);
ASSERT_NOK(s);
ASSERT_TRUE(s.IsNotFound());
// Should be old value
ASSERT_EQ(2, compaction->mutable_cf_options()
->level0_file_num_compaction_trigger);
ASSERT_TRUE(dbfull()->min_options_file_numbers_.empty());
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ASSERT_TRUE(s.ok());
}
TEST_F(CompactionServiceTest, PreservedOptionsRemoteCompaction) {
// For non-remote compaction do not preserve options file
Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = 2;
options.disable_auto_compactions = true;
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
Random rnd(301);
for (auto i = 0; i < 2; ++i) {
for (auto j = 0; j < 10; ++j) {
ASSERT_OK(
Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
}
ASSERT_OK(Flush());
}
bool is_primary_called = false;
// This will be called twice. One from primary and one from remote.
// Try changing the option when called from remote. Otherwise, the new option
// will be used
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", [&](void* /*arg*/) {
if (!is_primary_called) {
is_primary_called = true;
return;
}
// Change the option right before the compaction run
ASSERT_OK(dbfull()->SetOptions(
{{"level0_file_num_compaction_trigger", "4"}}));
ASSERT_EQ(4, dbfull()->GetOptions().level0_file_num_compaction_trigger);
dbfull()->TEST_DeleteObsoleteFiles();
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionServiceJob::ProcessKeyValueCompactionWithCompactionService",
[&](void* arg) {
auto input = static_cast<CompactionServiceInput*>(arg);
std::string options_file_name =
OptionsFileName(dbname_, input->options_file_number);
ASSERT_OK(env_->FileExists(options_file_name));
ASSERT_FALSE(dbfull()->min_options_file_numbers_.empty());
ASSERT_EQ(dbfull()->min_options_file_numbers_.front(),
input->options_file_number);
DBOptions db_options;
ConfigOptions config_options;
std::vector<ColumnFamilyDescriptor> all_column_families;
config_options.env = env_;
ASSERT_OK(LoadOptionsFromFile(config_options, options_file_name,
&db_options, &all_column_families));
bool has_cf = false;
for (auto& cf : all_column_families) {
if (cf.name == input->cf_name) {
// Should be old value
ASSERT_EQ(2, cf.options.level0_file_num_compaction_trigger);
has_cf = true;
}
}
ASSERT_TRUE(has_cf);
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
auto compaction = static_cast<Compaction*>(arg);
ASSERT_EQ(2, compaction->mutable_cf_options()
->level0_file_num_compaction_trigger);
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ASSERT_TRUE(s.ok());
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
class EventVerifier : public EventListener {
public:
explicit EventVerifier(uint64_t expected_num_input_records,
size_t expected_num_input_files,
uint64_t expected_num_output_records,
size_t expected_num_output_files,
const std::string& expected_smallest_output_key_prefix,
const std::string& expected_largest_output_key_prefix,
bool expected_is_remote_compaction_on_begin,
bool expected_is_remote_compaction_on_complete)
: expected_num_input_records_(expected_num_input_records),
expected_num_input_files_(expected_num_input_files),
expected_num_output_records_(expected_num_output_records),
expected_num_output_files_(expected_num_output_files),
expected_smallest_output_key_prefix_(
expected_smallest_output_key_prefix),
expected_largest_output_key_prefix_(expected_largest_output_key_prefix),
expected_is_remote_compaction_on_begin_(
expected_is_remote_compaction_on_begin),
expected_is_remote_compaction_on_complete_(
expected_is_remote_compaction_on_complete) {}
void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
ASSERT_EQ(expected_num_input_files_, ci.input_files.size());
ASSERT_EQ(expected_num_input_files_, ci.input_file_infos.size());
ASSERT_EQ(expected_is_remote_compaction_on_begin_,
ci.stats.is_remote_compaction);
ASSERT_TRUE(ci.stats.is_manual_compaction);
ASSERT_FALSE(ci.stats.is_full_compaction);
}
void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
ASSERT_GT(ci.stats.elapsed_micros, 0);
ASSERT_GT(ci.stats.cpu_micros, 0);
ASSERT_EQ(expected_num_input_records_, ci.stats.num_input_records);
ASSERT_EQ(expected_num_input_files_, ci.stats.num_input_files);
ASSERT_EQ(expected_num_output_records_, ci.stats.num_output_records);
ASSERT_EQ(expected_num_output_files_, ci.stats.num_output_files);
ASSERT_EQ(expected_smallest_output_key_prefix_,
ci.stats.smallest_output_key_prefix);
ASSERT_EQ(expected_largest_output_key_prefix_,
ci.stats.largest_output_key_prefix);
ASSERT_GT(ci.stats.total_input_bytes, 0);
ASSERT_GT(ci.stats.total_output_bytes, 0);
ASSERT_EQ(ci.stats.num_input_records,
ci.stats.num_output_records + ci.stats.num_records_replaced);
ASSERT_EQ(expected_is_remote_compaction_on_complete_,
ci.stats.is_remote_compaction);
ASSERT_TRUE(ci.stats.is_manual_compaction);
ASSERT_FALSE(ci.stats.is_full_compaction);
}
private:
uint64_t expected_num_input_records_;
size_t expected_num_input_files_;
uint64_t expected_num_output_records_;
size_t expected_num_output_files_;
std::string expected_smallest_output_key_prefix_;
std::string expected_largest_output_key_prefix_;
bool expected_is_remote_compaction_on_begin_;
bool expected_is_remote_compaction_on_complete_;
};
TEST_F(CompactionServiceTest, VerifyStats) {
Options options = CurrentOptions();
options.disable_auto_compactions = true;
auto event_verifier = std::make_shared<EventVerifier>(
30 /* expected_num_input_records */, 3 /* expected_num_input_files */,
20 /* expected_num_output_records */, 1 /* expected_num_output_files */,
"key00000" /* expected_smallest_output_key_prefix */,
"key00001" /* expected_largest_output_key_prefix */,
true /* expected_is_remote_compaction_on_begin */,
true /* expected_is_remote_compaction_on_complete */);
options.listeners.push_back(event_verifier);
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
std::string start_str = Key(0);
std::string end_str = Key(1);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
VerifyTestData();
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
TEST_F(CompactionServiceTest, VerifyStatsLocalFallback) {
Options options = CurrentOptions();
options.disable_auto_compactions = true;
auto event_verifier = std::make_shared<EventVerifier>(
30 /* expected_num_input_records */, 3 /* expected_num_input_files */,
20 /* expected_num_output_records */, 1 /* expected_num_output_files */,
"key00000" /* expected_smallest_output_key_prefix */,
"key00001" /* expected_largest_output_key_prefix */,
true /* expected_is_remote_compaction_on_begin */,
false /* expected_is_remote_compaction_on_complete */);
options.listeners.push_back(event_verifier);
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal);
std::string start_str = Key(0);
std::string end_str = Key(1);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
// Remote Compaction did not happen
ASSERT_EQ(my_cs->GetCompactionNum(), comp_num);
VerifyTestData();
}
TEST_F(CompactionServiceTest, CorruptedOutput) {
Options options = CurrentOptions();
options.disable_auto_compactions = true;
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
std::string start_str = Key(15);
std::string end_str = Key(45);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionServiceCompactionJob::Run:0", [&](void* arg) {
CompactionServiceResult* compaction_result =
*(static_cast<CompactionServiceResult**>(arg));
ASSERT_TRUE(compaction_result != nullptr &&
!compaction_result->output_files.empty());
// Corrupt files here
for (const auto& output_file : compaction_result->output_files) {
std::string file_name =
compaction_result->output_path + "/" + output_file.file_name;
uint64_t file_size = 0;
Status s = options.env->GetFileSize(file_name, &file_size);
ASSERT_OK(s);
ASSERT_GT(file_size, 0);
ASSERT_OK(test::CorruptFile(env_, file_name, 0,
static_cast<int>(file_size),
true /* verifyChecksum */));
}
});
SyncPoint::GetInstance()->EnableProcessing();
// CompactRange() should fail
Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
ASSERT_NOK(s);
ASSERT_TRUE(s.IsCorruption());
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
// On the worker side, the compaction is considered success
// Verification is done on the primary side
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
TEST_F(CompactionServiceTest, CorruptedOutputParanoidFileCheck) {
for (bool paranoid_file_check_enabled : {false, true}) {
SCOPED_TRACE("paranoid_file_check_enabled=" +
std::to_string(paranoid_file_check_enabled));
Options options = CurrentOptions();
Destroy(options);
options.disable_auto_compactions = true;
options.paranoid_file_checks = paranoid_file_check_enabled;
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
std::string start_str = Key(15);
std::string end_str = Key(45);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionServiceCompactionJob::Run:0", [&](void* arg) {
CompactionServiceResult* compaction_result =
*(static_cast<CompactionServiceResult**>(arg));
ASSERT_TRUE(compaction_result != nullptr &&
!compaction_result->output_files.empty());
// Corrupt files here
for (const auto& output_file : compaction_result->output_files) {
std::string file_name =
compaction_result->output_path + "/" + output_file.file_name;
// Corrupt very small range of bytes. This corruption is so small
// that this isn't caught by default light-weight check
ASSERT_OK(test::CorruptFile(env_, file_name, 0, 1,
false /* verifyChecksum */));
}
});
SyncPoint::GetInstance()->EnableProcessing();
Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
if (paranoid_file_check_enabled) {
ASSERT_NOK(s);
ASSERT_EQ(Status::Corruption("Paranoid checksums do not match"), s);
} else {
// CompactRange() goes through if paranoid file check is not enabled
ASSERT_OK(s);
}
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
// On the worker side, the compaction is considered success
// Verification is done on the primary side
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
}
TEST_F(CompactionServiceTest, TruncatedOutput) {
Options options = CurrentOptions();
options.disable_auto_compactions = true;
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
std::string start_str = Key(15);
std::string end_str = Key(45);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionServiceCompactionJob::Run:0", [&](void* arg) {
CompactionServiceResult* compaction_result =
*(static_cast<CompactionServiceResult**>(arg));
ASSERT_TRUE(compaction_result != nullptr &&
!compaction_result->output_files.empty());
// Truncate files here
for (const auto& output_file : compaction_result->output_files) {
std::string file_name =
compaction_result->output_path + "/" + output_file.file_name;
uint64_t file_size = 0;
Status s = options.env->GetFileSize(file_name, &file_size);
ASSERT_OK(s);
ASSERT_GT(file_size, 0);
ASSERT_OK(test::TruncateFile(env_, file_name, file_size / 2));
}
});
SyncPoint::GetInstance()->EnableProcessing();
// CompactRange() should fail
Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
ASSERT_NOK(s);
ASSERT_TRUE(s.IsCorruption());
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
// On the worker side, the compaction is considered success
// Verification is done on the primary side
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
TEST_F(CompactionServiceTest, CustomFileChecksum) {
Options options = CurrentOptions();
options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
std::string start_str = Key(15);
std::string end_str = Key(45);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionServiceCompactionJob::Run:0", [&](void* arg) {
CompactionServiceResult* compaction_result =
*(static_cast<CompactionServiceResult**>(arg));
ASSERT_TRUE(compaction_result != nullptr &&
!compaction_result->output_files.empty());
// Validate Checksum files here
for (const auto& output_file : compaction_result->output_files) {
std::string file_name =
compaction_result->output_path + "/" + output_file.file_name;
FileChecksumGenContext gen_context;
gen_context.file_name = file_name;
std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context);
std::unique_ptr<SequentialFile> file_reader;
uint64_t file_size = 0;
Status s = options.env->GetFileSize(file_name, &file_size);
ASSERT_OK(s);
ASSERT_GT(file_size, 0);
s = options.env->NewSequentialFile(file_name, &file_reader,
EnvOptions());
ASSERT_OK(s);
Slice result;
std::unique_ptr<char[]> scratch(new char[file_size]);
s = file_reader->Read(file_size, &result, scratch.get());
ASSERT_OK(s);
file_checksum_gen->Update(scratch.get(), result.size());
file_checksum_gen->Finalize();
// Verify actual checksum and the func name
ASSERT_EQ(file_checksum_gen->Name(),
output_file.file_checksum_func_name);
ASSERT_EQ(file_checksum_gen->GetChecksum(),
output_file.file_checksum);
}
});
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
} }
TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) { TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) {
@ -437,6 +990,8 @@ TEST_F(CompactionServiceTest, InvalidResult) {
Slice end(end_str); Slice end(end_str);
Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
ASSERT_FALSE(s.ok()); ASSERT_FALSE(s.ok());
ASSERT_EQ(CompactionServiceJobStatus::kFailure,
my_cs->GetFinalCompactionServiceJobStatus());
} }
TEST_F(CompactionServiceTest, SubCompaction) { TEST_F(CompactionServiceTest, SubCompaction) {
@ -586,11 +1141,20 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
{file.db_path + "/" + file.name}, 2)); {file.db_path + "/" + file.name}, 2));
info = my_cs->GetCompactionInfoForStart(); info = my_cs->GetCompactionInfoForStart();
ASSERT_EQ(Env::USER, info.priority); ASSERT_EQ(Env::USER, info.priority);
ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
ASSERT_EQ(true, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
info = my_cs->GetCompactionInfoForWait(); info = my_cs->GetCompactionInfoForWait();
ASSERT_EQ(Env::USER, info.priority); ASSERT_EQ(Env::USER, info.priority);
ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
ASSERT_EQ(true, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
// Test priority BOTTOM // Test priority BOTTOM
env_->SetBackgroundThreads(1, Env::BOTTOM); env_->SetBackgroundThreads(1, Env::BOTTOM);
// This will set bottommost_level = true but is_full_compaction = false
options.num_levels = 2; options.num_levels = 2;
ReopenWithCompactionService(&options); ReopenWithCompactionService(&options);
my_cs = my_cs =
@ -613,9 +1177,71 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
} }
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(dbfull()->TEST_WaitForCompact());
info = my_cs->GetCompactionInfoForStart(); info = my_cs->GetCompactionInfoForStart();
ASSERT_EQ(CompactionReason::kLevelL0FilesNum, info.compaction_reason);
ASSERT_EQ(false, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
ASSERT_EQ(Env::BOTTOM, info.priority); ASSERT_EQ(Env::BOTTOM, info.priority);
info = my_cs->GetCompactionInfoForWait(); info = my_cs->GetCompactionInfoForWait();
ASSERT_EQ(Env::BOTTOM, info.priority); ASSERT_EQ(Env::BOTTOM, info.priority);
ASSERT_EQ(CompactionReason::kLevelL0FilesNum, info.compaction_reason);
ASSERT_EQ(false, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
// Test Non-Bottommost Level
options.num_levels = 4;
ReopenWithCompactionService(&options);
my_cs =
static_cast_with_check<MyTestCompactionService>(GetCompactionService());
for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
for (int j = 0; j < 10; j++) {
int key_id = i * 10 + j;
ASSERT_OK(Put(Key(key_id), "value_new_new" + std::to_string(key_id)));
}
ASSERT_OK(Flush());
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
info = my_cs->GetCompactionInfoForStart();
ASSERT_EQ(false, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(false, info.bottommost_level);
info = my_cs->GetCompactionInfoForWait();
ASSERT_EQ(false, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(false, info.bottommost_level);
// Test Full Compaction + Bottommost Level
options.num_levels = 6;
ReopenWithCompactionService(&options);
my_cs =
static_cast_with_check<MyTestCompactionService>(GetCompactionService());
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 10; j++) {
int key_id = i * 10 + j;
ASSERT_OK(Put(Key(key_id), "value_new_new" + std::to_string(key_id)));
}
ASSERT_OK(Flush());
}
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
info = my_cs->GetCompactionInfoForStart();
ASSERT_EQ(true, info.is_manual_compaction);
ASSERT_EQ(true, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
info = my_cs->GetCompactionInfoForWait();
ASSERT_EQ(true, info.is_manual_compaction);
ASSERT_EQ(true, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
} }
TEST_F(CompactionServiceTest, FallbackLocalAuto) { TEST_F(CompactionServiceTest, FallbackLocalAuto) {

View File

@ -39,7 +39,7 @@ void CompactionState::AggregateCompactionStats(
InternalStats::CompactionStatsFull& compaction_stats, InternalStats::CompactionStatsFull& compaction_stats,
CompactionJobStats& compaction_job_stats) { CompactionJobStats& compaction_job_stats) {
for (const auto& sc : sub_compact_states) { for (const auto& sc : sub_compact_states) {
sc.AggregateCompactionStats(compaction_stats); sc.AggregateCompactionOutputStats(compaction_stats);
compaction_job_stats.Add(sc.compaction_job_stats); compaction_job_stats.Add(sc.compaction_job_stats);
} }
} }

View File

@ -13,7 +13,7 @@
#include "rocksdb/sst_partitioner.h" #include "rocksdb/sst_partitioner.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
void SubcompactionState::AggregateCompactionStats( void SubcompactionState::AggregateCompactionOutputStats(
InternalStats::CompactionStatsFull& compaction_stats) const { InternalStats::CompactionStatsFull& compaction_stats) const {
compaction_stats.stats.Add(compaction_outputs_.stats_); compaction_stats.stats.Add(compaction_outputs_.stats_);
if (HasPenultimateLevelOutputs()) { if (HasPenultimateLevelOutputs()) {
@ -34,9 +34,16 @@ void SubcompactionState::Cleanup(Cache* cache) {
if (!status.ok()) { if (!status.ok()) {
for (const auto& out : GetOutputs()) { for (const auto& out : GetOutputs()) {
// If this file was inserted into the table cache then remove // If this file was inserted into the table cache then remove it here
// them here because this compaction was not committed. // because this compaction was not committed. This is not strictly
TableCache::Evict(cache, out.meta.fd.GetNumber()); // required because of a backstop TableCache::Evict() in
// PurgeObsoleteFiles() but is our opportunity to apply
// uncache_aggressiveness. TODO: instead, put these files into the
// VersionSet::obsolete_files_ pipeline so that they don't have to
// be picked up by scanning the DB directory.
TableCache::ReleaseObsolete(
cache, out.meta.fd.GetNumber(), nullptr /*handle*/,
compaction->mutable_cf_options()->uncache_aggressiveness);
} }
} }
// TODO: sub_compact.io_status is not checked like status. Not sure if thats // TODO: sub_compact.io_status is not checked like status. Not sure if thats

View File

@ -179,7 +179,7 @@ class SubcompactionState {
void Cleanup(Cache* cache); void Cleanup(Cache* cache);
void AggregateCompactionStats( void AggregateCompactionOutputStats(
InternalStats::CompactionStatsFull& compaction_stats) const; InternalStats::CompactionStatsFull& compaction_stats) const;
CompactionOutputs& Current() const { CompactionOutputs& Current() const {

View File

@ -2512,6 +2512,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromMemtables) {
start_time + kSecondsPerRecording * (i + 1)); start_time + kSecondsPerRecording * (i + 1));
} }
} }
ASSERT_EQ(kNumKeys, i);
ASSERT_OK(iter->status()); ASSERT_OK(iter->status());
} }
@ -2531,12 +2532,13 @@ TEST_P(IteratorWriteTimeTest, ReadFromMemtables) {
} }
} }
ASSERT_OK(iter->status()); ASSERT_OK(iter->status());
ASSERT_EQ(-1, i);
} }
// Reopen the DB and disable the seqno to time recording, data with user // Reopen the DB and disable the seqno to time recording, data with user
// specified write time can still get a write time before it's flushed. // specified write time can still get a write time before it's flushed.
options.preserve_internal_time_seconds = 0; options.preserve_internal_time_seconds = 0;
DestroyAndReopen(options); Reopen(options);
ASSERT_OK(TimedPut(Key(kKeyWithWriteTime), rnd.RandomString(100), ASSERT_OK(TimedPut(Key(kKeyWithWriteTime), rnd.RandomString(100),
kUserSpecifiedWriteTime)); kUserSpecifiedWriteTime));
{ {
@ -2613,6 +2615,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) {
} }
} }
ASSERT_OK(iter->status()); ASSERT_OK(iter->status());
ASSERT_EQ(kNumKeys, i);
} }
// Backward iteration // Backward iteration
@ -2632,12 +2635,13 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) {
} }
} }
ASSERT_OK(iter->status()); ASSERT_OK(iter->status());
ASSERT_EQ(-1, i);
} }
// Reopen the DB and disable the seqno to time recording. Data retrieved from // Reopen the DB and disable the seqno to time recording. Data retrieved from
// SST files still have write time available. // SST files still have write time available.
options.preserve_internal_time_seconds = 0; options.preserve_internal_time_seconds = 0;
DestroyAndReopen(options); Reopen(options);
dbfull()->TEST_WaitForPeriodicTaskRun( dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); }); [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
@ -2663,6 +2667,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) {
start_time + kSecondsPerRecording * (i + 1)); start_time + kSecondsPerRecording * (i + 1));
} }
} }
ASSERT_EQ(kNumKeys, i);
ASSERT_OK(iter->status()); ASSERT_OK(iter->status());
} }
@ -2686,6 +2691,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) {
VerifyKeyAndWriteTime(iter.get(), Key(i), 0); VerifyKeyAndWriteTime(iter.get(), Key(i), 0);
} }
ASSERT_OK(iter->status()); ASSERT_OK(iter->status());
ASSERT_EQ(kNumKeys, i);
} }
Close(); Close();
} }

View File

@ -87,7 +87,7 @@ Status VerifySstFileChecksumInternal(const Options& options,
options.block_protection_bytes_per_key, false /* skip_filters */, options.block_protection_bytes_per_key, false /* skip_filters */,
!kImmortal, false /* force_direct_prefetch */, -1 /* level */); !kImmortal, false /* force_direct_prefetch */, -1 /* level */);
reader_options.largest_seqno = largest_seqno; reader_options.largest_seqno = largest_seqno;
s = ioptions.table_factory->NewTableReader( s = options.table_factory->NewTableReader(
read_options, reader_options, std::move(file_reader), file_size, read_options, reader_options, std::move(file_reader), file_size,
&table_reader, false /* prefetch_index_and_filter_in_cache */); &table_reader, false /* prefetch_index_and_filter_in_cache */);
if (!s.ok()) { if (!s.ok()) {

View File

@ -688,76 +688,100 @@ TEST_F(DBBasicTest, IdentityAcrossRestarts) {
constexpr size_t kMinIdSize = 10; constexpr size_t kMinIdSize = 10;
do { do {
for (bool with_manifest : {false, true}) { for (bool with_manifest : {false, true}) {
std::string idfilename = IdentityFileName(dbname_); for (bool write_file : {false, true}) {
std::string id1, tmp; std::string idfilename = IdentityFileName(dbname_);
ASSERT_OK(db_->GetDbIdentity(id1)); std::string id1, tmp;
ASSERT_GE(id1.size(), kMinIdSize); ASSERT_OK(db_->GetDbIdentity(id1));
ASSERT_GE(id1.size(), kMinIdSize);
Options options = CurrentOptions(); Options options = CurrentOptions();
options.write_dbid_to_manifest = with_manifest; options.write_dbid_to_manifest = with_manifest;
Reopen(options); options.write_identity_file = true; // initially
std::string id2; Reopen(options);
ASSERT_OK(db_->GetDbIdentity(id2)); std::string id2;
// id2 should match id1 because identity was not regenerated ASSERT_OK(db_->GetDbIdentity(id2));
ASSERT_EQ(id1, id2); // id2 should match id1 because identity was not regenerated
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); ASSERT_EQ(id1, id2);
ASSERT_EQ(tmp, id2); ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id2);
// Recover from deleted/missing IDENTITY if (write_file) {
ASSERT_OK(env_->DeleteFile(idfilename)); // Recover from deleted/missing IDENTITY
Reopen(options); ASSERT_OK(env_->DeleteFile(idfilename));
std::string id3; } else {
ASSERT_OK(db_->GetDbIdentity(id3)); // Transition to no IDENTITY file
if (with_manifest) { options.write_identity_file = false;
// id3 should match id1 because identity was restored from manifest if (!with_manifest) {
ASSERT_EQ(id1, id3); // Incompatible options, should fail
} else { ASSERT_NOK(TryReopen(options));
// id3 should NOT match id1 because identity was regenerated // Back to a usable config and continue
ASSERT_NE(id1, id3); options.write_identity_file = true;
ASSERT_GE(id3.size(), kMinIdSize); Reopen(options);
} continue;
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); }
ASSERT_EQ(tmp, id3); }
Reopen(options);
std::string id3;
ASSERT_OK(db_->GetDbIdentity(id3));
if (with_manifest) {
// id3 should match id1 because identity was restored from manifest
ASSERT_EQ(id1, id3);
} else {
// id3 should NOT match id1 because identity was regenerated
ASSERT_NE(id1, id3);
ASSERT_GE(id3.size(), kMinIdSize);
}
if (write_file) {
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id3);
// Recover from truncated IDENTITY // Recover from truncated IDENTITY
{ std::unique_ptr<WritableFile> w;
std::unique_ptr<WritableFile> w; ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions())); ASSERT_OK(w->Close());
ASSERT_OK(w->Close()); } else {
} ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound());
Reopen(options); }
std::string id4; Reopen(options);
ASSERT_OK(db_->GetDbIdentity(id4)); std::string id4;
if (with_manifest) { ASSERT_OK(db_->GetDbIdentity(id4));
// id4 should match id1 because identity was restored from manifest if (with_manifest) {
ASSERT_EQ(id1, id4); // id4 should match id1 because identity was restored from manifest
} else { ASSERT_EQ(id1, id4);
// id4 should NOT match id1 because identity was regenerated } else {
ASSERT_NE(id1, id4); // id4 should NOT match id1 because identity was regenerated
ASSERT_GE(id4.size(), kMinIdSize); ASSERT_NE(id1, id4);
} ASSERT_GE(id4.size(), kMinIdSize);
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); }
ASSERT_EQ(tmp, id4); std::string silly_id = "asdf123456789";
if (write_file) {
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id4);
// Recover from overwritten IDENTITY // Recover from overwritten IDENTITY
std::string silly_id = "asdf123456789"; std::unique_ptr<WritableFile> w;
{ ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
std::unique_ptr<WritableFile> w; ASSERT_OK(w->Append(silly_id));
ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions())); ASSERT_OK(w->Close());
ASSERT_OK(w->Append(silly_id)); } else {
ASSERT_OK(w->Close()); ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound());
}
Reopen(options);
std::string id5;
ASSERT_OK(db_->GetDbIdentity(id5));
if (with_manifest) {
// id4 should match id1 because identity was restored from manifest
ASSERT_EQ(id1, id5);
} else {
ASSERT_EQ(id5, silly_id);
}
if (write_file) {
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id5);
} else {
ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound());
}
} }
Reopen(options);
std::string id5;
ASSERT_OK(db_->GetDbIdentity(id5));
if (with_manifest) {
// id4 should match id1 because identity was restored from manifest
ASSERT_EQ(id1, id5);
} else {
ASSERT_EQ(id5, silly_id);
}
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id5);
} }
} while (ChangeCompactOptions()); } while (ChangeCompactOptions());
} }
@ -3407,6 +3431,46 @@ class TableFileListener : public EventListener {
InstrumentedMutex mutex_; InstrumentedMutex mutex_;
std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_; std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
}; };
class FlushTableFileListener : public EventListener {
public:
void OnTableFileCreated(const TableFileCreationInfo& info) override {
InstrumentedMutexLock lock(&mutex_);
if (info.reason != TableFileCreationReason::kFlush) {
return;
}
cf_to_flushed_files_[info.cf_name].push_back(info.file_path);
}
std::vector<std::string>& GetFlushedFiles(const std::string& cf_name) {
InstrumentedMutexLock lock(&mutex_);
return cf_to_flushed_files_[cf_name];
}
private:
InstrumentedMutex mutex_;
std::unordered_map<std::string, std::vector<std::string>>
cf_to_flushed_files_;
};
class FlushBlobFileListener : public EventListener {
public:
void OnBlobFileCreated(const BlobFileCreationInfo& info) override {
InstrumentedMutexLock lock(&mutex_);
if (info.reason != BlobFileCreationReason::kFlush) {
return;
}
cf_to_flushed_blobs_files_[info.cf_name].push_back(info.file_path);
}
std::vector<std::string>& GetFlushedBlobFiles(const std::string& cf_name) {
InstrumentedMutexLock lock(&mutex_);
return cf_to_flushed_blobs_files_[cf_name];
}
private:
InstrumentedMutex mutex_;
std::unordered_map<std::string, std::vector<std::string>>
cf_to_flushed_blobs_files_;
};
} // anonymous namespace } // anonymous namespace
TEST_F(DBBasicTest, LastSstFileNotInManifest) { TEST_F(DBBasicTest, LastSstFileNotInManifest) {
@ -3512,6 +3576,121 @@ TEST_F(DBBasicTest, RecoverWithMissingFiles) {
} }
} }
// Param 0: whether to enable blob DB.
// Param 1: when blob DB is enabled, whether to also delete the missing L0
// file's associated blob file.
class BestEffortsRecoverIncompleteVersionTest
: public DBTestBase,
public testing::WithParamInterface<std::tuple<bool, bool>> {
public:
BestEffortsRecoverIncompleteVersionTest()
: DBTestBase("best_efforts_recover_incomplete_version_test",
/*env_do_fsync=*/false) {}
};
TEST_P(BestEffortsRecoverIncompleteVersionTest, Basic) {
Options options = CurrentOptions();
options.enable_blob_files = std::get<0>(GetParam());
bool delete_blob_file_too = std::get<1>(GetParam());
DestroyAndReopen(options);
FlushTableFileListener* flush_table_listener = new FlushTableFileListener();
FlushBlobFileListener* flush_blob_listener = new FlushBlobFileListener();
// Disable auto compaction to simplify SST file name tracking.
options.disable_auto_compactions = true;
options.listeners.emplace_back(flush_table_listener);
options.listeners.emplace_back(flush_blob_listener);
CreateAndReopenWithCF({"pikachu", "eevee"}, options);
std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
"eevee"};
int num_cfs = static_cast<int>(handles_.size());
ASSERT_EQ(3, num_cfs);
std::string start = "a";
Slice start_slice = start;
std::string end = "d";
Slice end_slice = end;
for (int cf = 0; cf != num_cfs; ++cf) {
ASSERT_OK(Put(cf, "a", "a_value"));
ASSERT_OK(Flush(cf));
// Compact file to L1 to avoid trivial file move in the next compaction
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
&start_slice, &end_slice));
ASSERT_OK(Put(cf, "a", "a_value_new"));
ASSERT_OK(Flush(cf));
ASSERT_OK(Put(cf, "b", "b_value"));
ASSERT_OK(Flush(cf));
ASSERT_OK(Put(cf, "f", "f_value"));
ASSERT_OK(Flush(cf));
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
&start_slice, &end_slice));
}
dbfull()->TEST_DeleteObsoleteFiles();
// Delete the most recent L0 file which is before a compaction.
for (int i = 0; i < num_cfs; ++i) {
std::vector<std::string>& files =
flush_table_listener->GetFlushedFiles(all_cf_names[i]);
ASSERT_EQ(4, files.size());
ASSERT_OK(env_->DeleteFile(files[files.size() - 1]));
if (options.enable_blob_files) {
std::vector<std::string>& blob_files =
flush_blob_listener->GetFlushedBlobFiles(all_cf_names[i]);
ASSERT_EQ(4, blob_files.size());
if (delete_blob_file_too) {
ASSERT_OK(env_->DeleteFile(blob_files[files.size() - 1]));
}
}
}
options.best_efforts_recovery = true;
ReopenWithColumnFamilies(all_cf_names, options);
for (int i = 0; i < num_cfs; ++i) {
auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
ColumnFamilyData* cfd = cfh->cfd();
VersionStorageInfo* vstorage = cfd->current()->storage_info();
// The L0 file flushed right before the last compaction is missing.
ASSERT_EQ(0, vstorage->LevelFiles(0).size());
// Only the output of the last compaction is available.
ASSERT_EQ(1, vstorage->LevelFiles(1).size());
}
// Verify data
ReadOptions read_opts;
read_opts.total_order_seek = true;
for (int i = 0; i < num_cfs; ++i) {
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[i]));
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("a", iter->key());
ASSERT_EQ("a_value_new", iter->value());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("b", iter->key());
ASSERT_EQ("b_value", iter->value());
iter->Next();
ASSERT_FALSE(iter->Valid());
ASSERT_OK(iter->status());
}
// Write more data.
for (int cf = 0; cf < num_cfs; ++cf) {
ASSERT_OK(Put(cf, "g", "g_value"));
ASSERT_OK(Flush(cf));
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
nullptr));
std::string value;
ASSERT_OK(db_->Get(ReadOptions(), handles_[cf], "g", &value));
ASSERT_EQ("g_value", value);
}
}
INSTANTIATE_TEST_CASE_P(BestEffortsRecoverIncompleteVersionTest,
BestEffortsRecoverIncompleteVersionTest,
testing::Values(std::make_tuple(false, false),
std::make_tuple(true, false),
std::make_tuple(true, true)));
TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) { TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.env = env_; options.env = env_;

View File

@ -563,7 +563,7 @@ TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) {
} }
} }
TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) { TEST_F(DBBlockCacheTest, DynamicOptions) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.create_if_missing = true; options.create_if_missing = true;
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
@ -578,39 +578,74 @@ TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) {
DestroyAndReopen(options); DestroyAndReopen(options);
std::string value(kValueSize, 'a'); std::string value(kValueSize, 'a');
auto st = options.statistics;
for (size_t i = 1; i <= 5; i++) { size_t i = 1;
ASSERT_OK(Put(std::to_string(i), value)); ASSERT_OK(Put(std::to_string(i), value));
ASSERT_OK(Flush()); ASSERT_OK(Flush());
ASSERT_EQ(1, ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(value, Get(std::to_string(i))); ASSERT_EQ(value, Get(std::to_string(i)));
ASSERT_EQ(0, ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ( ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ(1,
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
}
++i;
ASSERT_OK(dbfull()->SetOptions( ASSERT_OK(dbfull()->SetOptions(
{{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}})); {{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}}));
for (size_t i = 6; i <= kNumBlocks; i++) { ASSERT_OK(Put(std::to_string(i), value));
ASSERT_OK(Put(std::to_string(i), value)); ASSERT_OK(Flush());
ASSERT_OK(Flush()); ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(0,
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(value, Get(std::to_string(i))); ASSERT_EQ(value, Get(std::to_string(i)));
ASSERT_EQ(1, ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ( ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
1, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ(0, ++i;
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); ASSERT_OK(dbfull()->SetOptions({{"block_based_table_factory",
} "{prepopulate_block_cache=kFlushOnly;}"}}));
ASSERT_OK(Put(std::to_string(i), value));
ASSERT_OK(Flush());
ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(value, Get(std::to_string(i)));
ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
++i;
// NOT YET SUPPORTED
// FIXME: find a way to make this fail again (until well supported)
// ASSERT_NOK(dbfull()->SetOptions(
// {{"block_based_table_factory", "{block_cache=null;}"}}));
// ASSERT_OK(Put(std::to_string(i), value));
// ASSERT_OK(Flush());
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
// ASSERT_EQ(value, Get(std::to_string(i)));
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
// ++i;
// NOT YET SUPPORTED
// FIXME: find a way to make this fail again (until well supported)
// ASSERT_NOK(dbfull()->SetOptions(
// {{"block_based_table_factory", "{block_cache=1M;}"}}));
// ASSERT_OK(Put(std::to_string(i), value));
// ASSERT_OK(Flush());
// ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
// ASSERT_EQ(value, Get(std::to_string(i)));
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
// ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
} }
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@ -6146,7 +6146,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) {
std::vector<std::string> pending_compaction_cfs; std::vector<std::string> pending_compaction_cfs;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SchedulePendingCompaction::cfd", [&](void* arg) { "EnqueuePendingCompaction::cfd", [&](void* arg) {
const std::string& cf_name = const std::string& cf_name =
static_cast<ColumnFamilyData*>(arg)->GetName(); static_cast<ColumnFamilyData*>(arg)->GetName();
pending_compaction_cfs.emplace_back(cf_name); pending_compaction_cfs.emplace_back(cf_name);
@ -9357,12 +9357,13 @@ TEST_F(DBCompactionTest, FIFOChangeTemperature) {
ASSERT_OK(Flush()); ASSERT_OK(Flush());
ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(0), "value1"));
env_->MockSleepForSeconds(800);
ASSERT_OK(Put(Key(2), "value2")); ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush()); ASSERT_OK(Flush());
// First two L0 files both become eligible for temperature change compaction
// They should be compacted one-by-one.
ASSERT_OK(Put(Key(0), "value1")); ASSERT_OK(Put(Key(0), "value1"));
env_->MockSleepForSeconds(800); env_->MockSleepForSeconds(1200);
ASSERT_OK(Put(Key(2), "value2")); ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush()); ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(dbfull()->TEST_WaitForCompact());
@ -10622,6 +10623,97 @@ TEST_F(DBCompactionTest, ReleaseCompactionDuringManifestWrite) {
SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->ClearAllCallBacks();
} }
TEST_F(DBCompactionTest, RecordNewestKeyTimeForTtlCompaction) {
Options options;
SetTimeElapseOnlySleepOnReopen(&options);
options.env = CurrentOptions().env;
options.compaction_style = kCompactionStyleFIFO;
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
options.write_buffer_size = 10 << 10; // 10KB
options.arena_block_size = 4096;
options.compression = kNoCompression;
options.create_if_missing = true;
options.compaction_options_fifo.allow_compaction = false;
options.num_levels = 1;
env_->SetMockSleep();
options.env = env_;
options.ttl = 1 * 60 * 60; // 1 hour
ASSERT_OK(TryReopen(options));
// Generate and flush 4 files, each about 10KB
// Compaction is manually disabled at this point so we can check
// each file's newest_key_time
Random rnd(301);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
}
ASSERT_OK(Flush());
env_->MockSleepForSeconds(5);
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(NumTableFilesAtLevel(0), 4);
// Check that we are populating newest_key_time on flush
std::vector<FileMetaData*> file_metadatas = GetLevelFileMetadatas(0);
ASSERT_EQ(file_metadatas.size(), 4);
uint64_t first_newest_key_time =
file_metadatas[0]->fd.table_reader->GetTableProperties()->newest_key_time;
ASSERT_NE(first_newest_key_time, kUnknownNewestKeyTime);
// Check that the newest_key_times are in expected ordering
uint64_t prev_newest_key_time = first_newest_key_time;
for (size_t idx = 1; idx < file_metadatas.size(); idx++) {
uint64_t newest_key_time = file_metadatas[idx]
->fd.table_reader->GetTableProperties()
->newest_key_time;
ASSERT_LT(newest_key_time, prev_newest_key_time);
prev_newest_key_time = newest_key_time;
ASSERT_EQ(newest_key_time, file_metadatas[idx]
->fd.table_reader->GetTableProperties()
->creation_time);
}
// The delta between the first and last newest_key_times is 15s
uint64_t last_newest_key_time = prev_newest_key_time;
ASSERT_EQ(15, first_newest_key_time - last_newest_key_time);
// After compaction, the newest_key_time of the output file should be the max
// of the input files
options.compaction_options_fifo.allow_compaction = true;
ASSERT_OK(TryReopen(options));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(NumTableFilesAtLevel(0), 1);
file_metadatas = GetLevelFileMetadatas(0);
ASSERT_EQ(file_metadatas.size(), 1);
ASSERT_EQ(
file_metadatas[0]->fd.table_reader->GetTableProperties()->newest_key_time,
first_newest_key_time);
// Contrast newest_key_time with creation_time, which records the oldest
// ancestor time (15s older than newest_key_time)
ASSERT_EQ(
file_metadatas[0]->fd.table_reader->GetTableProperties()->creation_time,
last_newest_key_time);
ASSERT_EQ(file_metadatas[0]->oldest_ancester_time, last_newest_key_time);
// Make sure TTL of 5s causes compaction
env_->MockSleepForSeconds(6);
// The oldest input file is older than 15s
// However the newest of the compaction input files is younger than 15s, so
// we don't compact
ASSERT_OK(dbfull()->SetOptions({{"ttl", "15"}}));
ASSERT_EQ(dbfull()->GetOptions().ttl, 15);
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(NumTableFilesAtLevel(0), 1);
// Now even the youngest input file is too old
ASSERT_OK(dbfull()->SetOptions({{"ttl", "5"}}));
ASSERT_EQ(dbfull()->GetOptions().ttl, 5);
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) { int main(int argc, char** argv) {

View File

@ -289,10 +289,12 @@ TEST_F(DBFollowerTest, RetryCatchup) {
{"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"}, {"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
{"DBImpl::BackgroundCompaction:Start", {"DBImpl::BackgroundCompaction:Start",
"DBImplFollower::TryCatchupWithLeader:Begin2"}, "DBImplFollower::TryCatchupWithLeader:Begin2"},
{"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1", {"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin1",
"DBImpl::BackgroundCompaction:BeforeCompaction"}, "DBImpl::BackgroundCompaction:BeforeCompaction"},
{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"}, "VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin2"},
{"DBImplFollower::TryCatchupWithLeader:End", "Follower::WaitForCatchup"}, {"DBImplFollower::TryCatchupWithLeader:End", "Follower::WaitForCatchup"},
}); });
SyncPoint::GetInstance()->EnableProcessing(); SyncPoint::GetInstance()->EnableProcessing();
@ -335,10 +337,12 @@ TEST_F(DBFollowerTest, RetryCatchupManifestRollover) {
SyncPoint::GetInstance()->LoadDependency({ SyncPoint::GetInstance()->LoadDependency({
{"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"}, {"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
{"Leader::Flushed", "DBImplFollower::TryCatchupWithLeader:Begin2"}, {"Leader::Flushed", "DBImplFollower::TryCatchupWithLeader:Begin2"},
{"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1", {"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin1",
"Leader::Done"}, "Leader::Done"},
{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"}, "VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin2"},
{"DBImplFollower::TryCatchupWithLeader:End", {"DBImplFollower::TryCatchupWithLeader:End",
"Follower::WaitForCatchup:1"}, "Follower::WaitForCatchup:1"},
}); });

View File

@ -17,6 +17,7 @@
#include <cstdio> #include <cstdio>
#include <map> #include <map>
#include <memory> #include <memory>
#include <optional>
#include <set> #include <set>
#include <sstream> #include <sstream>
#include <stdexcept> #include <stdexcept>
@ -472,7 +473,7 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
if (s.ok()) { if (s.ok()) {
for (auto cfd : *versions_->GetColumnFamilySet()) { for (auto cfd : *versions_->GetColumnFamilySet()) {
SchedulePendingCompaction(cfd); EnqueuePendingCompaction(cfd);
} }
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
} }
@ -529,6 +530,11 @@ Status DBImpl::MaybeReleaseTimestampedSnapshotsAndCheck() {
return Status::OK(); return Status::OK();
} }
void DBImpl::UntrackDataFiles() {
TrackOrUntrackFiles(/*existing_data_files=*/{},
/*track=*/false);
}
Status DBImpl::CloseHelper() { Status DBImpl::CloseHelper() {
// Guarantee that there is no background error recovery in progress before // Guarantee that there is no background error recovery in progress before
// continuing with the shutdown // continuing with the shutdown
@ -653,8 +659,9 @@ Status DBImpl::CloseHelper() {
// We need to release them before the block cache is destroyed. The block // We need to release them before the block cache is destroyed. The block
// cache may be destroyed inside versions_.reset(), when column family data // cache may be destroyed inside versions_.reset(), when column family data
// list is destroyed, so leaving handles in table cache after // list is destroyed, so leaving handles in table cache after
// versions_.reset() may cause issues. // versions_.reset() may cause issues. Here we clean all unreferenced handles
// Here we clean all unreferenced handles in table cache. // in table cache, and (for certain builds/conditions) assert that no obsolete
// files are hanging around unreferenced (leak) in the table/blob file cache.
// Now we assume all user queries have finished, so only version set itself // Now we assume all user queries have finished, so only version set itself
// can possibly hold the blocks from block cache. After releasing unreferenced // can possibly hold the blocks from block cache. After releasing unreferenced
// handles here, only handles held by version set left and inside // handles here, only handles held by version set left and inside
@ -662,12 +669,22 @@ Status DBImpl::CloseHelper() {
// time a handle is released, we erase it from the cache too. By doing that, // time a handle is released, we erase it from the cache too. By doing that,
// we can guarantee that after versions_.reset(), table cache is empty // we can guarantee that after versions_.reset(), table cache is empty
// so the cache can be safely destroyed. // so the cache can be safely destroyed.
#ifndef NDEBUG
TEST_VerifyNoObsoleteFilesCached(/*db_mutex_already_held=*/true);
#endif // !NDEBUG
table_cache_->EraseUnRefEntries(); table_cache_->EraseUnRefEntries();
for (auto& txn_entry : recovered_transactions_) { for (auto& txn_entry : recovered_transactions_) {
delete txn_entry.second; delete txn_entry.second;
} }
// Return an unowned SstFileManager to a consistent state
if (immutable_db_options_.sst_file_manager && !own_sfm_) {
mutex_.Unlock();
UntrackDataFiles();
mutex_.Lock();
}
// versions need to be destroyed before table_cache since it can hold // versions need to be destroyed before table_cache since it can hold
// references to table_cache. // references to table_cache.
{ {
@ -835,10 +852,11 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
InstrumentedMutexLock l(&mutex_); InstrumentedMutexLock l(&mutex_);
for (auto cfd : *versions_->GetColumnFamilySet()) { for (auto cfd : *versions_->GetColumnFamilySet()) {
auto& mopts = *cfd->GetLatestMutableCFOptions();
// preserve time is the max of 2 options. // preserve time is the max of 2 options.
uint64_t preserve_seconds = uint64_t preserve_seconds =
std::max(cfd->ioptions()->preserve_internal_time_seconds, std::max(mopts.preserve_internal_time_seconds,
cfd->ioptions()->preclude_last_level_data_seconds); mopts.preclude_last_level_data_seconds);
if (!cfd->IsDropped() && preserve_seconds > 0) { if (!cfd->IsDropped() && preserve_seconds > 0) {
min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds); min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds);
max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds); max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds);
@ -1140,6 +1158,13 @@ void DBImpl::DumpStats() {
continue; continue;
} }
auto* table_factory =
cfd->GetCurrentMutableCFOptions()->table_factory.get();
assert(table_factory != nullptr);
// FIXME: need to a shared_ptr if/when block_cache is going to be mutable
Cache* cache =
table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
// Release DB mutex for gathering cache entry stats. Pass over all // Release DB mutex for gathering cache entry stats. Pass over all
// column families for this first so that other stats are dumped // column families for this first so that other stats are dumped
// near-atomically. // near-atomically.
@ -1148,10 +1173,6 @@ void DBImpl::DumpStats() {
// Probe block cache for problems (if not already via another CF) // Probe block cache for problems (if not already via another CF)
if (immutable_db_options_.info_log) { if (immutable_db_options_.info_log) {
auto* table_factory = cfd->ioptions()->table_factory.get();
assert(table_factory != nullptr);
Cache* cache =
table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
if (cache && probed_caches.insert(cache).second) { if (cache && probed_caches.insert(cache).second) {
cache->ReportProblems(immutable_db_options_.info_log); cache->ReportProblems(immutable_db_options_.info_log);
} }
@ -1525,7 +1546,7 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
io_s.ToString().c_str()); io_s.ToString().c_str());
// In case there is a fs error we should set it globally to prevent the // In case there is a fs error we should set it globally to prevent the
// future writes // future writes
IOStatusCheck(io_s); WALIOStatusCheck(io_s);
// whether sync or not, we should abort the rest of function upon error // whether sync or not, we should abort the rest of function upon error
return static_cast<Status>(io_s); return static_cast<Status>(io_s);
} }
@ -1682,7 +1703,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
io_s.ToString().c_str()); io_s.ToString().c_str());
// In case there is a fs error we should set it globally to prevent the // In case there is a fs error we should set it globally to prevent the
// future writes // future writes
IOStatusCheck(io_s); WALIOStatusCheck(io_s);
} }
if (io_s.ok() && need_wal_dir_sync) { if (io_s.ok() && need_wal_dir_sync) {
io_s = directories_.GetWalDir()->FsyncWithDirOptions( io_s = directories_.GetWalDir()->FsyncWithDirOptions(
@ -2053,15 +2074,19 @@ InternalIterator* DBImpl::NewInternalIterator(
bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) { bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
InternalIterator* internal_iter; InternalIterator* internal_iter;
assert(arena != nullptr); assert(arena != nullptr);
auto prefix_extractor =
super_version->mutable_cf_options.prefix_extractor.get();
// Need to create internal iterator from the arena. // Need to create internal iterator from the arena.
MergeIteratorBuilder merge_iter_builder( MergeIteratorBuilder merge_iter_builder(
&cfd->internal_comparator(), arena, &cfd->internal_comparator(), arena,
!read_options.total_order_seek && // FIXME? It's not clear what interpretation of prefix seek is needed
super_version->mutable_cf_options.prefix_extractor != nullptr, // here, and no unit test cares about the value provided here.
!read_options.total_order_seek && prefix_extractor != nullptr,
read_options.iterate_upper_bound); read_options.iterate_upper_bound);
// Collect iterator for mutable memtable // Collect iterator for mutable memtable
auto mem_iter = super_version->mem->NewIterator( auto mem_iter = super_version->mem->NewIterator(
read_options, super_version->GetSeqnoToTimeMapping(), arena); read_options, super_version->GetSeqnoToTimeMapping(), arena,
super_version->mutable_cf_options.prefix_extractor.get());
Status s; Status s;
if (!read_options.ignore_range_deletions) { if (!read_options.ignore_range_deletions) {
std::unique_ptr<TruncatedRangeDelIterator> mem_tombstone_iter; std::unique_ptr<TruncatedRangeDelIterator> mem_tombstone_iter;
@ -2085,6 +2110,7 @@ InternalIterator* DBImpl::NewInternalIterator(
if (s.ok()) { if (s.ok()) {
super_version->imm->AddIterators( super_version->imm->AddIterators(
read_options, super_version->GetSeqnoToTimeMapping(), read_options, super_version->GetSeqnoToTimeMapping(),
super_version->mutable_cf_options.prefix_extractor.get(),
&merge_iter_builder, !read_options.ignore_range_deletions); &merge_iter_builder, !read_options.ignore_range_deletions);
} }
TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s); TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
@ -2475,7 +2501,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
RecordTick(stats_, MEMTABLE_HIT); RecordTick(stats_, MEMTABLE_HIT);
} }
} }
if (!done && !s.ok() && !s.IsMergeInProgress()) { if (!s.ok() && !s.IsMergeInProgress() && !s.IsNotFound()) {
assert(done);
ReturnAndCleanupSuperVersion(cfd, sv); ReturnAndCleanupSuperVersion(cfd, sv);
return s; return s;
} }
@ -3141,10 +3168,11 @@ Status DBImpl::MultiGetImpl(
StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
assert(sorted_keys); assert(sorted_keys);
assert(start_key + num_keys <= sorted_keys->size());
// Clear the timestamps for returning results so that we can distinguish // Clear the timestamps for returning results so that we can distinguish
// between tombstone or key that has never been written // between tombstone or key that has never been written
for (auto* kctx : *sorted_keys) { for (size_t i = start_key; i < start_key + num_keys; ++i) {
assert(kctx); KeyContext* kctx = (*sorted_keys)[i];
if (kctx->timestamp) { if (kctx->timestamp) {
kctx->timestamp->clear(); kctx->timestamp->clear();
} }
@ -3207,6 +3235,8 @@ Status DBImpl::MultiGetImpl(
s = Status::Aborted(); s = Status::Aborted();
break; break;
} }
// This could be a long-running operation
ROCKSDB_THREAD_YIELD_HOOK();
} }
// Post processing (decrement reference counts and record statistics) // Post processing (decrement reference counts and record statistics)
@ -3690,6 +3720,9 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
edit.SetColumnFamily(cfd->GetID()); edit.SetColumnFamily(cfd->GetID());
Status s; Status s;
// Save re-aquiring lock for RegisterRecordSeqnoTimeWorker when not
// applicable
bool used_preserve_preclude = false;
{ {
InstrumentedMutexLock l(&mutex_); InstrumentedMutexLock l(&mutex_);
if (cfd->IsDropped()) { if (cfd->IsDropped()) {
@ -3705,9 +3738,11 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
write_thread_.ExitUnbatched(&w); write_thread_.ExitUnbatched(&w);
} }
if (s.ok()) { if (s.ok()) {
auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); auto& moptions = *cfd->GetLatestMutableCFOptions();
max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * max_total_in_memory_state_ -=
mutable_cf_options->max_write_buffer_number; moptions.write_buffer_size * moptions.max_write_buffer_number;
used_preserve_preclude = moptions.preserve_internal_time_seconds > 0 ||
moptions.preclude_last_level_data_seconds > 0;
} }
if (!cf_support_snapshot) { if (!cf_support_snapshot) {
@ -3725,8 +3760,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
bg_cv_.SignalAll(); bg_cv_.SignalAll();
} }
if (cfd->ioptions()->preserve_internal_time_seconds > 0 || if (used_preserve_preclude) {
cfd->ioptions()->preclude_last_level_data_seconds > 0) {
s = RegisterRecordSeqnoTimeWorker(read_options, write_options, s = RegisterRecordSeqnoTimeWorker(read_options, write_options,
/* is_new_db */ false); /* is_new_db */ false);
} }
@ -3828,6 +3862,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
} }
} }
if (read_options.tailing) { if (read_options.tailing) {
read_options.total_order_seek |=
immutable_db_options_.prefix_seek_opt_in_only;
auto iter = new ForwardIterator(this, read_options, cfd, sv, auto iter = new ForwardIterator(this, read_options, cfd, sv,
/* allow_unprepared_value */ true); /* allow_unprepared_value */ true);
result = NewDBIterator( result = NewDBIterator(
@ -3961,14 +3998,25 @@ std::unique_ptr<IterType> DBImpl::NewMultiCfIterator(
"Different comparators are being used across CFs")); "Different comparators are being used across CFs"));
} }
} }
std::vector<Iterator*> child_iterators; std::vector<Iterator*> child_iterators;
Status s = NewIterators(_read_options, column_families, &child_iterators); Status s = NewIterators(_read_options, column_families, &child_iterators);
if (!s.ok()) { if (!s.ok()) {
return error_iterator_func(s); return error_iterator_func(s);
} }
return std::make_unique<ImplType>(column_families[0]->GetComparator(),
column_families, assert(column_families.size() == child_iterators.size());
std::move(child_iterators));
std::vector<std::pair<ColumnFamilyHandle*, std::unique_ptr<Iterator>>>
cfh_iter_pairs;
cfh_iter_pairs.reserve(column_families.size());
for (size_t i = 0; i < column_families.size(); ++i) {
cfh_iter_pairs.emplace_back(column_families[i], child_iterators[i]);
}
return std::make_unique<ImplType>(_read_options,
column_families[0]->GetComparator(),
std::move(cfh_iter_pairs));
} }
Status DBImpl::NewIterators( Status DBImpl::NewIterators(
@ -4029,6 +4077,9 @@ Status DBImpl::NewIterators(
assert(cf_sv_pairs.size() == column_families.size()); assert(cf_sv_pairs.size() == column_families.size());
if (read_options.tailing) { if (read_options.tailing) {
read_options.total_order_seek |=
immutable_db_options_.prefix_seek_opt_in_only;
for (const auto& cf_sv_pair : cf_sv_pairs) { for (const auto& cf_sv_pair : cf_sv_pairs) {
auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd, auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
cf_sv_pair.super_version, cf_sv_pair.super_version,
@ -4269,8 +4320,8 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
} }
// Avoid to go through every column family by checking a global threshold // Avoid to go through every column family by checking a global threshold
// first. // first.
CfdList cf_scheduled;
if (oldest_snapshot > bottommost_files_mark_threshold_) { if (oldest_snapshot > bottommost_files_mark_threshold_) {
CfdList cf_scheduled;
for (auto* cfd : *versions_->GetColumnFamilySet()) { for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (!cfd->ioptions()->allow_ingest_behind) { if (!cfd->ioptions()->allow_ingest_behind) {
cfd->current()->storage_info()->UpdateOldestSnapshot( cfd->current()->storage_info()->UpdateOldestSnapshot(
@ -4279,7 +4330,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
->storage_info() ->storage_info()
->BottommostFilesMarkedForCompaction() ->BottommostFilesMarkedForCompaction()
.empty()) { .empty()) {
SchedulePendingCompaction(cfd); EnqueuePendingCompaction(cfd);
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
cf_scheduled.push_back(cfd); cf_scheduled.push_back(cfd);
} }
@ -4302,6 +4353,24 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
} }
bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold; bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
} }
// Avoid to go through every column family by checking a global threshold
// first.
if (oldest_snapshot >= standalone_range_deletion_files_mark_threshold_) {
for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (cfd->IsDropped() || CfdListContains(cf_scheduled, cfd)) {
continue;
}
if (oldest_snapshot >=
cfd->current()
->storage_info()
->standalone_range_tombstone_files_mark_threshold()) {
EnqueuePendingCompaction(cfd);
MaybeScheduleFlushOrCompaction();
cf_scheduled.push_back(cfd);
}
}
}
} }
delete casted_s; delete casted_s;
} }
@ -4677,9 +4746,9 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
// Convert user_key into a corresponding internal key. // Convert user_key into a corresponding internal key.
InternalKey k1(start.value(), kMaxSequenceNumber, kValueTypeForSeek); InternalKey k1(start.value(), kMaxSequenceNumber, kValueTypeForSeek);
InternalKey k2(limit.value(), kMaxSequenceNumber, kValueTypeForSeek); InternalKey k2(limit.value(), kMaxSequenceNumber, kValueTypeForSeek);
MemTable::MemTableStats memStats = ReadOnlyMemTable::MemTableStats memStats =
sv->mem->ApproximateStats(k1.Encode(), k2.Encode()); sv->mem->ApproximateStats(k1.Encode(), k2.Encode());
MemTable::MemTableStats immStats = ReadOnlyMemTable::MemTableStats immStats =
sv->imm->ApproximateStats(k1.Encode(), k2.Encode()); sv->imm->ApproximateStats(k1.Encode(), k2.Encode());
*count = memStats.count + immStats.count; *count = memStats.count + immStats.count;
*size = memStats.size + immStats.size; *size = memStats.size + immStats.size;
@ -4753,6 +4822,24 @@ void DBImpl::ReleaseFileNumberFromPendingOutputs(
} }
} }
std::list<uint64_t>::iterator DBImpl::CaptureOptionsFileNumber() {
// We need to remember the iterator of our insert, because after the
// compaction is done, we need to remove that element from
// min_options_file_numbers_.
min_options_file_numbers_.push_back(versions_->options_file_number());
auto min_options_file_numbers_inserted_elem = min_options_file_numbers_.end();
--min_options_file_numbers_inserted_elem;
return min_options_file_numbers_inserted_elem;
}
void DBImpl::ReleaseOptionsFileNumber(
std::unique_ptr<std::list<uint64_t>::iterator>& v) {
if (v.get() != nullptr) {
min_options_file_numbers_.erase(*v.get());
v.reset();
}
}
Status DBImpl::GetUpdatesSince( Status DBImpl::GetUpdatesSince(
SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter, SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
const TransactionLogIterator::ReadOptions& read_options) { const TransactionLogIterator::ReadOptions& read_options) {
@ -5116,11 +5203,12 @@ Status DBImpl::GetDbIdentity(std::string& identity) const {
return Status::OK(); return Status::OK();
} }
Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const { Status DBImpl::GetDbIdentityFromIdentityFile(const IOOptions& opts,
std::string* identity) const {
std::string idfilename = IdentityFileName(dbname_); std::string idfilename = IdentityFileName(dbname_);
const FileOptions soptions; const FileOptions soptions;
Status s = ReadFileToString(fs_.get(), idfilename, identity); Status s = ReadFileToString(fs_.get(), idfilename, opts, identity);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -5240,6 +5328,14 @@ Status DestroyDB(const std::string& dbname, const Options& options,
Env* env = soptions.env; Env* env = soptions.env;
std::vector<std::string> filenames; std::vector<std::string> filenames;
bool wal_in_db_path = soptions.IsWalDirSameAsDBPath(); bool wal_in_db_path = soptions.IsWalDirSameAsDBPath();
auto sfm = static_cast_with_check<SstFileManagerImpl>(
options.sst_file_manager.get());
// Allocate a separate trash bucket to be used by all the to be deleted
// files, so we can later wait for this bucket to be empty before return.
std::optional<int32_t> bucket;
if (sfm) {
bucket = sfm->NewTrashBucket();
}
// Reset the logger because it holds a handle to the // Reset the logger because it holds a handle to the
// log file and prevents cleanup and directory removal // log file and prevents cleanup and directory removal
@ -5251,6 +5347,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
/*IODebugContext*=*/nullptr) /*IODebugContext*=*/nullptr)
.PermitUncheckedError(); .PermitUncheckedError();
std::set<std::string> paths_to_delete;
FileLock* lock; FileLock* lock;
const std::string lockname = LockFileName(dbname); const std::string lockname = LockFileName(dbname);
Status result = env->LockFile(lockname, &lock); Status result = env->LockFile(lockname, &lock);
@ -5267,10 +5364,9 @@ Status DestroyDB(const std::string& dbname, const Options& options,
del = DestroyDB(path_to_delete, options); del = DestroyDB(path_to_delete, options);
} else if (type == kTableFile || type == kWalFile || } else if (type == kTableFile || type == kWalFile ||
type == kBlobFile) { type == kBlobFile) {
del = DeleteDBFile( del = DeleteUnaccountedDBFile(&soptions, path_to_delete, dbname,
&soptions, path_to_delete, dbname, /*force_bg=*/false,
/*force_bg=*/false, /*force_fg=*/false, bucket);
/*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false);
} else { } else {
del = env->DeleteFile(path_to_delete); del = env->DeleteFile(path_to_delete);
} }
@ -5279,6 +5375,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
} }
} }
} }
paths_to_delete.insert(dbname);
std::set<std::string> paths; std::set<std::string> paths;
for (const DbPath& db_path : options.db_paths) { for (const DbPath& db_path : options.db_paths) {
@ -5300,18 +5397,19 @@ Status DestroyDB(const std::string& dbname, const Options& options,
(type == kTableFile || (type == kTableFile ||
type == kBlobFile)) { // Lock file will be deleted at end type == kBlobFile)) { // Lock file will be deleted at end
std::string file_path = path + "/" + fname; std::string file_path = path + "/" + fname;
Status del = DeleteDBFile(&soptions, file_path, dbname, Status del = DeleteUnaccountedDBFile(&soptions, file_path, dbname,
/*force_bg=*/false, /*force_fg=*/false); /*force_bg=*/false,
/*force_fg=*/false, bucket);
if (!del.ok() && result.ok()) { if (!del.ok() && result.ok()) {
result = del; result = del;
} }
} }
} }
// TODO: Should we return an error if we cannot delete the directory?
env->DeleteDir(path).PermitUncheckedError();
} }
} }
paths_to_delete.merge(paths);
std::vector<std::string> walDirFiles; std::vector<std::string> walDirFiles;
std::string archivedir = ArchivalDirectory(dbname); std::string archivedir = ArchivalDirectory(dbname);
bool wal_dir_exists = false; bool wal_dir_exists = false;
@ -5335,46 +5433,49 @@ Status DestroyDB(const std::string& dbname, const Options& options,
// Delete archival files. // Delete archival files.
for (const auto& file : archiveFiles) { for (const auto& file : archiveFiles) {
if (ParseFileName(file, &number, &type) && type == kWalFile) { if (ParseFileName(file, &number, &type) && type == kWalFile) {
Status del = Status del = DeleteUnaccountedDBFile(
DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, &soptions, archivedir + "/" + file, archivedir,
/*force_bg=*/false, /*force_fg=*/!wal_in_db_path); /*force_bg=*/false, /*force_fg=*/!wal_in_db_path, bucket);
if (!del.ok() && result.ok()) { if (!del.ok() && result.ok()) {
result = del; result = del;
} }
} }
} }
// Ignore error in case dir contains other files paths_to_delete.insert(archivedir);
env->DeleteDir(archivedir).PermitUncheckedError();
} }
// Delete log files in the WAL dir // Delete log files in the WAL dir
if (wal_dir_exists) { if (wal_dir_exists) {
for (const auto& file : walDirFiles) { for (const auto& file : walDirFiles) {
if (ParseFileName(file, &number, &type) && type == kWalFile) { if (ParseFileName(file, &number, &type) && type == kWalFile) {
Status del = Status del = DeleteUnaccountedDBFile(
DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), &soptions, LogFileName(soptions.wal_dir, number),
soptions.wal_dir, /*force_bg=*/false, soptions.wal_dir, /*force_bg=*/false,
/*force_fg=*/!wal_in_db_path); /*force_fg=*/!wal_in_db_path, bucket);
if (!del.ok() && result.ok()) { if (!del.ok() && result.ok()) {
result = del; result = del;
} }
} }
} }
// Ignore error in case dir contains other files paths_to_delete.insert(soptions.wal_dir);
env->DeleteDir(soptions.wal_dir).PermitUncheckedError();
} }
// Ignore error since state is already gone // Ignore error since state is already gone
env->UnlockFile(lock).PermitUncheckedError(); env->UnlockFile(lock).PermitUncheckedError();
env->DeleteFile(lockname).PermitUncheckedError(); env->DeleteFile(lockname).PermitUncheckedError();
// Make sure trash files are all cleared before return.
if (sfm && bucket.has_value()) {
sfm->WaitForEmptyTrashBucket(bucket.value());
}
// sst_file_manager holds a ref to the logger. Make sure the logger is // sst_file_manager holds a ref to the logger. Make sure the logger is
// gone before trying to remove the directory. // gone before trying to remove the directory.
soptions.sst_file_manager.reset(); soptions.sst_file_manager.reset();
// Ignore error in case dir contains other files // Ignore error in case dir contains other files
env->DeleteDir(dbname).PermitUncheckedError(); for (const auto& path_to_delete : paths_to_delete) {
; env->DeleteDir(path_to_delete).PermitUncheckedError();
}
} }
return result; return result;
} }
@ -5772,7 +5873,6 @@ Status DBImpl::IngestExternalFile(
Status DBImpl::IngestExternalFiles( Status DBImpl::IngestExternalFiles(
const std::vector<IngestExternalFileArg>& args) { const std::vector<IngestExternalFileArg>& args) {
// TODO: plumb Env::IOActivity, Env::IOPriority // TODO: plumb Env::IOActivity, Env::IOPriority
const ReadOptions read_options;
const WriteOptions write_options; const WriteOptions write_options;
if (args.empty()) { if (args.empty()) {
@ -5798,6 +5898,10 @@ Status DBImpl::IngestExternalFiles(
snprintf(err_msg, 128, "external_files[%zu] is empty", i); snprintf(err_msg, 128, "external_files[%zu] is empty", i);
return Status::InvalidArgument(err_msg); return Status::InvalidArgument(err_msg);
} }
if (i && args[i].options.fill_cache != args[i - 1].options.fill_cache) {
return Status::InvalidArgument(
"fill_cache should be the same across ingestion options.");
}
} }
for (const auto& arg : args) { for (const auto& arg : args) {
const IngestExternalFileOptions& ingest_opts = arg.options; const IngestExternalFileOptions& ingest_opts = arg.options;
@ -5820,11 +5924,10 @@ Status DBImpl::IngestExternalFiles(
"write_global_seqno is deprecated and does not work with " "write_global_seqno is deprecated and does not work with "
"allow_db_generated_files."); "allow_db_generated_files.");
} }
if (ingest_opts.move_files) { }
return Status::NotSupported( if (ingest_opts.move_files && ingest_opts.link_files) {
"Options move_files and allow_db_generated_files are not " return Status::InvalidArgument(
"compatible."); "`move_files` and `link_files` can not both be true.");
}
} }
} }
@ -5858,9 +5961,9 @@ Status DBImpl::IngestExternalFiles(
uint64_t start_file_number = next_file_number; uint64_t start_file_number = next_file_number;
for (size_t i = 1; i != num_cfs; ++i) { for (size_t i = 1; i != num_cfs; ++i) {
start_file_number += args[i - 1].external_files.size(); start_file_number += args[i - 1].external_files.size();
auto* cfd = SuperVersion* super_version =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd(); ingestion_jobs[i].GetColumnFamilyData()->GetReferencedSuperVersion(
SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); this);
Status es = ingestion_jobs[i].Prepare( Status es = ingestion_jobs[i].Prepare(
args[i].external_files, args[i].files_checksums, args[i].external_files, args[i].files_checksums,
args[i].files_checksum_func_names, args[i].file_temperature, args[i].files_checksum_func_names, args[i].file_temperature,
@ -5874,9 +5977,9 @@ Status DBImpl::IngestExternalFiles(
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0"); TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"); TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
{ {
auto* cfd = SuperVersion* super_version =
static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd(); ingestion_jobs[0].GetColumnFamilyData()->GetReferencedSuperVersion(
SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); this);
Status es = ingestion_jobs[0].Prepare( Status es = ingestion_jobs[0].Prepare(
args[0].external_files, args[0].files_checksums, args[0].external_files, args[0].files_checksums,
args[0].files_checksum_func_names, args[0].file_temperature, args[0].files_checksum_func_names, args[0].file_temperature,
@ -5927,8 +6030,7 @@ Status DBImpl::IngestExternalFiles(
bool at_least_one_cf_need_flush = false; bool at_least_one_cf_need_flush = false;
std::vector<bool> need_flush(num_cfs, false); std::vector<bool> need_flush(num_cfs, false);
for (size_t i = 0; i != num_cfs; ++i) { for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd = auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
if (cfd->IsDropped()) { if (cfd->IsDropped()) {
// TODO (yanqin) investigate whether we should abort ingestion or // TODO (yanqin) investigate whether we should abort ingestion or
// proceed with other non-dropped column families. // proceed with other non-dropped column families.
@ -5960,12 +6062,10 @@ Status DBImpl::IngestExternalFiles(
for (size_t i = 0; i != num_cfs; ++i) { for (size_t i = 0; i != num_cfs; ++i) {
if (need_flush[i]) { if (need_flush[i]) {
mutex_.Unlock(); mutex_.Unlock();
auto* cfd = status =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family) FlushMemTable(ingestion_jobs[i].GetColumnFamilyData(),
->cfd(); flush_opts, FlushReason::kExternalFileIngestion,
status = FlushMemTable(cfd, flush_opts, true /* entered_write_thread */);
FlushReason::kExternalFileIngestion,
true /* entered_write_thread */);
mutex_.Lock(); mutex_.Lock();
if (!status.ok()) { if (!status.ok()) {
break; break;
@ -5973,6 +6073,13 @@ Status DBImpl::IngestExternalFiles(
} }
} }
} }
if (status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) {
if (immutable_db_options_.atomic_flush || need_flush[i]) {
ingestion_jobs[i].SetFlushedBeforeRun();
}
}
}
} }
// Run ingestion jobs. // Run ingestion jobs.
if (status.ok()) { if (status.ok()) {
@ -5986,16 +6093,15 @@ Status DBImpl::IngestExternalFiles(
} }
} }
if (status.ok()) { if (status.ok()) {
ReadOptions read_options;
read_options.fill_cache = args[0].options.fill_cache;
autovector<ColumnFamilyData*> cfds_to_commit; autovector<ColumnFamilyData*> cfds_to_commit;
autovector<const MutableCFOptions*> mutable_cf_options_list; autovector<const MutableCFOptions*> mutable_cf_options_list;
autovector<autovector<VersionEdit*>> edit_lists; autovector<autovector<VersionEdit*>> edit_lists;
uint32_t num_entries = 0; uint32_t num_entries = 0;
for (size_t i = 0; i != num_cfs; ++i) { for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd = auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd(); assert(!cfd->IsDropped());
if (cfd->IsDropped()) {
continue;
}
cfds_to_commit.push_back(cfd); cfds_to_commit.push_back(cfd);
mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions()); mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
autovector<VersionEdit*> edit_list; autovector<VersionEdit*> edit_list;
@ -6045,20 +6151,16 @@ Status DBImpl::IngestExternalFiles(
if (status.ok()) { if (status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) { for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd = auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd(); assert(!cfd->IsDropped());
if (!cfd->IsDropped()) { InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i], *cfd->GetLatestMutableCFOptions());
*cfd->GetLatestMutableCFOptions());
#ifndef NDEBUG #ifndef NDEBUG
if (0 == i && num_cfs > 1) { if (0 == i && num_cfs > 1) {
TEST_SYNC_POINT( TEST_SYNC_POINT("DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0"); TEST_SYNC_POINT("DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
TEST_SYNC_POINT(
"DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
}
#endif // !NDEBUG
} }
#endif // !NDEBUG
} }
} else if (versions_->io_status().IsIOError()) { } else if (versions_->io_status().IsIOError()) {
// Error while writing to MANIFEST. // Error while writing to MANIFEST.
@ -6100,8 +6202,7 @@ Status DBImpl::IngestExternalFiles(
} }
if (status.ok()) { if (status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) { for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd = auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
if (!cfd->IsDropped()) { if (!cfd->IsDropped()) {
NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]); NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
} }
@ -6732,6 +6833,62 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
} }
} }
void DBImpl::TrackOrUntrackFiles(
const std::vector<std::string>& existing_data_files, bool track) {
auto sfm = static_cast_with_check<SstFileManagerImpl>(
immutable_db_options_.sst_file_manager.get());
assert(sfm);
std::vector<ColumnFamilyMetaData> metadata;
GetAllColumnFamilyMetaData(&metadata);
auto action = [&](const std::string& file_path,
std::optional<uint64_t> size) {
if (track) {
if (size) {
sfm->OnAddFile(file_path, *size).PermitUncheckedError();
} else {
sfm->OnAddFile(file_path).PermitUncheckedError();
}
} else {
sfm->OnUntrackFile(file_path).PermitUncheckedError();
}
};
std::unordered_set<std::string> referenced_files;
for (const auto& md : metadata) {
for (const auto& lmd : md.levels) {
for (const auto& fmd : lmd.files) {
// We're assuming that each sst file name exists in at most one of
// the paths.
std::string file_path =
fmd.directory + kFilePathSeparator + fmd.relative_filename;
action(file_path, fmd.size);
referenced_files.insert(file_path);
}
}
for (const auto& bmd : md.blob_files) {
std::string name = bmd.blob_file_name;
// The BlobMetaData.blob_file_name may start with "/".
if (!name.empty() && name[0] == kFilePathSeparator) {
name = name.substr(1);
}
// We're assuming that each blob file name exists in at most one of
// the paths.
std::string file_path = bmd.blob_file_path + kFilePathSeparator + name;
action(file_path, bmd.blob_file_size);
referenced_files.insert(file_path);
}
}
for (const auto& file_path : existing_data_files) {
if (referenced_files.find(file_path) != referenced_files.end()) {
continue;
}
// There shouldn't be any duplicated files. In case there is, SstFileManager
// will take care of deduping it.
action(file_path, /*size=*/std::nullopt);
}
}
void DBImpl::InstallSeqnoToTimeMappingInSV( void DBImpl::InstallSeqnoToTimeMappingInSV(
std::vector<SuperVersionContext>* sv_contexts) { std::vector<SuperVersionContext>* sv_contexts) {
mutex_.AssertHeld(); mutex_.AssertHeld();

View File

@ -482,7 +482,8 @@ class DBImpl : public DB {
Status GetDbIdentity(std::string& identity) const override; Status GetDbIdentity(std::string& identity) const override;
virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const; virtual Status GetDbIdentityFromIdentityFile(const IOOptions& opts,
std::string* identity) const;
Status GetDbSessionId(std::string& session_id) const override; Status GetDbSessionId(std::string& session_id) const override;
@ -853,6 +854,8 @@ class DBImpl : public DB {
uint64_t GetObsoleteSstFilesSize(); uint64_t GetObsoleteSstFilesSize();
uint64_t MinOptionsFileNumberToKeep();
// Returns the list of live files in 'live' and the list // Returns the list of live files in 'live' and the list
// of all files in the filesystem in 'candidate_files'. // of all files in the filesystem in 'candidate_files'.
// If force == false and the last call was less than // If force == false and the last call was less than
@ -1151,6 +1154,8 @@ class DBImpl : public DB {
// Get the background error status // Get the background error status
Status TEST_GetBGError(); Status TEST_GetBGError();
bool TEST_IsRecoveryInProgress();
// Return the maximum overlapping data (in bytes) at next level for any // Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1. // file at a level >= 1.
uint64_t TEST_MaxNextLevelOverlappingBytes( uint64_t TEST_MaxNextLevelOverlappingBytes(
@ -1195,9 +1200,7 @@ class DBImpl : public DB {
uint64_t TEST_total_log_size() const { return total_log_size_; } uint64_t TEST_total_log_size() const { return total_log_size_; }
// Returns column family name to ImmutableCFOptions map. void TEST_GetAllBlockCaches(std::unordered_set<const Cache*>* cache_set);
Status TEST_GetAllImmutableCFOptions(
std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
// Return the lastest MutableCFOptions of a column family // Return the lastest MutableCFOptions of a column family
Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
@ -1226,6 +1229,8 @@ class DBImpl : public DB {
return logs_.back().number; return logs_.back().number;
} }
void TEST_DeleteObsoleteFiles();
const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const { const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
return files_grabbed_for_purge_; return files_grabbed_for_purge_;
} }
@ -1235,9 +1240,14 @@ class DBImpl : public DB {
static Status TEST_ValidateOptions(const DBOptions& db_options) { static Status TEST_ValidateOptions(const DBOptions& db_options) {
return ValidateOptions(db_options); return ValidateOptions(db_options);
} }
#endif // NDEBUG #endif // NDEBUG
// In certain configurations, verify that the table/blob file cache only
// contains entries for live files, to check for effective leaks of open
// files. This can only be called when purging of obsolete files has
// "settled," such as during parts of DB Close().
void TEST_VerifyNoObsoleteFilesCached(bool db_mutex_already_held) const;
// persist stats to column family "_persistent_stats" // persist stats to column family "_persistent_stats"
void PersistStats(); void PersistStats();
@ -1580,11 +1590,12 @@ class DBImpl : public DB {
virtual bool OwnTablesAndLogs() const { return true; } virtual bool OwnTablesAndLogs() const { return true; }
// Setup DB identity file, and write DB ID to manifest if necessary. // Read/create DB identity file (as appropriate), and write DB ID to
// version_edit if provided.
Status SetupDBId(const WriteOptions& write_options, bool read_only, Status SetupDBId(const WriteOptions& write_options, bool read_only,
RecoveryContext* recovery_ctx); bool is_new_db, bool is_retry, VersionEdit* version_edit);
// Assign db_id_ and write DB ID to manifest if necessary. // Assign db_id_ and write DB ID to version_edit if provided.
void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx); void SetDBId(std::string&& id, bool read_only, VersionEdit* version_edit);
// Collect a deduplicated collection of paths used by this DB, including // Collect a deduplicated collection of paths used by this DB, including
// dbname_, DBOptions.db_paths, ColumnFamilyOptions.cf_paths. // dbname_, DBOptions.db_paths, ColumnFamilyOptions.cf_paths.
@ -1614,9 +1625,15 @@ class DBImpl : public DB {
// vast majority of all files), since it already has the file size // vast majority of all files), since it already has the file size
// on record, we don't need to query the file system. Otherwise, we query the // on record, we don't need to query the file system. Otherwise, we query the
// file system for the size of an unreferenced file. // file system for the size of an unreferenced file.
// REQUIRES: mutex unlocked
void TrackExistingDataFiles( void TrackExistingDataFiles(
const std::vector<std::string>& existing_data_files); const std::vector<std::string>& existing_data_files);
// Untrack data files in sst manager. This is only called during DB::Close on
// an unowned SstFileManager, to return it to a consistent state.
// REQUIRES: mutex unlocked
void UntrackDataFiles();
// SetDbSessionId() should be called in the constuctor DBImpl() // SetDbSessionId() should be called in the constuctor DBImpl()
// to ensure that db_session_id_ gets updated every time the DB is opened // to ensure that db_session_id_ gets updated every time the DB is opened
void SetDbSessionId(); void SetDbSessionId();
@ -1683,6 +1700,8 @@ class DBImpl : public DB {
friend class XFTransactionWriteHandler; friend class XFTransactionWriteHandler;
friend class DBBlobIndexTest; friend class DBBlobIndexTest;
friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
friend class CompactionServiceTest_PreservedOptionsLocalCompaction_Test;
friend class CompactionServiceTest_PreservedOptionsRemoteCompaction_Test;
#endif #endif
struct CompactionState; struct CompactionState;
@ -1691,7 +1710,7 @@ class DBImpl : public DB {
struct WriteContext { struct WriteContext {
SuperVersionContext superversion_context; SuperVersionContext superversion_context;
autovector<MemTable*> memtables_to_free_; autovector<ReadOnlyMemTable*> memtables_to_free_;
explicit WriteContext(bool create_superversion = false) explicit WriteContext(bool create_superversion = false)
: superversion_context(create_superversion) {} : superversion_context(create_superversion) {}
@ -1954,6 +1973,13 @@ class DBImpl : public DB {
void ReleaseFileNumberFromPendingOutputs( void ReleaseFileNumberFromPendingOutputs(
std::unique_ptr<std::list<uint64_t>::iterator>& v); std::unique_ptr<std::list<uint64_t>::iterator>& v);
// Similar to pending_outputs, preserve OPTIONS file. Used for remote
// compaction.
std::list<uint64_t>::iterator CaptureOptionsFileNumber();
void ReleaseOptionsFileNumber(
std::unique_ptr<std::list<uint64_t>::iterator>& v);
// Sets bg error if there is an error writing to WAL.
IOStatus SyncClosedWals(const WriteOptions& write_options, IOStatus SyncClosedWals(const WriteOptions& write_options,
JobContext* job_context, VersionEdit* synced_wals, JobContext* job_context, VersionEdit* synced_wals,
bool error_recovery_in_prog); bool error_recovery_in_prog);
@ -2026,6 +2052,8 @@ class DBImpl : public DB {
Status TrimMemtableHistory(WriteContext* context); Status TrimMemtableHistory(WriteContext* context);
// Switches the current live memtable to immutable/read-only memtable.
// A new WAL is created if the current WAL is not empty.
Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
// Select and output column families qualified for atomic flush in // Select and output column families qualified for atomic flush in
@ -2064,17 +2092,18 @@ class DBImpl : public DB {
// memtable pending flush. // memtable pending flush.
// resuming_from_bg_err indicates whether the caller is attempting to resume // resuming_from_bg_err indicates whether the caller is attempting to resume
// from background error. // from background error.
Status WaitForFlushMemTable(ColumnFamilyData* cfd, Status WaitForFlushMemTable(
const uint64_t* flush_memtable_id = nullptr, ColumnFamilyData* cfd, const uint64_t* flush_memtable_id = nullptr,
bool resuming_from_bg_err = false) { bool resuming_from_bg_err = false,
std::optional<FlushReason> flush_reason = std::nullopt) {
return WaitForFlushMemTables({cfd}, {flush_memtable_id}, return WaitForFlushMemTables({cfd}, {flush_memtable_id},
resuming_from_bg_err); resuming_from_bg_err, flush_reason);
} }
// Wait for memtables to be flushed for multiple column families. // Wait for memtables to be flushed for multiple column families.
Status WaitForFlushMemTables( Status WaitForFlushMemTables(
const autovector<ColumnFamilyData*>& cfds, const autovector<ColumnFamilyData*>& cfds,
const autovector<const uint64_t*>& flush_memtable_ids, const autovector<const uint64_t*>& flush_memtable_ids,
bool resuming_from_bg_err); bool resuming_from_bg_err, std::optional<FlushReason> flush_reason);
inline void WaitForPendingWrites() { inline void WaitForPendingWrites() {
mutex_.AssertHeld(); mutex_.AssertHeld();
@ -2172,7 +2201,7 @@ class DBImpl : public DB {
// Used by WriteImpl to update bg_error_ when IO error happens, e.g., write // Used by WriteImpl to update bg_error_ when IO error happens, e.g., write
// WAL, sync WAL fails, if paranoid check is enabled. // WAL, sync WAL fails, if paranoid check is enabled.
void IOStatusCheck(const IOStatus& status); void WALIOStatusCheck(const IOStatus& status);
// Used by WriteImpl to update bg_error_ in case of memtable insert error. // Used by WriteImpl to update bg_error_ in case of memtable insert error.
void MemTableInsertStatusCheck(const Status& memtable_insert_status); void MemTableInsertStatusCheck(const Status& memtable_insert_status);
@ -2185,7 +2214,9 @@ class DBImpl : public DB {
JobContext* job_context, LogBuffer* log_buffer, JobContext* job_context, LogBuffer* log_buffer,
CompactionJobInfo* compaction_job_info); CompactionJobInfo* compaction_job_info);
ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); // REQUIRES: mutex unlocked
void TrackOrUntrackFiles(const std::vector<std::string>& existing_data_files,
bool track);
void MaybeScheduleFlushOrCompaction(); void MaybeScheduleFlushOrCompaction();
@ -2214,10 +2245,27 @@ class DBImpl : public DB {
void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds, void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
FlushReason flush_reason, FlushRequest* req); FlushReason flush_reason, FlushRequest* req);
// Returns true if `req` is successfully enqueued. // Below functions are for executing flush, compaction in the background. A
bool SchedulePendingFlush(const FlushRequest& req); // dequeue is the communication channel between threads that asks for the work
// to be done and the available threads in the thread pool that pick it up to
// execute it. We use these terminologies to describe the state of the work
// and its transitions:
// 1) It becomes pending once it's successfully enqueued into the
// corresponding dequeue, a work in this state is also called unscheduled.
// Counter `unscheduled_*_` counts work in this state.
// 2) When `MaybeScheduleFlushOrCompaction` schedule a thread to run `BGWork*`
// for the work, it becomes scheduled
// Counter `bg_*_scheduled_` counts work in this state.
// 3) Once the thread start to execute `BGWork*`, the work is popped from the
// dequeue, it is now in running state
// Counter `num_running_*_` counts work in this state.
// 4) Eventually, the work is finished. We don't need to specifically track
// finished work.
void SchedulePendingCompaction(ColumnFamilyData* cfd); // Returns true if `req` is successfully enqueued.
bool EnqueuePendingFlush(const FlushRequest& req);
void EnqueuePendingCompaction(ColumnFamilyData* cfd);
void SchedulePendingPurge(std::string fname, std::string dir_to_sync, void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
FileType type, uint64_t number, int job_id); FileType type, uint64_t number, int job_id);
static void BGWorkCompaction(void* arg); static void BGWorkCompaction(void* arg);
@ -2722,6 +2770,11 @@ class DBImpl : public DB {
// State is protected with db mutex. // State is protected with db mutex.
std::list<uint64_t> pending_outputs_; std::list<uint64_t> pending_outputs_;
// Similar to pending_outputs_, FindObsoleteFiles()/PurgeObsoleteFiles() never
// deletes any OPTIONS file that has number bigger than any of the file number
// in min_options_file_numbers_.
std::list<uint64_t> min_options_file_numbers_;
// flush_queue_ and compaction_queue_ hold column families that we need to // flush_queue_ and compaction_queue_ hold column families that we need to
// flush and compact, respectively. // flush and compact, respectively.
// A column family is inserted into flush_queue_ when it satisfies condition // A column family is inserted into flush_queue_ when it satisfies condition
@ -2844,6 +2897,11 @@ class DBImpl : public DB {
// garbages, among all column families. // garbages, among all column families.
SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber; SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
// The min threshold to trigger compactions for standalone range deletion
// files that are marked for compaction.
SequenceNumber standalone_range_deletion_files_mark_threshold_ =
kMaxSequenceNumber;
LogsWithPrepTracker logs_with_prep_tracker_; LogsWithPrepTracker logs_with_prep_tracker_;
// Callback for compaction to check if a key is visible to a snapshot. // Callback for compaction to check if a key is visible to a snapshot.
@ -2944,6 +3002,15 @@ DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions, CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options); const MutableCFOptions& mutable_cf_options);
// Return a VersionEdit for the DB's recovery when the `memtables` of the
// specified column family are obsolete. Specifically, the min log number to
// keep, and the WAL files that can be deleted.
VersionEdit GetDBRecoveryEditForObsoletingMemTables(
VersionSet* vset, const ColumnFamilyData& cfd,
const autovector<VersionEdit*>& edit_list,
const autovector<ReadOnlyMemTable*>& memtables,
LogsWithPrepTracker* prep_tracker);
// Return the earliest log file to keep after the memtable flush is // Return the earliest log file to keep after the memtable flush is
// finalized. // finalized.
// `cfd_to_flush` is the column family whose memtable (specified in // `cfd_to_flush` is the column family whose memtable (specified in
@ -2953,13 +3020,13 @@ CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions,
uint64_t PrecomputeMinLogNumberToKeep2PC( uint64_t PrecomputeMinLogNumberToKeep2PC(
VersionSet* vset, const ColumnFamilyData& cfd_to_flush, VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
const autovector<VersionEdit*>& edit_list, const autovector<VersionEdit*>& edit_list,
const autovector<MemTable*>& memtables_to_flush, const autovector<ReadOnlyMemTable*>& memtables_to_flush,
LogsWithPrepTracker* prep_tracker); LogsWithPrepTracker* prep_tracker);
// For atomic flush. // For atomic flush.
uint64_t PrecomputeMinLogNumberToKeep2PC( uint64_t PrecomputeMinLogNumberToKeep2PC(
VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush, VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
const autovector<autovector<VersionEdit*>>& edit_lists, const autovector<autovector<VersionEdit*>>& edit_lists,
const autovector<const autovector<MemTable*>*>& memtables_to_flush, const autovector<const autovector<ReadOnlyMemTable*>*>& memtables_to_flush,
LogsWithPrepTracker* prep_tracker); LogsWithPrepTracker* prep_tracker);
// In non-2PC mode, WALs with log number < the returned number can be // In non-2PC mode, WALs with log number < the returned number can be
@ -2976,11 +3043,11 @@ uint64_t PrecomputeMinLogNumberToKeepNon2PC(
// will not depend on any WAL file. nullptr means no memtable is being flushed. // will not depend on any WAL file. nullptr means no memtable is being flushed.
// The function is only applicable to 2pc mode. // The function is only applicable to 2pc mode.
uint64_t FindMinPrepLogReferencedByMemTable( uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset, const autovector<MemTable*>& memtables_to_flush); VersionSet* vset, const autovector<ReadOnlyMemTable*>& memtables_to_flush);
// For atomic flush. // For atomic flush.
uint64_t FindMinPrepLogReferencedByMemTable( uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset, VersionSet* vset,
const autovector<const autovector<MemTable*>*>& memtables_to_flush); const autovector<const autovector<ReadOnlyMemTable*>*>& memtables_to_flush);
// Fix user-supplied options to be reasonable // Fix user-supplied options to be reasonable
template <class T, class V> template <class T, class V>

View File

@ -753,7 +753,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
if (s.ok()) { if (s.ok()) {
autovector<ColumnFamilyData*> tmp_cfds; autovector<ColumnFamilyData*> tmp_cfds;
autovector<const autovector<MemTable*>*> mems_list; autovector<const autovector<ReadOnlyMemTable*>*> mems_list;
autovector<const MutableCFOptions*> mutable_cf_options_list; autovector<const MutableCFOptions*> mutable_cf_options_list;
autovector<FileMetaData*> tmp_file_meta; autovector<FileMetaData*> tmp_file_meta;
autovector<std::list<std::unique_ptr<FlushJobInfo>>*> autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
@ -1457,11 +1457,6 @@ Status DBImpl::CompactFilesImpl(
input_set.insert(TableFileNameToNumber(file_name)); input_set.insert(TableFileNameToNumber(file_name));
} }
ColumnFamilyMetaData cf_meta;
// TODO(yhchiang): can directly use version here if none of the
// following functions call is pluggable to external developers.
version->GetColumnFamilyMetaData(&cf_meta);
if (output_path_id < 0) { if (output_path_id < 0) {
if (cfd->ioptions()->cf_paths.size() == 1U) { if (cfd->ioptions()->cf_paths.size() == 1U) {
output_path_id = 0; output_path_id = 0;
@ -1482,7 +1477,7 @@ Status DBImpl::CompactFilesImpl(
std::vector<CompactionInputFiles> input_files; std::vector<CompactionInputFiles> input_files;
Status s = cfd->compaction_picker()->SanitizeAndConvertCompactionInputFiles( Status s = cfd->compaction_picker()->SanitizeAndConvertCompactionInputFiles(
&input_set, cf_meta, output_level, version->storage_info(), &input_files); &input_set, output_level, version, &input_files);
TEST_SYNC_POINT( TEST_SYNC_POINT(
"DBImpl::CompactFilesImpl::PostSanitizeAndConvertCompactionInputFiles"); "DBImpl::CompactFilesImpl::PostSanitizeAndConvertCompactionInputFiles");
if (!s.ok()) { if (!s.ok()) {
@ -1561,6 +1556,12 @@ Status DBImpl::CompactFilesImpl(
compaction_job.Prepare(); compaction_job.Prepare();
std::unique_ptr<std::list<uint64_t>::iterator> min_options_file_number_elem;
if (immutable_db_options().compaction_service != nullptr) {
min_options_file_number_elem.reset(
new std::list<uint64_t>::iterator(CaptureOptionsFileNumber()));
}
mutex_.Unlock(); mutex_.Unlock();
TEST_SYNC_POINT("CompactFilesImpl:0"); TEST_SYNC_POINT("CompactFilesImpl:0");
TEST_SYNC_POINT("CompactFilesImpl:1"); TEST_SYNC_POINT("CompactFilesImpl:1");
@ -1570,6 +1571,10 @@ Status DBImpl::CompactFilesImpl(
TEST_SYNC_POINT("CompactFilesImpl:3"); TEST_SYNC_POINT("CompactFilesImpl:3");
mutex_.Lock(); mutex_.Lock();
if (immutable_db_options().compaction_service != nullptr) {
ReleaseOptionsFileNumber(min_options_file_number_elem);
}
bool compaction_released = false; bool compaction_released = false;
Status status = Status status =
compaction_job.Install(*c->mutable_cf_options(), &compaction_released); compaction_job.Install(*c->mutable_cf_options(), &compaction_released);
@ -1852,8 +1857,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
mutable_cf_options.compression_opts, mutable_cf_options.compression_opts,
mutable_cf_options.default_write_temperature, mutable_cf_options.default_write_temperature,
0 /* max_subcompactions, not applicable */, 0 /* max_subcompactions, not applicable */,
{} /* grandparents, not applicable */, false /* is manual */, {} /* grandparents, not applicable */,
"" /* trim_ts */, -1 /* score, not applicable */, std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */,
false /* is deletion compaction, not applicable */, false /* is deletion compaction, not applicable */,
false /* l0_files_might_overlap, not applicable */, false /* l0_files_might_overlap, not applicable */,
CompactionReason::kRefitLevel)); CompactionReason::kRefitLevel));
@ -1880,7 +1886,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
Status status = versions_->LogAndApply(cfd, mutable_cf_options, Status status = versions_->LogAndApply(cfd, mutable_cf_options,
read_options, write_options, &edit, read_options, write_options, &edit,
&mutex_, directories_.GetDbDir()); &mutex_, directories_.GetDbDir());
c->MarkFilesBeingCompacted(false);
cfd->compaction_picker()->UnregisterCompaction(c.get()); cfd->compaction_picker()->UnregisterCompaction(c.get());
c.reset(); c.reset();
@ -2377,7 +2383,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
ColumnFamilyData* loop_cfd = ColumnFamilyData* loop_cfd =
req.cfd_to_max_mem_id_to_persist.begin()->first; req.cfd_to_max_mem_id_to_persist.begin()->first;
bool already_queued_for_flush = loop_cfd->queued_for_flush(); bool already_queued_for_flush = loop_cfd->queued_for_flush();
bool flush_req_enqueued = SchedulePendingFlush(req); bool flush_req_enqueued = EnqueuePendingFlush(req);
if (already_queued_for_flush || flush_req_enqueued) { if (already_queued_for_flush || flush_req_enqueued) {
loop_cfd->SetFlushSkipReschedule(); loop_cfd->SetFlushSkipReschedule();
} }
@ -2407,7 +2413,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
} }
s = WaitForFlushMemTables( s = WaitForFlushMemTables(
cfds, flush_memtable_ids, cfds, flush_memtable_ids,
flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */); flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */,
flush_reason);
InstrumentedMutexLock lock_guard(&mutex_); InstrumentedMutexLock lock_guard(&mutex_);
for (auto* tmp_cfd : cfds) { for (auto* tmp_cfd : cfds) {
tmp_cfd->UnrefAndTryDelete(); tmp_cfd->UnrefAndTryDelete();
@ -2528,7 +2535,7 @@ Status DBImpl::AtomicFlushMemTables(
} }
} }
GenerateFlushRequest(cfds, flush_reason, &flush_req); GenerateFlushRequest(cfds, flush_reason, &flush_req);
SchedulePendingFlush(flush_req); EnqueuePendingFlush(flush_req);
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
} }
@ -2549,7 +2556,8 @@ Status DBImpl::AtomicFlushMemTables(
} }
s = WaitForFlushMemTables( s = WaitForFlushMemTables(
cfds, flush_memtable_ids, cfds, flush_memtable_ids,
flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */); flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */,
flush_reason);
InstrumentedMutexLock lock_guard(&mutex_); InstrumentedMutexLock lock_guard(&mutex_);
for (auto* cfd : cfds) { for (auto* cfd : cfds) {
cfd->UnrefAndTryDelete(); cfd->UnrefAndTryDelete();
@ -2583,7 +2591,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
if (immutable_db_options_.atomic_flush) { if (immutable_db_options_.atomic_flush) {
FlushRequest flush_req; FlushRequest flush_req;
GenerateFlushRequest(cfds, flush_reason, &flush_req); GenerateFlushRequest(cfds, flush_reason, &flush_req);
SchedulePendingFlush(flush_req); EnqueuePendingFlush(flush_req);
for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) {
flush_memtable_ids.push_back(iter.second); flush_memtable_ids.push_back(iter.second);
} }
@ -2597,7 +2605,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
flush_reason, flush_reason,
{{cfd, {{cfd,
std::numeric_limits<uint64_t>::max() /* max_mem_id_to_persist */}}}; std::numeric_limits<uint64_t>::max() /* max_mem_id_to_persist */}}};
if (SchedulePendingFlush(flush_req)) { if (EnqueuePendingFlush(flush_req)) {
cfd->SetFlushSkipReschedule(); cfd->SetFlushSkipReschedule();
}; };
} }
@ -2612,7 +2620,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
flush_memtable_id_ptrs.push_back(&flush_memtable_id); flush_memtable_id_ptrs.push_back(&flush_memtable_id);
} }
s = WaitForFlushMemTables(cfds, flush_memtable_id_ptrs, s = WaitForFlushMemTables(cfds, flush_memtable_id_ptrs,
true /* resuming_from_bg_err */); true /* resuming_from_bg_err */, flush_reason);
mutex_.Lock(); mutex_.Lock();
} }
@ -2712,7 +2720,7 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
Status DBImpl::WaitForFlushMemTables( Status DBImpl::WaitForFlushMemTables(
const autovector<ColumnFamilyData*>& cfds, const autovector<ColumnFamilyData*>& cfds,
const autovector<const uint64_t*>& flush_memtable_ids, const autovector<const uint64_t*>& flush_memtable_ids,
bool resuming_from_bg_err) { bool resuming_from_bg_err, std::optional<FlushReason> flush_reason) {
int num = static_cast<int>(cfds.size()); int num = static_cast<int>(cfds.size());
// Wait until the compaction completes // Wait until the compaction completes
InstrumentedMutexLock l(&mutex_); InstrumentedMutexLock l(&mutex_);
@ -2750,7 +2758,15 @@ Status DBImpl::WaitForFlushMemTables(
(flush_memtable_ids[i] != nullptr && (flush_memtable_ids[i] != nullptr &&
cfds[i]->imm()->GetEarliestMemTableID() > cfds[i]->imm()->GetEarliestMemTableID() >
*flush_memtable_ids[i])) { *flush_memtable_ids[i])) {
++num_finished; // Make file ingestion's flush wait until SuperVersion is also updated
// since after flush, it does range overlapping check and file level
// assignment with the current SuperVersion.
if (!flush_reason.has_value() ||
flush_reason.value() != FlushReason::kExternalFileIngestion ||
cfds[i]->GetSuperVersion()->imm->GetID() ==
cfds[i]->imm()->current()->GetID()) {
++num_finished;
}
} }
} }
if (1 == num_dropped && 1 == num) { if (1 == num_dropped && 1 == num) {
@ -2950,6 +2966,7 @@ void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
cfd->Ref(); cfd->Ref();
compaction_queue_.push_back(cfd); compaction_queue_.push_back(cfd);
cfd->set_queued_for_compaction(true); cfd->set_queued_for_compaction(true);
++unscheduled_compactions_;
} }
ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() { ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
@ -3005,7 +3022,7 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue(
return cfd; return cfd;
} }
bool DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) { bool DBImpl::EnqueuePendingFlush(const FlushRequest& flush_req) {
mutex_.AssertHeld(); mutex_.AssertHeld();
bool enqueued = false; bool enqueued = false;
if (reject_new_background_jobs_) { if (reject_new_background_jobs_) {
@ -3041,16 +3058,15 @@ bool DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
return enqueued; return enqueued;
} }
void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { void DBImpl::EnqueuePendingCompaction(ColumnFamilyData* cfd) {
mutex_.AssertHeld(); mutex_.AssertHeld();
if (reject_new_background_jobs_) { if (reject_new_background_jobs_) {
return; return;
} }
if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) { if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
TEST_SYNC_POINT_CALLBACK("SchedulePendingCompaction::cfd", TEST_SYNC_POINT_CALLBACK("EnqueuePendingCompaction::cfd",
static_cast<void*>(cfd)); static_cast<void*>(cfd));
AddToCompactionQueue(cfd); AddToCompactionQueue(cfd);
++unscheduled_compactions_;
} }
} }
@ -3218,7 +3234,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
#ifndef NDEBUG #ifndef NDEBUG
flush_req.reschedule_count += 1; flush_req.reschedule_count += 1;
#endif /* !NDEBUG */ #endif /* !NDEBUG */
SchedulePendingFlush(flush_req); EnqueuePendingFlush(flush_req);
*reason = flush_reason; *reason = flush_reason;
*flush_rescheduled_to_retain_udt = true; *flush_rescheduled_to_retain_udt = true;
return Status::TryAgain(); return Status::TryAgain();
@ -3541,6 +3557,14 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
is_manual && manual_compaction->disallow_trivial_move; is_manual && manual_compaction->disallow_trivial_move;
CompactionJobStats compaction_job_stats; CompactionJobStats compaction_job_stats;
// Set is_remote_compaction to true on CompactionBegin Event if
// compaction_service is set except for trivial moves. We do not know whether
// remote compaction will actually be successfully scheduled, or fall back to
// local at this time. CompactionCompleted event will tell the truth where
// the compaction actually happened.
compaction_job_stats.is_remote_compaction =
immutable_db_options().compaction_service != nullptr;
Status status; Status status;
if (!error_handler_.IsBGWorkStopped()) { if (!error_handler_.IsBGWorkStopped()) {
if (shutting_down_.load(std::memory_order_acquire)) { if (shutting_down_.load(std::memory_order_acquire)) {
@ -3661,8 +3685,20 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
// compaction is not necessary. Need to make sure mutex is held // compaction is not necessary. Need to make sure mutex is held
// until we make a copy in the following code // until we make a copy in the following code
TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
SnapshotChecker* snapshot_checker = nullptr;
std::vector<SequenceNumber> snapshot_seqs;
// This info is not useful for other scenarios, so save querying existing
// snapshots for those cases.
if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
cfd->user_comparator()->timestamp_size() == 0) {
SequenceNumber earliest_write_conflict_snapshot;
GetSnapshotContext(job_context, &snapshot_seqs,
&earliest_write_conflict_snapshot,
&snapshot_checker);
assert(is_snapshot_supported_ || snapshots_.empty());
}
c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_, c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_,
log_buffer)); snapshot_seqs, snapshot_checker, log_buffer));
TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
if (c != nullptr) { if (c != nullptr) {
@ -3678,7 +3714,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
->ComputeCompactionScore(*(c->immutable_options()), ->ComputeCompactionScore(*(c->immutable_options()),
*(c->mutable_cf_options())); *(c->mutable_cf_options()));
AddToCompactionQueue(cfd); AddToCompactionQueue(cfd);
++unscheduled_compactions_;
c.reset(); c.reset();
// Don't need to sleep here, because BackgroundCallCompaction // Don't need to sleep here, because BackgroundCallCompaction
@ -3707,7 +3742,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
if (cfd->NeedsCompaction()) { if (cfd->NeedsCompaction()) {
// Yes, we need more compactions! // Yes, we need more compactions!
AddToCompactionQueue(cfd); AddToCompactionQueue(cfd);
++unscheduled_compactions_;
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
} }
} }
@ -3768,6 +3802,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
compaction_job_stats.num_input_files = c->num_input_files(0); compaction_job_stats.num_input_files = c->num_input_files(0);
// Trivial moves do not get compacted remotely
compaction_job_stats.is_remote_compaction = false;
NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
compaction_job_stats, job_context->job_id); compaction_job_stats, job_context->job_id);
@ -3903,6 +3939,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
&bg_bottom_compaction_scheduled_); &bg_bottom_compaction_scheduled_);
compaction_job.Prepare(); compaction_job.Prepare();
std::unique_ptr<std::list<uint64_t>::iterator> min_options_file_number_elem;
if (immutable_db_options().compaction_service != nullptr) {
min_options_file_number_elem.reset(
new std::list<uint64_t>::iterator(CaptureOptionsFileNumber()));
}
NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
compaction_job_stats, job_context->job_id); compaction_job_stats, job_context->job_id);
mutex_.Unlock(); mutex_.Unlock();
@ -3912,6 +3954,11 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
compaction_job.Run().PermitUncheckedError(); compaction_job.Run().PermitUncheckedError();
TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
mutex_.Lock(); mutex_.Lock();
if (immutable_db_options().compaction_service != nullptr) {
ReleaseOptionsFileNumber(min_options_file_number_elem);
}
status = status =
compaction_job.Install(*c->mutable_cf_options(), &compaction_released); compaction_job.Install(*c->mutable_cf_options(), &compaction_released);
io_s = compaction_job.io_status(); io_s = compaction_job.io_status();
@ -3939,7 +3986,10 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
// Sanity checking that compaction files are freed. // Sanity checking that compaction files are freed.
for (size_t i = 0; i < c->num_input_levels(); i++) { for (size_t i = 0; i < c->num_input_levels(); i++) {
for (size_t j = 0; j < c->inputs(i)->size(); j++) { for (size_t j = 0; j < c->inputs(i)->size(); j++) {
assert(!c->input(i, j)->being_compacted); // When status is not OK, compaction's result installation failed and
// no new Version installed. The files could have been released and
// picked up again by other compaction attempts.
assert(!c->input(i, j)->being_compacted || !status.ok());
} }
} }
std::unordered_set<Compaction*>* cip = c->column_family_data() std::unordered_set<Compaction*>* cip = c->column_family_data()
@ -3997,7 +4047,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
*(c->mutable_cf_options())); *(c->mutable_cf_options()));
if (!cfd->queued_for_compaction()) { if (!cfd->queued_for_compaction()) {
AddToCompactionQueue(cfd); AddToCompactionQueue(cfd);
++unscheduled_compactions_;
} }
} }
} }
@ -4259,17 +4308,23 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
// newer snapshot created and released frequently, the compaction will be // newer snapshot created and released frequently, the compaction will be
// triggered soon anyway. // triggered soon anyway.
bottommost_files_mark_threshold_ = kMaxSequenceNumber; bottommost_files_mark_threshold_ = kMaxSequenceNumber;
standalone_range_deletion_files_mark_threshold_ = kMaxSequenceNumber;
for (auto* my_cfd : *versions_->GetColumnFamilySet()) { for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
if (!my_cfd->ioptions()->allow_ingest_behind) { if (!my_cfd->ioptions()->allow_ingest_behind) {
bottommost_files_mark_threshold_ = std::min( bottommost_files_mark_threshold_ = std::min(
bottommost_files_mark_threshold_, bottommost_files_mark_threshold_,
my_cfd->current()->storage_info()->bottommost_files_mark_threshold()); my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
} }
standalone_range_deletion_files_mark_threshold_ =
std::min(standalone_range_deletion_files_mark_threshold_,
cfd->current()
->storage_info()
->standalone_range_tombstone_files_mark_threshold());
} }
// Whenever we install new SuperVersion, we might need to issue new flushes or // Whenever we install new SuperVersion, we might need to issue new flushes or
// compactions. // compactions.
SchedulePendingCompaction(cfd); EnqueuePendingCompaction(cfd);
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
// Update max_total_in_memory_state_ // Update max_total_in_memory_state_

View File

@ -9,6 +9,7 @@
#ifndef NDEBUG #ifndef NDEBUG
#include "db/blob/blob_file_cache.h"
#include "db/column_family.h" #include "db/column_family.h"
#include "db/db_impl/db_impl.h" #include "db/db_impl/db_impl.h"
#include "db/error_handler.h" #include "db/error_handler.h"
@ -199,6 +200,11 @@ Status DBImpl::TEST_GetBGError() {
return error_handler_.GetBGError(); return error_handler_.GetBGError();
} }
bool DBImpl::TEST_IsRecoveryInProgress() {
InstrumentedMutexLock l(&mutex_);
return error_handler_.IsRecoveryInProgress();
}
void DBImpl::TEST_LockMutex() { mutex_.Lock(); } void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); } void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
@ -227,23 +233,16 @@ uint64_t DBImpl::TEST_LogfileNumber() {
return logfile_number_; return logfile_number_;
} }
Status DBImpl::TEST_GetAllImmutableCFOptions( void DBImpl::TEST_GetAllBlockCaches(
std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) { std::unordered_set<const Cache*>* cache_set) {
std::vector<std::string> cf_names; InstrumentedMutexLock l(&mutex_);
std::vector<const ImmutableCFOptions*> iopts; for (auto cfd : *versions_->GetColumnFamilySet()) {
{ if (const auto bbto =
InstrumentedMutexLock l(&mutex_); cfd->GetCurrentMutableCFOptions()
for (auto cfd : *versions_->GetColumnFamilySet()) { ->table_factory->GetOptions<BlockBasedTableOptions>()) {
cf_names.push_back(cfd->GetName()); cache_set->insert(bbto->block_cache.get());
iopts.push_back(cfd->ioptions());
} }
} }
iopts_map->clear();
for (size_t i = 0; i < cf_names.size(); ++i) {
iopts_map->insert({cf_names[i], iopts[i]});
}
return Status::OK();
} }
uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() { uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
@ -259,7 +258,7 @@ size_t DBImpl::TEST_LogsWithPrepSize() {
} }
uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() { uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
autovector<MemTable*> empty_list; autovector<ReadOnlyMemTable*> empty_list;
return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list); return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
} }
@ -314,9 +313,75 @@ const autovector<uint64_t>& DBImpl::TEST_GetFilesToQuarantine() const {
return error_handler_.GetFilesToQuarantine(); return error_handler_.GetFilesToQuarantine();
} }
void DBImpl::TEST_DeleteObsoleteFiles() {
InstrumentedMutexLock l(&mutex_);
DeleteObsoleteFiles();
}
size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
InstrumentedMutexLock l(&const_cast<DBImpl*>(this)->stats_history_mutex_); InstrumentedMutexLock l(&const_cast<DBImpl*>(this)->stats_history_mutex_);
return EstimateInMemoryStatsHistorySize(); return EstimateInMemoryStatsHistorySize();
} }
void DBImpl::TEST_VerifyNoObsoleteFilesCached(
bool db_mutex_already_held) const {
// This check is somewhat expensive and obscure to make a part of every
// unit test in every build variety. Thus, we only enable it for ASAN builds.
if (!kMustFreeHeapAllocations) {
return;
}
std::optional<InstrumentedMutexLock> l;
if (db_mutex_already_held) {
mutex_.AssertHeld();
} else {
l.emplace(&mutex_);
}
if (!opened_successfully_) {
// We don't need to pro-actively clean up open files during DB::Open()
// if we know we are about to fail and clean up in Close().
return;
}
if (disable_delete_obsolete_files_ > 0) {
// For better or worse, DB::Close() is allowed with deletions disabled.
// Since we generally associate clean-up of open files with deleting them,
// we allow "obsolete" open files when deletions are disabled.
return;
}
// Live and "quarantined" files are allowed to be open in table cache
std::set<uint64_t> live_and_quar_files;
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
// Iterate over live versions
Version* current = cfd->current();
Version* ver = current;
do {
// Sneakily add both SST and blob files to the same list
std::vector<uint64_t> live_files_vec;
ver->AddLiveFiles(&live_files_vec, &live_files_vec);
live_and_quar_files.insert(live_files_vec.begin(), live_files_vec.end());
ver = ver->Next();
} while (ver != current);
}
{
const auto& quar_files = error_handler_.GetFilesToQuarantine();
live_and_quar_files.insert(quar_files.begin(), quar_files.end());
}
auto fn = [&live_and_quar_files](const Slice& key, Cache::ObjectPtr, size_t,
const Cache::CacheItemHelper*) {
// See TableCache and BlobFileCache
assert(key.size() == sizeof(uint64_t));
uint64_t file_number;
GetUnaligned(reinterpret_cast<const uint64_t*>(key.data()), &file_number);
// Assert file is in live/quarantined set
assert(live_and_quar_files.find(file_number) != live_and_quar_files.end());
};
table_cache_->ApplyToAllEntries(fn, {});
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
#endif // NDEBUG #endif // NDEBUG

View File

@ -47,7 +47,7 @@ Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
// compaction score // compaction score
vstorage->ComputeCompactionScore(*cfd->ioptions(), vstorage->ComputeCompactionScore(*cfd->ioptions(),
*cfd->GetLatestMutableCFOptions()); *cfd->GetLatestMutableCFOptions());
SchedulePendingCompaction(cfd); EnqueuePendingCompaction(cfd);
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
} }
return Status::OK(); return Status::OK();

View File

@ -43,6 +43,14 @@ uint64_t DBImpl::GetObsoleteSstFilesSize() {
return versions_->GetObsoleteSstFilesSize(); return versions_->GetObsoleteSstFilesSize();
} }
uint64_t DBImpl::MinOptionsFileNumberToKeep() {
mutex_.AssertHeld();
if (!min_options_file_numbers_.empty()) {
return *min_options_file_numbers_.begin();
}
return std::numeric_limits<uint64_t>::max();
}
Status DBImpl::DisableFileDeletions() { Status DBImpl::DisableFileDeletions() {
Status s; Status s;
int my_disable_delete_obsolete_files; int my_disable_delete_obsolete_files;
@ -147,6 +155,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
// here but later find newer generated unfinalized files while scanning. // here but later find newer generated unfinalized files while scanning.
job_context->min_pending_output = MinObsoleteSstNumberToKeep(); job_context->min_pending_output = MinObsoleteSstNumberToKeep();
job_context->files_to_quarantine = error_handler_.GetFilesToQuarantine(); job_context->files_to_quarantine = error_handler_.GetFilesToQuarantine();
job_context->min_options_file_number = MinOptionsFileNumberToKeep();
// Get obsolete files. This function will also update the list of // Get obsolete files. This function will also update the list of
// pending files in VersionSet(). // pending files in VersionSet().
@ -440,14 +449,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
// File is being deleted (actually obsolete) // File is being deleted (actually obsolete)
auto number = file.metadata->fd.GetNumber(); auto number = file.metadata->fd.GetNumber();
candidate_files.emplace_back(MakeTableFileName(number), file.path); candidate_files.emplace_back(MakeTableFileName(number), file.path);
if (handle == nullptr) { TableCache::ReleaseObsolete(table_cache_.get(), number, handle,
// For files not "pinned" in table cache file.uncache_aggressiveness);
handle = TableCache::Lookup(table_cache_.get(), number);
}
if (handle) {
TableCache::ReleaseObsolete(table_cache_.get(), handle,
file.uncache_aggressiveness);
}
} }
file.DeleteMetadata(); file.DeleteMetadata();
} }
@ -498,7 +501,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
dbname_); dbname_);
// File numbers of most recent two OPTIONS file in candidate_files (found in // File numbers of most recent two OPTIONS file in candidate_files (found in
// previos FindObsoleteFiles(full_scan=true)) // previous FindObsoleteFiles(full_scan=true))
// At this point, there must not be any duplicate file numbers in // At this point, there must not be any duplicate file numbers in
// candidate_files. // candidate_files.
uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min(); uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
@ -519,6 +522,11 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
} }
} }
// For remote compactions, we need to keep OPTIONS file that may get
// referenced by the remote worker
optsfile_num2 = std::min(optsfile_num2, state.min_options_file_number);
// Close WALs before trying to delete them. // Close WALs before trying to delete them.
for (const auto w : state.logs_to_free) { for (const auto w : state.logs_to_free) {
// TODO: maybe check the return value of Close. // TODO: maybe check the return value of Close.
@ -558,9 +566,17 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
case kTableFile: case kTableFile:
// If the second condition is not there, this makes // If the second condition is not there, this makes
// DontDeletePendingOutputs fail // DontDeletePendingOutputs fail
// FIXME: but should NOT keep if it came from sst_delete_files?
keep = (sst_live_set.find(number) != sst_live_set.end()) || keep = (sst_live_set.find(number) != sst_live_set.end()) ||
number >= state.min_pending_output; number >= state.min_pending_output;
if (!keep) { if (!keep) {
// NOTE: sometimes redundant (if came from sst_delete_files)
// We don't know which column family is applicable here so we don't
// know what uncache_aggressiveness would be used with
// ReleaseObsolete(). Anyway, obsolete files ideally go into
// sst_delete_files for better/quicker handling, and this is just a
// backstop.
TableCache::Evict(table_cache_.get(), number);
files_to_del.insert(number); files_to_del.insert(number);
} }
break; break;
@ -722,13 +738,46 @@ void DBImpl::DeleteObsoleteFiles() {
mutex_.Lock(); mutex_.Lock();
} }
VersionEdit GetDBRecoveryEditForObsoletingMemTables(
VersionSet* vset, const ColumnFamilyData& cfd,
const autovector<VersionEdit*>& edit_list,
const autovector<ReadOnlyMemTable*>& memtables,
LogsWithPrepTracker* prep_tracker) {
VersionEdit wal_deletion_edit;
uint64_t min_wal_number_to_keep = 0;
assert(edit_list.size() > 0);
if (vset->db_options()->allow_2pc) {
// Note that if mempurge is successful, the edit_list will
// not be applicable (contains info of new min_log number to keep,
// and level 0 file path of SST file created during normal flush,
// so both pieces of information are irrelevant after a successful
// mempurge operation).
min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
vset, cfd, edit_list, memtables, prep_tracker);
// We piggyback the information of earliest log file to keep in the
// manifest entry for the last file flushed.
} else {
min_wal_number_to_keep =
PrecomputeMinLogNumberToKeepNon2PC(vset, cfd, edit_list);
}
wal_deletion_edit.SetMinLogNumberToKeep(min_wal_number_to_keep);
if (vset->db_options()->track_and_verify_wals_in_manifest) {
if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
wal_deletion_edit.DeleteWalsBefore(min_wal_number_to_keep);
}
}
return wal_deletion_edit;
}
uint64_t FindMinPrepLogReferencedByMemTable( uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) { VersionSet* vset, const autovector<ReadOnlyMemTable*>& memtables_to_flush) {
uint64_t min_log = 0; uint64_t min_log = 0;
// we must look through the memtables for two phase transactions // we must look through the memtables for two phase transactions
// that have been committed but not yet flushed // that have been committed but not yet flushed
std::unordered_set<MemTable*> memtables_to_flush_set( std::unordered_set<ReadOnlyMemTable*> memtables_to_flush_set(
memtables_to_flush.begin(), memtables_to_flush.end()); memtables_to_flush.begin(), memtables_to_flush.end());
for (auto loop_cfd : *vset->GetColumnFamilySet()) { for (auto loop_cfd : *vset->GetColumnFamilySet()) {
if (loop_cfd->IsDropped()) { if (loop_cfd->IsDropped()) {
@ -753,12 +802,12 @@ uint64_t FindMinPrepLogReferencedByMemTable(
} }
uint64_t FindMinPrepLogReferencedByMemTable( uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset, VersionSet* vset, const autovector<const autovector<ReadOnlyMemTable*>*>&
const autovector<const autovector<MemTable*>*>& memtables_to_flush) { memtables_to_flush) {
uint64_t min_log = 0; uint64_t min_log = 0;
std::unordered_set<MemTable*> memtables_to_flush_set; std::unordered_set<ReadOnlyMemTable*> memtables_to_flush_set;
for (const autovector<MemTable*>* memtables : memtables_to_flush) { for (const autovector<ReadOnlyMemTable*>* memtables : memtables_to_flush) {
memtables_to_flush_set.insert(memtables->begin(), memtables->end()); memtables_to_flush_set.insert(memtables->begin(), memtables->end());
} }
for (auto loop_cfd : *vset->GetColumnFamilySet()) { for (auto loop_cfd : *vset->GetColumnFamilySet()) {
@ -850,7 +899,7 @@ uint64_t PrecomputeMinLogNumberToKeepNon2PC(
uint64_t PrecomputeMinLogNumberToKeep2PC( uint64_t PrecomputeMinLogNumberToKeep2PC(
VersionSet* vset, const ColumnFamilyData& cfd_to_flush, VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
const autovector<VersionEdit*>& edit_list, const autovector<VersionEdit*>& edit_list,
const autovector<MemTable*>& memtables_to_flush, const autovector<ReadOnlyMemTable*>& memtables_to_flush,
LogsWithPrepTracker* prep_tracker) { LogsWithPrepTracker* prep_tracker) {
assert(vset != nullptr); assert(vset != nullptr);
assert(prep_tracker != nullptr); assert(prep_tracker != nullptr);
@ -891,7 +940,7 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
uint64_t PrecomputeMinLogNumberToKeep2PC( uint64_t PrecomputeMinLogNumberToKeep2PC(
VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush, VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
const autovector<autovector<VersionEdit*>>& edit_lists, const autovector<autovector<VersionEdit*>>& edit_lists,
const autovector<const autovector<MemTable*>*>& memtables_to_flush, const autovector<const autovector<ReadOnlyMemTable*>*>& memtables_to_flush,
LogsWithPrepTracker* prep_tracker) { LogsWithPrepTracker* prep_tracker) {
assert(vset != nullptr); assert(vset != nullptr);
assert(prep_tracker != nullptr); assert(prep_tracker != nullptr);
@ -921,57 +970,65 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
} }
void DBImpl::SetDBId(std::string&& id, bool read_only, void DBImpl::SetDBId(std::string&& id, bool read_only,
RecoveryContext* recovery_ctx) { VersionEdit* version_edit) {
assert(db_id_.empty()); assert(db_id_.empty());
assert(!id.empty()); assert(!id.empty());
db_id_ = std::move(id); db_id_ = std::move(id);
if (!read_only && immutable_db_options_.write_dbid_to_manifest) { if (!read_only && version_edit) {
assert(recovery_ctx != nullptr); assert(version_edit != nullptr);
assert(versions_->GetColumnFamilySet() != nullptr); assert(versions_->GetColumnFamilySet() != nullptr);
VersionEdit edit; version_edit->SetDBId(db_id_);
edit.SetDBId(db_id_);
versions_->db_id_ = db_id_; versions_->db_id_ = db_id_;
recovery_ctx->UpdateVersionEdits(
versions_->GetColumnFamilySet()->GetDefault(), edit);
} }
} }
Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only, Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only,
RecoveryContext* recovery_ctx) { bool is_new_db, bool is_retry,
VersionEdit* version_edit) {
Status s; Status s;
// Check for the IDENTITY file and create it if not there or if (!is_new_db) {
// broken or not matching manifest // Check for the IDENTITY file and create it if not there or
std::string db_id_in_file; // broken or not matching manifest
s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); std::string db_id_in_file;
if (s.ok()) { s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
s = GetDbIdentityFromIdentityFile(&db_id_in_file); if (s.ok()) {
if (s.ok() && !db_id_in_file.empty()) { IOOptions opts;
if (db_id_.empty()) { if (is_retry) {
// Loaded from file and wasn't already known from manifest opts.verify_and_reconstruct_read = true;
SetDBId(std::move(db_id_in_file), read_only, recovery_ctx); }
return s; s = GetDbIdentityFromIdentityFile(opts, &db_id_in_file);
} else if (db_id_ == db_id_in_file) { if (s.ok() && !db_id_in_file.empty()) {
// Loaded from file and matches manifest if (db_id_.empty()) {
return s; // Loaded from file and wasn't already known from manifest
SetDBId(std::move(db_id_in_file), read_only, version_edit);
return s;
} else if (db_id_ == db_id_in_file) {
// Loaded from file and matches manifest
return s;
}
} }
} }
} if (s.IsNotFound()) {
if (s.IsNotFound()) { s = Status::OK();
s = Status::OK(); }
} if (!s.ok()) {
if (!s.ok()) { assert(s.IsIOError());
assert(s.IsIOError()); return s;
return s; }
} }
// Otherwise IDENTITY file is missing or no good. // Otherwise IDENTITY file is missing or no good.
// Generate new id if needed // Generate new id if needed
if (db_id_.empty()) { if (db_id_.empty()) {
SetDBId(env_->GenerateUniqueId(), read_only, recovery_ctx); SetDBId(env_->GenerateUniqueId(), read_only, version_edit);
} }
// Persist it to IDENTITY file if allowed // Persist it to IDENTITY file if allowed
if (!read_only) { if (!read_only && immutable_db_options_.write_identity_file) {
s = SetIdentityFile(write_options, env_, dbname_, db_id_); s = SetIdentityFile(write_options, env_, dbname_,
immutable_db_options_.metadata_write_temperature,
db_id_);
} }
// NOTE: an obsolete IDENTITY file with write_identity_file=false is handled
// elsewhere, so that it's only deleted after successful recovery
return s; return s;
} }

View File

@ -289,27 +289,25 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) {
"start_time and end_time cannot be the same"); "start_time and end_time cannot be the same");
} }
} }
if (!db_options.write_dbid_to_manifest && !db_options.write_identity_file) {
return Status::InvalidArgument(
"write_dbid_to_manifest and write_identity_file cannot both be false");
}
return Status::OK(); return Status::OK();
} }
Status DBImpl::NewDB(std::vector<std::string>* new_filenames) { Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
VersionEdit new_db; VersionEdit new_db_edit;
const WriteOptions write_options(Env::IOActivity::kDBOpen); const WriteOptions write_options(Env::IOActivity::kDBOpen);
Status s = SetIdentityFile(write_options, env_, dbname_); Status s = SetupDBId(write_options, /*read_only=*/false, /*is_new_db=*/true,
/*is_retry=*/false, &new_db_edit);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
if (immutable_db_options_.write_dbid_to_manifest) { new_db_edit.SetLogNumber(0);
std::string temp_db_id; new_db_edit.SetNextFile(2);
s = GetDbIdentityFromIdentityFile(&temp_db_id); new_db_edit.SetLastSequence(0);
if (!s.ok()) {
return s;
}
new_db.SetDBId(temp_db_id);
}
new_db.SetLogNumber(0);
new_db.SetNextFile(2);
new_db.SetLastSequence(0);
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
const std::string manifest = DescriptorFileName(dbname_, 1); const std::string manifest = DescriptorFileName(dbname_, 1);
@ -319,6 +317,12 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
} }
std::unique_ptr<FSWritableFile> file; std::unique_ptr<FSWritableFile> file;
FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
// DB option takes precedence when not kUnknown
if (immutable_db_options_.metadata_write_temperature !=
Temperature::kUnknown) {
file_options.temperature =
immutable_db_options_.metadata_write_temperature;
}
s = NewWritableFile(fs_.get(), manifest, &file, file_options); s = NewWritableFile(fs_.get(), manifest, &file, file_options);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
@ -335,7 +339,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
tmp_set.Contains(FileType::kDescriptorFile))); tmp_set.Contains(FileType::kDescriptorFile)));
log::Writer log(std::move(file_writer), 0, false); log::Writer log(std::move(file_writer), 0, false);
std::string record; std::string record;
new_db.EncodeTo(&record); new_db_edit.EncodeTo(&record);
s = log.AddRecord(write_options, record); s = log.AddRecord(write_options, record);
if (s.ok()) { if (s.ok()) {
s = SyncManifest(&immutable_db_options_, write_options, log.file()); s = SyncManifest(&immutable_db_options_, write_options, log.file());
@ -344,6 +348,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
if (s.ok()) { if (s.ok()) {
// Make "CURRENT" file that points to the new manifest file. // Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(write_options, fs_.get(), dbname_, 1, s = SetCurrentFile(write_options, fs_.get(), dbname_, 1,
immutable_db_options_.metadata_write_temperature,
directories_.GetDbDir()); directories_.GetDbDir());
if (new_filenames) { if (new_filenames) {
new_filenames->emplace_back( new_filenames->emplace_back(
@ -520,7 +525,7 @@ Status DBImpl::Recover(
} }
assert(s.ok()); assert(s.ok());
} }
assert(db_id_.empty()); assert(is_new_db || db_id_.empty());
Status s; Status s;
bool missing_table_file = false; bool missing_table_file = false;
if (!immutable_db_options_.best_efforts_recovery) { if (!immutable_db_options_.best_efforts_recovery) {
@ -530,6 +535,12 @@ Status DBImpl::Recover(
/*no_error_if_files_missing=*/false, is_retry, /*no_error_if_files_missing=*/false, is_retry,
&desc_status); &desc_status);
desc_status.PermitUncheckedError(); desc_status.PermitUncheckedError();
if (is_retry) {
RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_COUNT);
if (desc_status.ok()) {
RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
}
}
if (can_retry) { if (can_retry) {
// If we're opening for the first time and the failure is likely due to // If we're opening for the first time and the failure is likely due to
// a corrupt MANIFEST file (could result in either the log::Reader // a corrupt MANIFEST file (could result in either the log::Reader
@ -564,6 +575,7 @@ Status DBImpl::Recover(
} }
if (s.ok() && !read_only) { if (s.ok() && !read_only) {
for (auto cfd : *versions_->GetColumnFamilySet()) { for (auto cfd : *versions_->GetColumnFamilySet()) {
auto& moptions = *cfd->GetLatestMutableCFOptions();
// Try to trivially move files down the LSM tree to start from bottommost // Try to trivially move files down the LSM tree to start from bottommost
// level when level_compaction_dynamic_level_bytes is enabled. This should // level when level_compaction_dynamic_level_bytes is enabled. This should
// only be useful when user is migrating to turning on this option. // only be useful when user is migrating to turning on this option.
@ -581,14 +593,14 @@ Status DBImpl::Recover(
if (cfd->ioptions()->compaction_style == if (cfd->ioptions()->compaction_style ==
CompactionStyle::kCompactionStyleLevel && CompactionStyle::kCompactionStyleLevel &&
cfd->ioptions()->level_compaction_dynamic_level_bytes && cfd->ioptions()->level_compaction_dynamic_level_bytes &&
!cfd->GetLatestMutableCFOptions()->disable_auto_compactions) { !moptions.disable_auto_compactions) {
int to_level = cfd->ioptions()->num_levels - 1; int to_level = cfd->ioptions()->num_levels - 1;
// last level is reserved // last level is reserved
// allow_ingest_behind does not support Level Compaction, // allow_ingest_behind does not support Level Compaction,
// and per_key_placement can have infinite compaction loop for Level // and per_key_placement can have infinite compaction loop for Level
// Compaction. Adjust to_level here just to be safe. // Compaction. Adjust to_level here just to be safe.
if (cfd->ioptions()->allow_ingest_behind || if (cfd->ioptions()->allow_ingest_behind ||
cfd->ioptions()->preclude_last_level_data_seconds > 0) { moptions.preclude_last_level_data_seconds > 0) {
to_level -= 1; to_level -= 1;
} }
// Whether this column family has a level trivially moved // Whether this column family has a level trivially moved
@ -660,7 +672,17 @@ Status DBImpl::Recover(
} }
} }
} }
s = SetupDBId(write_options, read_only, recovery_ctx); if (is_new_db) {
// Already set up DB ID in NewDB
} else if (immutable_db_options_.write_dbid_to_manifest && recovery_ctx) {
VersionEdit edit;
s = SetupDBId(write_options, read_only, is_new_db, is_retry, &edit);
recovery_ctx->UpdateVersionEdits(
versions_->GetColumnFamilySet()->GetDefault(), edit);
} else {
s = SetupDBId(write_options, read_only, is_new_db, is_retry, nullptr);
}
assert(!s.ok() || !db_id_.empty());
ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str()); ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str());
if (s.ok() && !read_only) { if (s.ok() && !read_only) {
s = MaybeUpdateNextFileNumber(recovery_ctx); s = MaybeUpdateNextFileNumber(recovery_ctx);
@ -1253,7 +1275,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
reader.GetRecordedTimestampSize(); reader.GetRecordedTimestampSize();
status = HandleWriteBatchTimestampSizeDifference( status = HandleWriteBatchTimestampSizeDifference(
&batch, running_ts_sz, record_ts_sz, &batch, running_ts_sz, record_ts_sz,
TimestampSizeConsistencyMode::kReconcileInconsistency, &new_batch); TimestampSizeConsistencyMode::kReconcileInconsistency, seq_per_batch_,
batch_per_txn_, &new_batch);
if (!status.ok()) { if (!status.ok()) {
return status; return status;
} }
@ -1646,9 +1669,19 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
Arena arena; Arena arena;
Status s; Status s;
TableProperties table_properties; TableProperties table_properties;
const auto* ucmp = cfd->internal_comparator().user_comparator();
assert(ucmp);
const size_t ts_sz = ucmp->timestamp_size();
const bool logical_strip_timestamp =
ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps;
{ {
ScopedArenaPtr<InternalIterator> iter( ScopedArenaPtr<InternalIterator> iter(
mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); logical_strip_timestamp
? mem->NewTimestampStrippingIterator(
ro, /*seqno_to_time_mapping=*/nullptr, &arena,
/*prefix_extractor=*/nullptr, ts_sz)
: mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
/*prefix_extractor=*/nullptr));
ROCKS_LOG_DEBUG(immutable_db_options_.info_log, ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] [WriteLevel0TableForRecovery]" "[%s] [WriteLevel0TableForRecovery]"
" Level-0 table #%" PRIu64 ": started", " Level-0 table #%" PRIu64 ": started",
@ -1667,7 +1700,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
meta.oldest_ancester_time = current_time; meta.oldest_ancester_time = current_time;
meta.epoch_number = cfd->NewEpochNumber(); meta.epoch_number = cfd->NewEpochNumber();
{ {
auto write_hint = cfd->CalculateSSTWriteHint(0); auto write_hint =
cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0);
mutex_.Unlock(); mutex_.Unlock();
SequenceNumber earliest_write_conflict_snapshot; SequenceNumber earliest_write_conflict_snapshot;
@ -1682,11 +1716,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>> std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters; range_del_iters;
auto range_del_iter = auto range_del_iter =
// This is called during recovery, where a live memtable is flushed logical_strip_timestamp
// directly. In this case, no fragmented tombstone list is cached in ? mem->NewTimestampStrippingRangeTombstoneIterator(
// this memtable yet. ro, kMaxSequenceNumber, ts_sz)
mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber, // This is called during recovery, where a live memtable is
false /* immutable_memtable */); // flushed directly. In this case, no fragmented tombstone list is
// cached in this memtable yet.
: mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
false /* immutable_memtable */);
if (range_del_iter != nullptr) { if (range_del_iter != nullptr) {
range_del_iters.emplace_back(range_del_iter); range_del_iters.emplace_back(range_del_iter);
} }
@ -1700,10 +1737,11 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
cfd->internal_comparator(), cfd->internal_tbl_prop_coll_factories(), cfd->internal_comparator(), cfd->internal_tbl_prop_coll_factories(),
GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(), mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(),
0 /* level */, false /* is_bottommost */, 0 /* level */, current_time /* newest_key_time */,
TableFileCreationReason::kRecovery, 0 /* oldest_key_time */, false /* is_bottommost */, TableFileCreationReason::kRecovery,
0 /* file_creation_time */, db_id_, db_session_id_, 0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_,
0 /* target_file_size */, meta.fd.GetNumber(), kMaxSequenceNumber); db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber(),
kMaxSequenceNumber);
Version* version = cfd->current(); Version* version = cfd->current();
version->Ref(); version->Ref();
uint64_t num_input_entries = 0; uint64_t num_input_entries = 0;
@ -1733,7 +1771,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
s = io_s; s = io_s;
} }
uint64_t total_num_entries = mem->num_entries(); uint64_t total_num_entries = mem->NumEntries();
if (s.ok() && total_num_entries != num_input_entries) { if (s.ok() && total_num_entries != num_input_entries) {
std::string msg = "Expected " + std::to_string(total_num_entries) + std::string msg = "Expected " + std::to_string(total_num_entries) +
" entries in memtable, but read " + " entries in memtable, but read " +
@ -1772,9 +1810,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
// For UDT in memtable only feature, move up the cutoff timestamp whenever // For UDT in memtable only feature, move up the cutoff timestamp whenever
// a flush happens. // a flush happens.
const Comparator* ucmp = cfd->user_comparator(); if (logical_strip_timestamp) {
size_t ts_sz = ucmp->timestamp_size();
if (ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps) {
Slice mem_newest_udt = mem->GetNewestUDT(); Slice mem_newest_udt = mem->GetNewestUDT();
std::string full_history_ts_low = cfd->GetFullHistoryTsLow(); std::string full_history_ts_low = cfd->GetFullHistoryTsLow();
if (full_history_ts_low.empty() || if (full_history_ts_low.empty() ||
@ -1930,6 +1966,10 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
BuildDBOptions(immutable_db_options_, mutable_db_options_); BuildDBOptions(immutable_db_options_, mutable_db_options_);
FileOptions opt_file_options = FileOptions opt_file_options =
fs_->OptimizeForLogWrite(file_options_, db_options); fs_->OptimizeForLogWrite(file_options_, db_options);
// DB option takes precedence when not kUnknown
if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) {
opt_file_options.temperature = immutable_db_options_.wal_write_temperature;
}
std::string wal_dir = immutable_db_options_.GetWalDir(); std::string wal_dir = immutable_db_options_.GetWalDir();
std::string log_fname = LogFileName(wal_dir, log_file_num); std::string log_fname = LogFileName(wal_dir, log_file_num);
@ -1969,46 +2009,7 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
void DBImpl::TrackExistingDataFiles( void DBImpl::TrackExistingDataFiles(
const std::vector<std::string>& existing_data_files) { const std::vector<std::string>& existing_data_files) {
auto sfm = static_cast<SstFileManagerImpl*>( TrackOrUntrackFiles(existing_data_files, /*track=*/true);
immutable_db_options_.sst_file_manager.get());
assert(sfm);
std::vector<ColumnFamilyMetaData> metadata;
GetAllColumnFamilyMetaData(&metadata);
std::unordered_set<std::string> referenced_files;
for (const auto& md : metadata) {
for (const auto& lmd : md.levels) {
for (const auto& fmd : lmd.files) {
// We're assuming that each sst file name exists in at most one of
// the paths.
std::string file_path =
fmd.directory + kFilePathSeparator + fmd.relative_filename;
sfm->OnAddFile(file_path, fmd.size).PermitUncheckedError();
referenced_files.insert(file_path);
}
}
for (const auto& bmd : md.blob_files) {
std::string name = bmd.blob_file_name;
// The BlobMetaData.blob_file_name may start with "/".
if (!name.empty() && name[0] == kFilePathSeparator) {
name = name.substr(1);
}
// We're assuming that each blob file name exists in at most one of
// the paths.
std::string file_path = bmd.blob_file_path + kFilePathSeparator + name;
sfm->OnAddFile(file_path, bmd.blob_file_size).PermitUncheckedError();
referenced_files.insert(file_path);
}
}
for (const auto& file_path : existing_data_files) {
if (referenced_files.find(file_path) != referenced_files.end()) {
continue;
}
// There shouldn't be any duplicated files. In case there is, SstFileManager
// will take care of deduping it.
sfm->OnAddFile(file_path).PermitUncheckedError();
}
} }
Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
@ -2152,6 +2153,13 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
s = impl->LogAndApplyForRecovery(recovery_ctx); s = impl->LogAndApplyForRecovery(recovery_ctx);
} }
if (s.ok() && !impl->immutable_db_options_.write_identity_file) {
// On successful recovery, delete an obsolete IDENTITY file to avoid DB ID
// inconsistency
impl->env_->DeleteFile(IdentityFileName(impl->dbname_))
.PermitUncheckedError();
}
if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
impl->mutex_.AssertHeld(); impl->mutex_.AssertHeld();
s = impl->InitPersistStatsColumnFamily(); s = impl->InitPersistStatsColumnFamily();

View File

@ -265,7 +265,8 @@ Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem(); const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
std::string manifest_path; std::string manifest_path;
uint64_t manifest_file_number; uint64_t manifest_file_number;
s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path, s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), /*is_retry=*/false,
&manifest_path,
&manifest_file_number); &manifest_file_number);
} else { } else {
// Historic behavior that doesn't necessarily make sense // Historic behavior that doesn't necessarily make sense

View File

@ -12,7 +12,8 @@
#include "logging/auto_roll_logger.h" #include "logging/auto_roll_logger.h"
#include "logging/logging.h" #include "logging/logging.h"
#include "monitoring/perf_context_imp.h" #include "monitoring/perf_context_imp.h"
#include "rocksdb/configurable.h" #include "rocksdb/convenience.h"
#include "rocksdb/utilities/options_util.h"
#include "util/cast_util.h" #include "util/cast_util.h"
#include "util/write_batch_util.h" #include "util/write_batch_util.h"
@ -232,7 +233,8 @@ Status DBImplSecondary::RecoverLogFiles(
reader->GetRecordedTimestampSize(); reader->GetRecordedTimestampSize();
status = HandleWriteBatchTimestampSizeDifference( status = HandleWriteBatchTimestampSizeDifference(
&batch, running_ts_sz, record_ts_sz, &batch, running_ts_sz, record_ts_sz,
TimestampSizeConsistencyMode::kVerifyConsistency); TimestampSizeConsistencyMode::kVerifyConsistency, seq_per_batch_,
batch_per_txn_);
if (!status.ok()) { if (!status.ok()) {
break; break;
} }
@ -246,9 +248,7 @@ Status DBImplSecondary::RecoverLogFiles(
if (cfd == nullptr) { if (cfd == nullptr) {
continue; continue;
} }
if (cfds_changed->count(cfd) == 0) { cfds_changed->insert(cfd);
cfds_changed->insert(cfd);
}
const std::vector<FileMetaData*>& l0_files = const std::vector<FileMetaData*>& l0_files =
cfd->current()->storage_info()->LevelFiles(0); cfd->current()->storage_info()->LevelFiles(0);
SequenceNumber seq = SequenceNumber seq =
@ -938,69 +938,101 @@ Status DB::OpenAndCompact(
const std::string& output_directory, const std::string& input, const std::string& output_directory, const std::string& input,
std::string* output, std::string* output,
const CompactionServiceOptionsOverride& override_options) { const CompactionServiceOptionsOverride& override_options) {
// Check for cancellation
if (options.canceled && options.canceled->load(std::memory_order_acquire)) { if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
return Status::Incomplete(Status::SubCode::kManualCompactionPaused); return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
} }
// 1. Deserialize Compaction Input
CompactionServiceInput compaction_input; CompactionServiceInput compaction_input;
Status s = CompactionServiceInput::Read(input, &compaction_input); Status s = CompactionServiceInput::Read(input, &compaction_input);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
compaction_input.db_options.max_open_files = -1; // 2. Load the options
compaction_input.db_options.compaction_service = nullptr; DBOptions db_options;
if (compaction_input.db_options.statistics) { ConfigOptions config_options;
compaction_input.db_options.statistics.reset(); config_options.env = override_options.env;
} std::vector<ColumnFamilyDescriptor> all_column_families;
compaction_input.db_options.env = override_options.env;
compaction_input.db_options.file_checksum_gen_factory =
override_options.file_checksum_gen_factory;
compaction_input.db_options.statistics = override_options.statistics;
compaction_input.column_family.options.comparator =
override_options.comparator;
compaction_input.column_family.options.merge_operator =
override_options.merge_operator;
compaction_input.column_family.options.compaction_filter =
override_options.compaction_filter;
compaction_input.column_family.options.compaction_filter_factory =
override_options.compaction_filter_factory;
compaction_input.column_family.options.prefix_extractor =
override_options.prefix_extractor;
compaction_input.column_family.options.table_factory =
override_options.table_factory;
compaction_input.column_family.options.sst_partitioner_factory =
override_options.sst_partitioner_factory;
compaction_input.column_family.options.table_properties_collector_factories =
override_options.table_properties_collector_factories;
compaction_input.db_options.listeners = override_options.listeners;
std::vector<ColumnFamilyDescriptor> column_families; std::string options_file_name =
column_families.push_back(compaction_input.column_family); OptionsFileName(name, compaction_input.options_file_number);
// TODO: we have to open default CF, because of an implementation limitation,
// currently we just use the same CF option from input, which is not collect
// and open may fail.
if (compaction_input.column_family.name != kDefaultColumnFamilyName) {
column_families.emplace_back(kDefaultColumnFamilyName,
compaction_input.column_family.options);
}
DB* db; s = LoadOptionsFromFile(config_options, options_file_name, &db_options,
std::vector<ColumnFamilyHandle*> handles; &all_column_families);
s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory,
column_families, &handles, &db);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
// 3. Override pointer configurations in DBOptions with
// CompactionServiceOptionsOverride
db_options.env = override_options.env;
db_options.file_checksum_gen_factory =
override_options.file_checksum_gen_factory;
db_options.statistics = override_options.statistics;
db_options.listeners = override_options.listeners;
db_options.compaction_service = nullptr;
// We will close the DB after the compaction anyway.
// Open as many files as needed for the compaction.
db_options.max_open_files = -1;
// 4. Filter CFs that are needed for OpenAndCompact()
// We do not need to open all column families for the remote compaction.
// Only open default CF + target CF. If target CF == default CF, we will open
// just the default CF (Due to current limitation, DB cannot open without the
// default CF)
std::vector<ColumnFamilyDescriptor> column_families;
for (auto& cf : all_column_families) {
if (cf.name == compaction_input.cf_name) {
cf.options.comparator = override_options.comparator;
cf.options.merge_operator = override_options.merge_operator;
cf.options.compaction_filter = override_options.compaction_filter;
cf.options.compaction_filter_factory =
override_options.compaction_filter_factory;
cf.options.prefix_extractor = override_options.prefix_extractor;
cf.options.table_factory = override_options.table_factory;
cf.options.sst_partitioner_factory =
override_options.sst_partitioner_factory;
cf.options.table_properties_collector_factories =
override_options.table_properties_collector_factories;
column_families.emplace_back(cf);
} else if (cf.name == kDefaultColumnFamilyName) {
column_families.emplace_back(cf);
}
}
// 5. Open db As Secondary
DB* db;
std::vector<ColumnFamilyHandle*> handles;
s = DB::OpenAsSecondary(db_options, name, output_directory, column_families,
&handles, &db);
if (!s.ok()) {
return s;
}
assert(db);
// 6. Find the handle of the Column Family that this will compact
ColumnFamilyHandle* cfh = nullptr;
for (auto* handle : handles) {
if (compaction_input.cf_name == handle->GetName()) {
cfh = handle;
break;
}
}
assert(cfh);
// 7. Run the compaction without installation.
// Output will be stored in the directory specified by output_directory
CompactionServiceResult compaction_result; CompactionServiceResult compaction_result;
DBImplSecondary* db_secondary = static_cast_with_check<DBImplSecondary>(db); DBImplSecondary* db_secondary = static_cast_with_check<DBImplSecondary>(db);
assert(handles.size() > 0); s = db_secondary->CompactWithoutInstallation(options, cfh, compaction_input,
s = db_secondary->CompactWithoutInstallation( &compaction_result);
options, handles[0], compaction_input, &compaction_result);
// 8. Serialize the result
Status serialization_status = compaction_result.Write(output); Status serialization_status = compaction_result.Write(output);
// 9. Close the db and return
for (auto& handle : handles) { for (auto& handle : handles) {
delete handle; delete handle;
} }

View File

@ -656,7 +656,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
if (!io_s.ok()) { if (!io_s.ok()) {
// Check WriteToWAL status // Check WriteToWAL status
IOStatusCheck(io_s); WALIOStatusCheck(io_s);
} }
if (!w.CallbackFailed()) { if (!w.CallbackFailed()) {
if (!io_s.ok()) { if (!io_s.ok()) {
@ -687,7 +687,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
} }
} }
// Note: if we are to resume after non-OK statuses we need to revisit how // Note: if we are to resume after non-OK statuses we need to revisit how
// we reacts to non-OK statuses here. // we react to non-OK statuses here.
versions_->SetLastSequence(last_sequence); versions_->SetLastSequence(last_sequence);
} }
MemTableInsertStatusCheck(w.status); MemTableInsertStatusCheck(w.status);
@ -735,17 +735,6 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
size_t total_byte_size = 0; size_t total_byte_size = 0;
if (w.status.ok()) { if (w.status.ok()) {
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : wal_write_group) {
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
SequenceNumber next_sequence = current_sequence; SequenceNumber next_sequence = current_sequence;
for (auto* writer : wal_write_group) { for (auto* writer : wal_write_group) {
assert(writer); assert(writer);
@ -760,6 +749,22 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
} }
} }
} }
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : wal_write_group) {
if (writer->CallbackFailed()) {
// When optimisitc txn conflict checking fails, we should
// not record to trace.
continue;
}
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
if (w.disable_wal) { if (w.disable_wal) {
has_unpersisted_data_.store(true, std::memory_order_relaxed); has_unpersisted_data_.store(true, std::memory_order_relaxed);
} }
@ -799,7 +804,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
if (!io_s.ok()) { if (!io_s.ok()) {
// Check WriteToWAL status // Check WriteToWAL status
IOStatusCheck(io_s); WALIOStatusCheck(io_s);
} else if (!w.CallbackFailed()) { } else if (!w.CallbackFailed()) {
WriteStatusCheck(w.status); WriteStatusCheck(w.status);
} }
@ -969,21 +974,17 @@ Status DBImpl::WriteImplWALOnly(
assert(w.state == WriteThread::STATE_GROUP_LEADER); assert(w.state == WriteThread::STATE_GROUP_LEADER);
if (publish_last_seq == kDoPublishLastSeq) { if (publish_last_seq == kDoPublishLastSeq) {
Status status;
// Currently we only use kDoPublishLastSeq in unordered_write // Currently we only use kDoPublishLastSeq in unordered_write
assert(immutable_db_options_.unordered_write); assert(immutable_db_options_.unordered_write);
WriteContext write_context;
if (error_handler_.IsDBStopped()) {
status = error_handler_.GetBGError();
}
// TODO(myabandeh): Make preliminary checks thread-safe so we could do them // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
// without paying the cost of obtaining the mutex. // without paying the cost of obtaining the mutex.
if (status.ok()) { LogContext log_context;
LogContext log_context; WriteContext write_context;
status = PreprocessWrite(write_options, &log_context, &write_context); Status status =
WriteStatusCheckOnLocked(status); PreprocessWrite(write_options, &log_context, &write_context);
} WriteStatusCheckOnLocked(status);
if (!status.ok()) { if (!status.ok()) {
WriteThread::WriteGroup write_group; WriteThread::WriteGroup write_group;
write_thread->EnterAsBatchGroupLeader(&w, &write_group); write_thread->EnterAsBatchGroupLeader(&w, &write_group);
@ -1009,19 +1010,6 @@ Status DBImpl::WriteImplWALOnly(
WriteThread::WriteGroup write_group; WriteThread::WriteGroup write_group;
uint64_t last_sequence; uint64_t last_sequence;
write_thread->EnterAsBatchGroupLeader(&w, &write_group); write_thread->EnterAsBatchGroupLeader(&w, &write_group);
// Note: no need to update last_batch_group_size_ here since the batch writes
// to WAL only
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : write_group) {
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
size_t pre_release_callback_cnt = 0; size_t pre_release_callback_cnt = 0;
size_t total_byte_size = 0; size_t total_byte_size = 0;
@ -1036,6 +1024,23 @@ Status DBImpl::WriteImplWALOnly(
} }
} }
// Note: no need to update last_batch_group_size_ here since the batch writes
// to WAL only
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : write_group) {
if (writer->CallbackFailed()) {
continue;
}
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
const bool concurrent_update = true; const bool concurrent_update = true;
// Update stats while we are an exclusive group leader, so we know // Update stats while we are an exclusive group leader, so we know
// that nobody else can be writing to these particular stats. // that nobody else can be writing to these particular stats.
@ -1081,7 +1086,7 @@ Status DBImpl::WriteImplWALOnly(
// This error checking and return is moved up to avoid using uninitialized // This error checking and return is moved up to avoid using uninitialized
// last_sequence. // last_sequence.
if (!io_s.ok()) { if (!io_s.ok()) {
IOStatusCheck(io_s); WALIOStatusCheck(io_s);
write_thread->ExitAsBatchGroupLeader(write_group, status); write_thread->ExitAsBatchGroupLeader(write_group, status);
return status; return status;
} }
@ -1179,7 +1184,7 @@ void DBImpl::WriteStatusCheck(const Status& status) {
} }
} }
void DBImpl::IOStatusCheck(const IOStatus& io_status) { void DBImpl::WALIOStatusCheck(const IOStatus& io_status) {
// Is setting bg_error_ enough here? This will at least stop // Is setting bg_error_ enough here? This will at least stop
// compaction and fail any further writes. // compaction and fail any further writes.
if ((immutable_db_options_.paranoid_checks && !io_status.ok() && if ((immutable_db_options_.paranoid_checks && !io_status.ok() &&
@ -1187,7 +1192,8 @@ void DBImpl::IOStatusCheck(const IOStatus& io_status) {
io_status.IsIOFenced()) { io_status.IsIOFenced()) {
mutex_.Lock(); mutex_.Lock();
// Maybe change the return status to void? // Maybe change the return status to void?
error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback); error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback,
/*wal_related=*/true);
mutex_.Unlock(); mutex_.Unlock();
} else { } else {
// Force writable file to be continue writable. // Force writable file to be continue writable.
@ -1484,9 +1490,14 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
if (!io_s.ok()) { if (!io_s.ok()) {
break; break;
} }
io_s = log.writer->file()->Sync(opts, immutable_db_options_.use_fsync); // If last sync failed on a later WAL, this could be a fully synced
if (!io_s.ok()) { // and closed WAL that just needs to be recorded as synced in the
break; // manifest.
if (auto* f = log.writer->file()) {
io_s = f->Sync(opts, immutable_db_options_.use_fsync);
if (!io_s.ok()) {
break;
}
} }
} }
} }
@ -1599,6 +1610,8 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
Status DBImpl::WriteRecoverableState() { Status DBImpl::WriteRecoverableState() {
mutex_.AssertHeld(); mutex_.AssertHeld();
if (!cached_recoverable_state_empty_) { if (!cached_recoverable_state_empty_) {
// Only for write-prepared and write-unprepared.
assert(seq_per_batch_);
bool dont_care_bool; bool dont_care_bool;
SequenceNumber next_seq; SequenceNumber next_seq;
if (two_write_queues_) { if (two_write_queues_) {
@ -1788,13 +1801,13 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
if (!immutable_db_options_.atomic_flush) { if (!immutable_db_options_.atomic_flush) {
FlushRequest flush_req; FlushRequest flush_req;
GenerateFlushRequest({cfd}, FlushReason::kWalFull, &flush_req); GenerateFlushRequest({cfd}, FlushReason::kWalFull, &flush_req);
SchedulePendingFlush(flush_req); EnqueuePendingFlush(flush_req);
} }
} }
if (immutable_db_options_.atomic_flush) { if (immutable_db_options_.atomic_flush) {
FlushRequest flush_req; FlushRequest flush_req;
GenerateFlushRequest(cfds, FlushReason::kWalFull, &flush_req); GenerateFlushRequest(cfds, FlushReason::kWalFull, &flush_req);
SchedulePendingFlush(flush_req); EnqueuePendingFlush(flush_req);
} }
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
} }
@ -1880,13 +1893,13 @@ Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
FlushRequest flush_req; FlushRequest flush_req;
GenerateFlushRequest({cfd}, FlushReason::kWriteBufferManager, GenerateFlushRequest({cfd}, FlushReason::kWriteBufferManager,
&flush_req); &flush_req);
SchedulePendingFlush(flush_req); EnqueuePendingFlush(flush_req);
} }
} }
if (immutable_db_options_.atomic_flush) { if (immutable_db_options_.atomic_flush) {
FlushRequest flush_req; FlushRequest flush_req;
GenerateFlushRequest(cfds, FlushReason::kWriteBufferManager, &flush_req); GenerateFlushRequest(cfds, FlushReason::kWriteBufferManager, &flush_req);
SchedulePendingFlush(flush_req); EnqueuePendingFlush(flush_req);
} }
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
} }
@ -2162,12 +2175,12 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) {
AssignAtomicFlushSeq(cfds); AssignAtomicFlushSeq(cfds);
FlushRequest flush_req; FlushRequest flush_req;
GenerateFlushRequest(cfds, FlushReason::kWriteBufferFull, &flush_req); GenerateFlushRequest(cfds, FlushReason::kWriteBufferFull, &flush_req);
SchedulePendingFlush(flush_req); EnqueuePendingFlush(flush_req);
} else { } else {
for (auto* cfd : cfds) { for (auto* cfd : cfds) {
FlushRequest flush_req; FlushRequest flush_req;
GenerateFlushRequest({cfd}, FlushReason::kWriteBufferFull, &flush_req); GenerateFlushRequest({cfd}, FlushReason::kWriteBufferFull, &flush_req);
SchedulePendingFlush(flush_req); EnqueuePendingFlush(flush_req);
} }
} }
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
@ -2240,8 +2253,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
memtable_info.cf_name = cfd->GetName(); memtable_info.cf_name = cfd->GetName();
memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber(); memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber(); memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
memtable_info.num_entries = cfd->mem()->num_entries(); memtable_info.num_entries = cfd->mem()->NumEntries();
memtable_info.num_deletes = cfd->mem()->num_deletes(); memtable_info.num_deletes = cfd->mem()->NumDeletion();
if (!cfd->ioptions()->persist_user_defined_timestamps && if (!cfd->ioptions()->persist_user_defined_timestamps &&
cfd->user_comparator()->timestamp_size() > 0) { cfd->user_comparator()->timestamp_size() > 0) {
const Slice& newest_udt = cfd->mem()->GetNewestUDT(); const Slice& newest_udt = cfd->mem()->GetNewestUDT();
@ -2325,7 +2338,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
// We may have lost data from the WritableFileBuffer in-memory buffer for // We may have lost data from the WritableFileBuffer in-memory buffer for
// the current log, so treat it as a fatal error and set bg_error // the current log, so treat it as a fatal error and set bg_error
if (!io_s.ok()) { if (!io_s.ok()) {
error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable); error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable,
/*wal_related=*/true);
} else { } else {
error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
} }

View File

@ -27,12 +27,14 @@ class CorruptionFS : public FileSystemWrapper {
num_writable_file_errors_(0), num_writable_file_errors_(0),
corruption_trigger_(INT_MAX), corruption_trigger_(INT_MAX),
read_count_(0), read_count_(0),
corrupt_offset_(0),
corrupt_len_(0),
rnd_(300), rnd_(300),
fs_buffer_(fs_buffer), fs_buffer_(fs_buffer),
verify_read_(verify_read) {} verify_read_(verify_read) {}
~CorruptionFS() override { ~CorruptionFS() override {
// Assert that the corruption was reset, which means it got triggered // Assert that the corruption was reset, which means it got triggered
assert(corruption_trigger_ == INT_MAX); assert(corruption_trigger_ == INT_MAX || corrupt_len_ > 0);
} }
const char* Name() const override { return "ErrorEnv"; } const char* Name() const override { return "ErrorEnv"; }
@ -48,8 +50,10 @@ class CorruptionFS : public FileSystemWrapper {
} }
void SetCorruptionTrigger(const int trigger) { void SetCorruptionTrigger(const int trigger) {
MutexLock l(&mutex_);
corruption_trigger_ = trigger; corruption_trigger_ = trigger;
read_count_ = 0; read_count_ = 0;
corrupt_fname_.clear();
} }
IOStatus NewRandomAccessFile(const std::string& fname, IOStatus NewRandomAccessFile(const std::string& fname,
@ -58,25 +62,31 @@ class CorruptionFS : public FileSystemWrapper {
IODebugContext* dbg) override { IODebugContext* dbg) override {
class CorruptionRandomAccessFile : public FSRandomAccessFileOwnerWrapper { class CorruptionRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
public: public:
CorruptionRandomAccessFile(CorruptionFS& fs, CorruptionRandomAccessFile(CorruptionFS& fs, const std::string& fname,
std::unique_ptr<FSRandomAccessFile>& file) std::unique_ptr<FSRandomAccessFile>& file)
: FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {} : FSRandomAccessFileOwnerWrapper(std::move(file)),
fs_(fs),
fname_(fname) {}
IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts, IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts,
Slice* result, char* scratch, Slice* result, char* scratch,
IODebugContext* dbg) const override { IODebugContext* dbg) const override {
IOStatus s = target()->Read(offset, len, opts, result, scratch, dbg); IOStatus s = target()->Read(offset, len, opts, result, scratch, dbg);
if (opts.verify_and_reconstruct_read) { if (opts.verify_and_reconstruct_read) {
fs_.MaybeResetOverlapWithCorruptedChunk(fname_, offset,
result->size());
return s; return s;
} }
MutexLock l(&fs_.mutex_);
if (s.ok() && ++fs_.read_count_ >= fs_.corruption_trigger_) { if (s.ok() && ++fs_.read_count_ >= fs_.corruption_trigger_) {
fs_.read_count_ = 0;
fs_.corruption_trigger_ = INT_MAX; fs_.corruption_trigger_ = INT_MAX;
char* data = const_cast<char*>(result->data()); char* data = const_cast<char*>(result->data());
std::memcpy( std::memcpy(
data, data,
fs_.rnd_.RandomString(static_cast<int>(result->size())).c_str(), fs_.rnd_.RandomString(static_cast<int>(result->size())).c_str(),
result->size()); result->size());
fs_.SetCorruptedChunk(fname_, offset, result->size());
} }
return s; return s;
} }
@ -101,14 +111,76 @@ class CorruptionFS : public FileSystemWrapper {
return IOStatus::OK(); return IOStatus::OK();
} }
IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/,
const IOOptions& /*options*/,
IODebugContext* /*dbg*/) override {
return IOStatus::NotSupported("Prefetch");
}
private: private:
CorruptionFS& fs_; CorruptionFS& fs_;
std::string fname_;
}; };
std::unique_ptr<FSRandomAccessFile> file; std::unique_ptr<FSRandomAccessFile> file;
IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg); IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
EXPECT_OK(s); EXPECT_OK(s);
result->reset(new CorruptionRandomAccessFile(*this, file)); result->reset(new CorruptionRandomAccessFile(*this, fname, file));
return s;
}
IOStatus NewSequentialFile(const std::string& fname,
const FileOptions& file_opts,
std::unique_ptr<FSSequentialFile>* result,
IODebugContext* dbg) override {
class CorruptionSequentialFile : public FSSequentialFileOwnerWrapper {
public:
CorruptionSequentialFile(CorruptionFS& fs, const std::string& fname,
std::unique_ptr<FSSequentialFile>& file)
: FSSequentialFileOwnerWrapper(std::move(file)),
fs_(fs),
fname_(fname),
offset_(0) {}
IOStatus Read(size_t len, const IOOptions& opts, Slice* result,
char* scratch, IODebugContext* dbg) override {
IOStatus s = target()->Read(len, opts, result, scratch, dbg);
if (result->size() == 0 ||
fname_.find("IDENTITY") != std::string::npos) {
return s;
}
if (opts.verify_and_reconstruct_read) {
fs_.MaybeResetOverlapWithCorruptedChunk(fname_, offset_,
result->size());
return s;
}
MutexLock l(&fs_.mutex_);
if (s.ok() && ++fs_.read_count_ >= fs_.corruption_trigger_) {
fs_.corruption_trigger_ = INT_MAX;
char* data = const_cast<char*>(result->data());
std::memcpy(
data,
fs_.rnd_.RandomString(static_cast<int>(result->size())).c_str(),
result->size());
fs_.SetCorruptedChunk(fname_, offset_, result->size());
}
offset_ += result->size();
return s;
}
private:
CorruptionFS& fs_;
std::string fname_;
size_t offset_;
};
std::unique_ptr<FSSequentialFile> file;
IOStatus s = target()->NewSequentialFile(fname, file_opts, &file, dbg);
EXPECT_OK(s);
result->reset(new CorruptionSequentialFile(*this, fname, file));
return s; return s;
} }
@ -123,12 +195,40 @@ class CorruptionFS : public FileSystemWrapper {
} }
} }
void SetCorruptedChunk(const std::string& fname, size_t offset, size_t len) {
assert(corrupt_fname_.empty());
corrupt_fname_ = fname;
corrupt_offset_ = offset;
corrupt_len_ = len;
}
void MaybeResetOverlapWithCorruptedChunk(const std::string& fname,
size_t offset, size_t len) {
if (fname == corrupt_fname_ &&
((offset <= corrupt_offset_ && (offset + len) > corrupt_offset_) ||
(offset >= corrupt_offset_ &&
offset < (corrupt_offset_ + corrupt_len_)))) {
corrupt_fname_.clear();
}
}
bool VerifyRetry() { return corrupt_len_ > 0 && corrupt_fname_.empty(); }
int read_count() { return read_count_; }
int corruption_trigger() { return corruption_trigger_; }
private: private:
int corruption_trigger_; int corruption_trigger_;
int read_count_; int read_count_;
std::string corrupt_fname_;
size_t corrupt_offset_;
size_t corrupt_len_;
Random rnd_; Random rnd_;
bool fs_buffer_; bool fs_buffer_;
bool verify_read_; bool verify_read_;
port::Mutex mutex_;
}; };
} // anonymous namespace } // anonymous namespace
@ -705,6 +805,7 @@ class DBIOCorruptionTest
DBIOCorruptionTest() : DBIOFailureTest() { DBIOCorruptionTest() : DBIOFailureTest() {
BlockBasedTableOptions bbto; BlockBasedTableOptions bbto;
options_ = CurrentOptions(); options_ = CurrentOptions();
options_.statistics = CreateDBStatistics();
base_env_ = env_; base_env_ = env_;
EXPECT_NE(base_env_, nullptr); EXPECT_NE(base_env_, nullptr);
@ -716,6 +817,7 @@ class DBIOCorruptionTest
bbto.num_file_reads_for_auto_readahead = 0; bbto.num_file_reads_for_auto_readahead = 0;
options_.table_factory.reset(NewBlockBasedTableFactory(bbto)); options_.table_factory.reset(NewBlockBasedTableFactory(bbto));
options_.disable_auto_compactions = true; options_.disable_auto_compactions = true;
options_.max_file_opening_threads = 0;
Reopen(options_); Reopen(options_);
} }
@ -727,6 +829,8 @@ class DBIOCorruptionTest
Status ReopenDB() { return TryReopen(options_); } Status ReopenDB() { return TryReopen(options_); }
Statistics* stats() { return options_.statistics.get(); }
protected: protected:
std::unique_ptr<Env> env_guard_; std::unique_ptr<Env> env_guard_;
std::shared_ptr<CorruptionFS> fs_; std::shared_ptr<CorruptionFS> fs_;
@ -749,8 +853,12 @@ TEST_P(DBIOCorruptionTest, GetReadCorruptionRetry) {
if (std::get<2>(GetParam())) { if (std::get<2>(GetParam())) {
ASSERT_OK(s); ASSERT_OK(s);
ASSERT_EQ(val, "val1"); ASSERT_EQ(val, "val1");
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else { } else {
ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
} }
} }
@ -773,8 +881,12 @@ TEST_P(DBIOCorruptionTest, IterReadCorruptionRetry) {
} }
if (std::get<2>(GetParam())) { if (std::get<2>(GetParam())) {
ASSERT_OK(iter->status()); ASSERT_OK(iter->status());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else { } else {
ASSERT_TRUE(iter->status().IsCorruption()); ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
} }
delete iter; delete iter;
} }
@ -799,9 +911,13 @@ TEST_P(DBIOCorruptionTest, MultiGetReadCorruptionRetry) {
if (std::get<2>(GetParam())) { if (std::get<2>(GetParam())) {
ASSERT_EQ(values[0].ToString(), "val1"); ASSERT_EQ(values[0].ToString(), "val1");
ASSERT_EQ(values[1].ToString(), "val2"); ASSERT_EQ(values[1].ToString(), "val2");
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else { } else {
ASSERT_TRUE(statuses[0].IsCorruption()); ASSERT_TRUE(statuses[0].IsCorruption());
ASSERT_TRUE(statuses[1].IsCorruption()); ASSERT_TRUE(statuses[1].IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
} }
} }
@ -818,6 +934,9 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
if (std::get<2>(GetParam())) { if (std::get<2>(GetParam())) {
ASSERT_OK(s); ASSERT_OK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
std::string val; std::string val;
ReadOptions ro; ReadOptions ro;
@ -826,6 +945,7 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
ASSERT_EQ(val, "val1"); ASSERT_EQ(val, "val1");
} else { } else {
ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
} }
} }
@ -838,6 +958,9 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
Status s = Flush(); Status s = Flush();
if (std::get<2>(GetParam())) { if (std::get<2>(GetParam())) {
ASSERT_OK(s); ASSERT_OK(s);
ASSERT_GT(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_GT(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
std::string val; std::string val;
ReadOptions ro; ReadOptions ro;
@ -846,6 +969,7 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
ASSERT_EQ(val, "val1"); ASSERT_EQ(val, "val1");
} else { } else {
ASSERT_NOK(s); ASSERT_NOK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
} }
} }
@ -862,12 +986,142 @@ TEST_P(DBIOCorruptionTest, ManifestCorruptionRetry) {
if (std::get<2>(GetParam())) { if (std::get<2>(GetParam())) {
ASSERT_OK(ReopenDB()); ASSERT_OK(ReopenDB());
ASSERT_GT(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_GT(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else { } else {
ASSERT_EQ(ReopenDB(), Status::Corruption()); ASSERT_EQ(ReopenDB(), Status::Corruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
} }
SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->DisableProcessing();
} }
TEST_P(DBIOCorruptionTest, FooterReadCorruptionRetry) {
Random rnd(300);
bool retry = false;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"ReadFooterFromFileInternal:0", [&](void* arg) {
Slice* data = static_cast<Slice*>(arg);
if (!retry) {
std::memcpy(const_cast<char*>(data->data()),
rnd.RandomString(static_cast<int>(data->size())).c_str(),
data->size());
retry = true;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(Put("key1", "val1"));
Status s = Flush();
if (std::get<2>(GetParam())) {
ASSERT_OK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
std::string val;
ReadOptions ro;
ro.async_io = std::get<1>(GetParam());
ASSERT_OK(dbfull()->Get(ro, "key1", &val));
ASSERT_EQ(val, "val1");
} else {
ASSERT_NOK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
ASSERT_GT(stats()->getTickerCount(SST_FOOTER_CORRUPTION_COUNT), 0);
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_P(DBIOCorruptionTest, TablePropertiesCorruptionRetry) {
Random rnd(300);
bool retry = false;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"ReadTablePropertiesHelper:0", [&](void* arg) {
Slice* data = static_cast<Slice*>(arg);
if (!retry) {
std::memcpy(const_cast<char*>(data->data()),
rnd.RandomString(static_cast<int>(data->size())).c_str(),
data->size());
retry = true;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(Put("key1", "val1"));
Status s = Flush();
if (std::get<2>(GetParam())) {
ASSERT_OK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
std::string val;
ReadOptions ro;
ro.async_io = std::get<1>(GetParam());
ASSERT_OK(dbfull()->Get(ro, "key1", &val));
ASSERT_EQ(val, "val1");
} else {
ASSERT_NOK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_P(DBIOCorruptionTest, DBOpenReadCorruptionRetry) {
if (!std::get<2>(GetParam())) {
return;
}
CorruptionFS* fs =
static_cast<CorruptionFS*>(env_guard_->GetFileSystem().get());
for (int sst = 0; sst < 3; ++sst) {
for (int key = 0; key < 100; ++key) {
std::stringstream ss;
ss << std::setw(3) << 100 * sst + key;
ASSERT_OK(Put("key" + ss.str(), "val" + ss.str()));
}
ASSERT_OK(Flush());
}
Close();
// DB open will create table readers unless we reduce the table cache
// capacity.
// SanitizeOptions will set max_open_files to minimum of 20. Table cache
// is allocated with max_open_files - 10 as capacity. So override
// max_open_files to 11 so table cache capacity will become 1. This will
// prevent file open during DB open and force the file to be opened
// during MultiGet
SyncPoint::GetInstance()->SetCallBack(
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
int* max_open_files = (int*)arg;
*max_open_files = 11;
});
SyncPoint::GetInstance()->EnableProcessing();
// Progressively increase the IO count trigger for corruption, and verify
// that it was retried
int corruption_trigger = 1;
fs->SetCorruptionTrigger(corruption_trigger);
do {
fs->SetCorruptionTrigger(corruption_trigger);
ASSERT_OK(ReopenDB());
for (int sst = 0; sst < 3; ++sst) {
for (int key = 0; key < 100; ++key) {
std::stringstream ss;
ss << std::setw(3) << 100 * sst + key;
ASSERT_EQ(Get("key" + ss.str()), "val" + ss.str());
}
}
// Verify that the injected corruption was repaired
ASSERT_TRUE(fs->VerifyRetry());
corruption_trigger++;
} while (fs->corruption_trigger() == INT_MAX);
}
// The parameters are - 1. Use FS provided buffer, 2. Use async IO ReadOption, // The parameters are - 1. Use FS provided buffer, 2. Use async IO ReadOption,
// 3. Retry with verify_and_reconstruct_read IOOption // 3. Retry with verify_and_reconstruct_read IOOption
INSTANTIATE_TEST_CASE_P(DBIOCorruptionTest, DBIOCorruptionTest, INSTANTIATE_TEST_CASE_P(DBIOCorruptionTest, DBIOCorruptionTest,

View File

@ -52,7 +52,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
user_comparator_(cmp), user_comparator_(cmp),
merge_operator_(ioptions.merge_operator.get()), merge_operator_(ioptions.merge_operator.get()),
iter_(iter), iter_(iter),
version_(version), blob_reader_(version, read_options.read_tier,
read_options.verify_checksums, read_options.fill_cache,
read_options.io_activity),
read_callback_(read_callback), read_callback_(read_callback),
sequence_(s), sequence_(s),
statistics_(ioptions.stats), statistics_(ioptions.stats),
@ -65,20 +67,16 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
valid_(false), valid_(false),
current_entry_is_merged_(false), current_entry_is_merged_(false),
is_key_seqnum_zero_(false), is_key_seqnum_zero_(false),
prefix_same_as_start_(mutable_cf_options.prefix_extractor prefix_same_as_start_(
? read_options.prefix_same_as_start prefix_extractor_ ? read_options.prefix_same_as_start : false),
: false),
pin_thru_lifetime_(read_options.pin_data), pin_thru_lifetime_(read_options.pin_data),
expect_total_order_inner_iter_(prefix_extractor_ == nullptr || expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
read_options.total_order_seek || read_options.total_order_seek ||
read_options.auto_prefix_mode), read_options.auto_prefix_mode),
read_tier_(read_options.read_tier),
fill_cache_(read_options.fill_cache),
verify_checksums_(read_options.verify_checksums),
expose_blob_index_(expose_blob_index), expose_blob_index_(expose_blob_index),
allow_unprepared_value_(read_options.allow_unprepared_value),
is_blob_(false), is_blob_(false),
arena_mode_(arena_mode), arena_mode_(arena_mode),
io_activity_(read_options.io_activity),
cfh_(cfh), cfh_(cfh),
timestamp_ub_(read_options.timestamp), timestamp_ub_(read_options.timestamp),
timestamp_lb_(read_options.iter_start_ts), timestamp_lb_(read_options.iter_start_ts),
@ -93,6 +91,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
status_.PermitUncheckedError(); status_.PermitUncheckedError();
assert(timestamp_size_ == assert(timestamp_size_ ==
user_comparator_.user_comparator()->timestamp_size()); user_comparator_.user_comparator()->timestamp_size());
// prefix_seek_opt_in_only should force total_order_seek whereever the caller
// is duplicating the original ReadOptions
assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
} }
Status DBIter::GetProperty(std::string prop_name, std::string* prop) { Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
@ -149,7 +150,7 @@ void DBIter::Next() {
PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_); PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_);
// Release temporarily pinned blocks from last operation // Release temporarily pinned blocks from last operation
ReleaseTempPinnedData(); ReleaseTempPinnedData();
ResetBlobValue(); ResetBlobData();
ResetValueAndColumns(); ResetValueAndColumns();
local_stats_.skip_count_ += num_internal_keys_skipped_; local_stats_.skip_count_ += num_internal_keys_skipped_;
local_stats_.skip_count_--; local_stats_.skip_count_--;
@ -192,29 +193,21 @@ void DBIter::Next() {
} }
} }
bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, Status DBIter::BlobReader::RetrieveAndSetBlobValue(const Slice& user_key,
const Slice& blob_index) { const Slice& blob_index) {
assert(!is_blob_);
assert(blob_value_.empty()); assert(blob_value_.empty());
if (expose_blob_index_) { // Stacked BlobDB implementation
is_blob_ = true;
return true;
}
if (!version_) { if (!version_) {
status_ = Status::Corruption("Encountered unexpected blob index."); return Status::Corruption("Encountered unexpected blob index.");
valid_ = false;
return false;
} }
// TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to
// avoid having to copy options back and forth. // avoid having to copy options back and forth.
// TODO: plumb Env::IOActivity, Env::IOPriority // TODO: plumb Env::IOPriority
ReadOptions read_options; ReadOptions read_options;
read_options.read_tier = read_tier_; read_options.read_tier = read_tier_;
read_options.fill_cache = fill_cache_;
read_options.verify_checksums = verify_checksums_; read_options.verify_checksums = verify_checksums_;
read_options.fill_cache = fill_cache_;
read_options.io_activity = io_activity_; read_options.io_activity = io_activity_;
constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
constexpr uint64_t* bytes_read = nullptr; constexpr uint64_t* bytes_read = nullptr;
@ -222,16 +215,51 @@ bool DBIter::SetBlobValueIfNeeded(const Slice& user_key,
const Status s = version_->GetBlob(read_options, user_key, blob_index, const Status s = version_->GetBlob(read_options, user_key, blob_index,
prefetch_buffer, &blob_value_, bytes_read); prefetch_buffer, &blob_value_, bytes_read);
if (!s.ok()) {
return s;
}
return Status::OK();
}
bool DBIter::SetValueAndColumnsFromBlobImpl(const Slice& user_key,
const Slice& blob_index) {
const Status s = blob_reader_.RetrieveAndSetBlobValue(user_key, blob_index);
if (!s.ok()) { if (!s.ok()) {
status_ = s; status_ = s;
valid_ = false; valid_ = false;
is_blob_ = false;
return false; return false;
} }
is_blob_ = true; SetValueAndColumnsFromPlain(blob_reader_.GetBlobValue());
return true; return true;
} }
bool DBIter::SetValueAndColumnsFromBlob(const Slice& user_key,
const Slice& blob_index) {
assert(!is_blob_);
is_blob_ = true;
if (expose_blob_index_) {
SetValueAndColumnsFromPlain(blob_index);
return true;
}
if (allow_unprepared_value_) {
assert(value_.empty());
assert(wide_columns_.empty());
assert(lazy_blob_index_.empty());
lazy_blob_index_ = blob_index;
return true;
}
return SetValueAndColumnsFromBlobImpl(user_key, blob_index);
}
bool DBIter::SetValueAndColumnsFromEntity(Slice slice) { bool DBIter::SetValueAndColumnsFromEntity(Slice slice) {
assert(value_.empty()); assert(value_.empty());
assert(wide_columns_.empty()); assert(wide_columns_.empty());
@ -277,6 +305,24 @@ bool DBIter::SetValueAndColumnsFromMergeResult(const Status& merge_status,
return true; return true;
} }
bool DBIter::PrepareValue() {
assert(valid_);
if (lazy_blob_index_.empty()) {
return true;
}
assert(allow_unprepared_value_);
assert(is_blob_);
const bool result =
SetValueAndColumnsFromBlobImpl(saved_key_.GetUserKey(), lazy_blob_index_);
lazy_blob_index_.clear();
return result;
}
// PRE: saved_key_ has the current user key if skipping_saved_key // PRE: saved_key_ has the current user key if skipping_saved_key
// POST: saved_key_ should have the next user key if valid_, // POST: saved_key_ should have the next user key if valid_,
// if the current entry is a result of merge // if the current entry is a result of merge
@ -406,7 +452,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
case kTypeValuePreferredSeqno: case kTypeValuePreferredSeqno:
case kTypeBlobIndex: case kTypeBlobIndex:
case kTypeWideColumnEntity: case kTypeWideColumnEntity:
if (!PrepareValue()) { if (!PrepareValueInternal()) {
return false; return false;
} }
if (timestamp_lb_) { if (timestamp_lb_) {
@ -418,12 +464,9 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
} }
if (ikey_.type == kTypeBlobIndex) { if (ikey_.type == kTypeBlobIndex) {
if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { if (!SetValueAndColumnsFromBlob(ikey_.user_key, iter_.value())) {
return false; return false;
} }
SetValueAndColumnsFromPlain(expose_blob_index_ ? iter_.value()
: blob_value_);
} else if (ikey_.type == kTypeWideColumnEntity) { } else if (ikey_.type == kTypeWideColumnEntity) {
if (!SetValueAndColumnsFromEntity(iter_.value())) { if (!SetValueAndColumnsFromEntity(iter_.value())) {
return false; return false;
@ -443,7 +486,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
return true; return true;
break; break;
case kTypeMerge: case kTypeMerge:
if (!PrepareValue()) { if (!PrepareValueInternal()) {
return false; return false;
} }
saved_key_.SetUserKey( saved_key_.SetUserKey(
@ -538,6 +581,8 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
} else { } else {
iter_.Next(); iter_.Next();
} }
// This could be a long-running operation due to tombstones, etc.
ROCKSDB_THREAD_YIELD_HOOK();
} while (iter_.Valid()); } while (iter_.Valid());
valid_ = false; valid_ = false;
@ -588,7 +633,7 @@ bool DBIter::MergeValuesNewToOld() {
iter_.Next(); iter_.Next();
break; break;
} }
if (!PrepareValue()) { if (!PrepareValueInternal()) {
return false; return false;
} }
@ -617,23 +662,9 @@ bool DBIter::MergeValuesNewToOld() {
iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
PERF_COUNTER_ADD(internal_merge_count, 1); PERF_COUNTER_ADD(internal_merge_count, 1);
} else if (kTypeBlobIndex == ikey.type) { } else if (kTypeBlobIndex == ikey.type) {
if (expose_blob_index_) { if (!MergeWithBlobBaseValue(iter_.value(), ikey.user_key)) {
status_ =
Status::NotSupported("BlobDB does not support merge operator.");
valid_ = false;
return false; return false;
} }
// hit a put, merge the put value with operands and store the
// final result in saved_value_. We are done!
if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
return false;
}
valid_ = true;
if (!MergeWithPlainBaseValue(blob_value_, ikey.user_key)) {
return false;
}
ResetBlobValue();
// iter_ is positioned after put // iter_ is positioned after put
iter_.Next(); iter_.Next();
@ -641,6 +672,7 @@ bool DBIter::MergeValuesNewToOld() {
valid_ = false; valid_ = false;
return false; return false;
} }
return true; return true;
} else if (kTypeWideColumnEntity == ikey.type) { } else if (kTypeWideColumnEntity == ikey.type) {
if (!MergeWithWideColumnBaseValue(iter_.value(), ikey.user_key)) { if (!MergeWithWideColumnBaseValue(iter_.value(), ikey.user_key)) {
@ -687,7 +719,7 @@ void DBIter::Prev() {
PERF_COUNTER_ADD(iter_prev_count, 1); PERF_COUNTER_ADD(iter_prev_count, 1);
PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_); PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_);
ReleaseTempPinnedData(); ReleaseTempPinnedData();
ResetBlobValue(); ResetBlobData();
ResetValueAndColumns(); ResetValueAndColumns();
ResetInternalKeysSkippedCounter(); ResetInternalKeysSkippedCounter();
bool ok = true; bool ok = true;
@ -924,7 +956,7 @@ bool DBIter::FindValueForCurrentKey() {
return FindValueForCurrentKeyUsingSeek(); return FindValueForCurrentKeyUsingSeek();
} }
if (!PrepareValue()) { if (!PrepareValueInternal()) {
return false; return false;
} }
@ -1039,21 +1071,9 @@ bool DBIter::FindValueForCurrentKey() {
} }
return true; return true;
} else if (last_not_merge_type == kTypeBlobIndex) { } else if (last_not_merge_type == kTypeBlobIndex) {
if (expose_blob_index_) { if (!MergeWithBlobBaseValue(pinned_value_, saved_key_.GetUserKey())) {
status_ =
Status::NotSupported("BlobDB does not support merge operator.");
valid_ = false;
return false; return false;
} }
if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
return false;
}
valid_ = true;
if (!MergeWithPlainBaseValue(blob_value_, saved_key_.GetUserKey())) {
return false;
}
ResetBlobValue();
return true; return true;
} else if (last_not_merge_type == kTypeWideColumnEntity) { } else if (last_not_merge_type == kTypeWideColumnEntity) {
@ -1078,13 +1098,9 @@ bool DBIter::FindValueForCurrentKey() {
break; break;
case kTypeBlobIndex: case kTypeBlobIndex:
if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { if (!SetValueAndColumnsFromBlob(saved_key_.GetUserKey(), pinned_value_)) {
return false; return false;
} }
SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
: blob_value_);
break; break;
case kTypeWideColumnEntity: case kTypeWideColumnEntity:
if (!SetValueAndColumnsFromEntity(pinned_value_)) { if (!SetValueAndColumnsFromEntity(pinned_value_)) {
@ -1171,7 +1187,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
} }
return true; return true;
} }
if (!PrepareValue()) { if (!PrepareValueInternal()) {
return false; return false;
} }
if (timestamp_size_ > 0) { if (timestamp_size_ > 0) {
@ -1188,12 +1204,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
pinned_value_ = iter_.value(); pinned_value_ = iter_.value();
} }
if (ikey.type == kTypeBlobIndex) { if (ikey.type == kTypeBlobIndex) {
if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) { if (!SetValueAndColumnsFromBlob(ikey.user_key, pinned_value_)) {
return false; return false;
} }
SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
: blob_value_);
} else if (ikey.type == kTypeWideColumnEntity) { } else if (ikey.type == kTypeWideColumnEntity) {
if (!SetValueAndColumnsFromEntity(pinned_value_)) { if (!SetValueAndColumnsFromEntity(pinned_value_)) {
return false; return false;
@ -1241,7 +1254,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
ikey.type == kTypeDeletionWithTimestamp) { ikey.type == kTypeDeletionWithTimestamp) {
break; break;
} }
if (!PrepareValue()) { if (!PrepareValueInternal()) {
return false; return false;
} }
@ -1259,21 +1272,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
PERF_COUNTER_ADD(internal_merge_count, 1); PERF_COUNTER_ADD(internal_merge_count, 1);
} else if (ikey.type == kTypeBlobIndex) { } else if (ikey.type == kTypeBlobIndex) {
if (expose_blob_index_) { if (!MergeWithBlobBaseValue(iter_.value(), saved_key_.GetUserKey())) {
status_ =
Status::NotSupported("BlobDB does not support merge operator.");
valid_ = false;
return false; return false;
} }
if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
return false;
}
valid_ = true;
if (!MergeWithPlainBaseValue(blob_value_, saved_key_.GetUserKey())) {
return false;
}
ResetBlobValue();
return true; return true;
} else if (ikey.type == kTypeWideColumnEntity) { } else if (ikey.type == kTypeWideColumnEntity) {
@ -1340,6 +1341,35 @@ bool DBIter::MergeWithPlainBaseValue(const Slice& value,
return SetValueAndColumnsFromMergeResult(s, result_type); return SetValueAndColumnsFromMergeResult(s, result_type);
} }
bool DBIter::MergeWithBlobBaseValue(const Slice& blob_index,
const Slice& user_key) {
assert(!is_blob_);
if (expose_blob_index_) {
status_ =
Status::NotSupported("Legacy BlobDB does not support merge operator.");
valid_ = false;
return false;
}
const Status s = blob_reader_.RetrieveAndSetBlobValue(user_key, blob_index);
if (!s.ok()) {
status_ = s;
valid_ = false;
return false;
}
valid_ = true;
if (!MergeWithPlainBaseValue(blob_reader_.GetBlobValue(), user_key)) {
return false;
}
blob_reader_.ResetBlobValue();
return true;
}
bool DBIter::MergeWithWideColumnBaseValue(const Slice& entity, bool DBIter::MergeWithWideColumnBaseValue(const Slice& entity,
const Slice& user_key) { const Slice& user_key) {
// `op_failure_scope` (an output parameter) is not provided (set to nullptr) // `op_failure_scope` (an output parameter) is not provided (set to nullptr)
@ -1529,7 +1559,7 @@ void DBIter::Seek(const Slice& target) {
status_ = Status::OK(); status_ = Status::OK();
ReleaseTempPinnedData(); ReleaseTempPinnedData();
ResetBlobValue(); ResetBlobData();
ResetValueAndColumns(); ResetValueAndColumns();
ResetInternalKeysSkippedCounter(); ResetInternalKeysSkippedCounter();
@ -1605,7 +1635,7 @@ void DBIter::SeekForPrev(const Slice& target) {
status_ = Status::OK(); status_ = Status::OK();
ReleaseTempPinnedData(); ReleaseTempPinnedData();
ResetBlobValue(); ResetBlobData();
ResetValueAndColumns(); ResetValueAndColumns();
ResetInternalKeysSkippedCounter(); ResetInternalKeysSkippedCounter();
@ -1666,7 +1696,7 @@ void DBIter::SeekToFirst() {
status_.PermitUncheckedError(); status_.PermitUncheckedError();
direction_ = kForward; direction_ = kForward;
ReleaseTempPinnedData(); ReleaseTempPinnedData();
ResetBlobValue(); ResetBlobData();
ResetValueAndColumns(); ResetValueAndColumns();
ResetInternalKeysSkippedCounter(); ResetInternalKeysSkippedCounter();
ClearSavedValue(); ClearSavedValue();
@ -1729,7 +1759,7 @@ void DBIter::SeekToLast() {
status_.PermitUncheckedError(); status_.PermitUncheckedError();
direction_ = kReverse; direction_ = kReverse;
ReleaseTempPinnedData(); ReleaseTempPinnedData();
ResetBlobValue(); ResetBlobData();
ResetValueAndColumns(); ResetValueAndColumns();
ResetInternalKeysSkippedCounter(); ResetInternalKeysSkippedCounter();
ClearSavedValue(); ClearSavedValue();

View File

@ -218,7 +218,34 @@ class DBIter final : public Iterator {
} }
void set_valid(bool v) { valid_ = v; } void set_valid(bool v) { valid_ = v; }
bool PrepareValue() override;
private: private:
class BlobReader {
public:
BlobReader(const Version* version, ReadTier read_tier,
bool verify_checksums, bool fill_cache,
Env::IOActivity io_activity)
: version_(version),
read_tier_(read_tier),
verify_checksums_(verify_checksums),
fill_cache_(fill_cache),
io_activity_(io_activity) {}
const Slice& GetBlobValue() const { return blob_value_; }
Status RetrieveAndSetBlobValue(const Slice& user_key,
const Slice& blob_index);
void ResetBlobValue() { blob_value_.Reset(); }
private:
PinnableSlice blob_value_;
const Version* version_;
ReadTier read_tier_;
bool verify_checksums_;
bool fill_cache_;
Env::IOActivity io_activity_;
};
// For all methods in this block: // For all methods in this block:
// PRE: iter_->Valid() && status_.ok() // PRE: iter_->Valid() && status_.ok()
// Return false if there was an error, and status() is non-ok, valid_ = false; // Return false if there was an error, and status() is non-ok, valid_ = false;
@ -299,15 +326,6 @@ class DBIter final : public Iterator {
: user_comparator_.CompareWithoutTimestamp(a, b); : user_comparator_.CompareWithoutTimestamp(a, b);
} }
// Retrieves the blob value for the specified user key using the given blob
// index when using the integrated BlobDB implementation.
bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index);
void ResetBlobValue() {
is_blob_ = false;
blob_value_.Reset();
}
void SetValueAndColumnsFromPlain(const Slice& slice) { void SetValueAndColumnsFromPlain(const Slice& slice) {
assert(value_.empty()); assert(value_.empty());
assert(wide_columns_.empty()); assert(wide_columns_.empty());
@ -316,6 +334,11 @@ class DBIter final : public Iterator {
wide_columns_.emplace_back(kDefaultWideColumnName, slice); wide_columns_.emplace_back(kDefaultWideColumnName, slice);
} }
bool SetValueAndColumnsFromBlobImpl(const Slice& user_key,
const Slice& blob_index);
bool SetValueAndColumnsFromBlob(const Slice& user_key,
const Slice& blob_index);
bool SetValueAndColumnsFromEntity(Slice slice); bool SetValueAndColumnsFromEntity(Slice slice);
bool SetValueAndColumnsFromMergeResult(const Status& merge_status, bool SetValueAndColumnsFromMergeResult(const Status& merge_status,
@ -326,14 +349,21 @@ class DBIter final : public Iterator {
wide_columns_.clear(); wide_columns_.clear();
} }
void ResetBlobData() {
blob_reader_.ResetBlobValue();
lazy_blob_index_.clear();
is_blob_ = false;
}
// The following methods perform the actual merge operation for the // The following methods perform the actual merge operation for the
// no base value/plain base value/wide-column base value cases. // no/plain/blob/wide-column base value cases.
// If user-defined timestamp is enabled, `user_key` includes timestamp. // If user-defined timestamp is enabled, `user_key` includes timestamp.
bool MergeWithNoBaseValue(const Slice& user_key); bool MergeWithNoBaseValue(const Slice& user_key);
bool MergeWithPlainBaseValue(const Slice& value, const Slice& user_key); bool MergeWithPlainBaseValue(const Slice& value, const Slice& user_key);
bool MergeWithBlobBaseValue(const Slice& blob_index, const Slice& user_key);
bool MergeWithWideColumnBaseValue(const Slice& entity, const Slice& user_key); bool MergeWithWideColumnBaseValue(const Slice& entity, const Slice& user_key);
bool PrepareValue() { bool PrepareValueInternal() {
if (!iter_.PrepareValue()) { if (!iter_.PrepareValue()) {
assert(!iter_.status().ok()); assert(!iter_.status().ok());
valid_ = false; valid_ = false;
@ -356,7 +386,7 @@ class DBIter final : public Iterator {
UserComparatorWrapper user_comparator_; UserComparatorWrapper user_comparator_;
const MergeOperator* const merge_operator_; const MergeOperator* const merge_operator_;
IteratorWrapper iter_; IteratorWrapper iter_;
const Version* version_; BlobReader blob_reader_;
ReadCallback* read_callback_; ReadCallback* read_callback_;
// Max visible sequence number. It is normally the snapshot seq unless we have // Max visible sequence number. It is normally the snapshot seq unless we have
// uncommitted data in db as in WriteUnCommitted. // uncommitted data in db as in WriteUnCommitted.
@ -376,7 +406,6 @@ class DBIter final : public Iterator {
std::string saved_value_; std::string saved_value_;
Slice pinned_value_; Slice pinned_value_;
// for prefix seek mode to support prev() // for prefix seek mode to support prev()
PinnableSlice blob_value_;
// Value of the default column // Value of the default column
Slice value_; Slice value_;
// All columns (i.e. name-value pairs) // All columns (i.e. name-value pairs)
@ -410,15 +439,13 @@ class DBIter final : public Iterator {
// Expect the inner iterator to maintain a total order. // Expect the inner iterator to maintain a total order.
// prefix_extractor_ must be non-NULL if the value is false. // prefix_extractor_ must be non-NULL if the value is false.
const bool expect_total_order_inner_iter_; const bool expect_total_order_inner_iter_;
ReadTier read_tier_;
bool fill_cache_;
bool verify_checksums_;
// Whether the iterator is allowed to expose blob references. Set to true when // Whether the iterator is allowed to expose blob references. Set to true when
// the stacked BlobDB implementation is used, false otherwise. // the stacked BlobDB implementation is used, false otherwise.
bool expose_blob_index_; bool expose_blob_index_;
bool allow_unprepared_value_;
Slice lazy_blob_index_;
bool is_blob_; bool is_blob_;
bool arena_mode_; bool arena_mode_;
const Env::IOActivity io_activity_;
// List of operands for merge operator. // List of operands for merge operator.
MergeContext merge_context_; MergeContext merge_context_;
LocalStatistics local_stats_; LocalStatistics local_stats_;

View File

@ -684,13 +684,14 @@ class DbMemtableKVChecksumTest : public DbKvChecksumTest {
DbMemtableKVChecksumTest() : DbKvChecksumTest() {} DbMemtableKVChecksumTest() : DbKvChecksumTest() {}
protected: protected:
const size_t kValueLenOffset = 12;
// Indices in the memtable entry that we will not corrupt. // Indices in the memtable entry that we will not corrupt.
// For memtable entry format, see comments in MemTable::Add(). // For memtable entry format, see comments in MemTable::Add().
// We do not corrupt key length and value length fields in this test // We do not corrupt key length and value length fields in this test
// case since it causes segfault and ASAN will complain. // case since it causes segfault and ASAN will complain.
// For this test case, key and value are all of length 3, so // For this test case, key and value are all of length 3, so
// key length field is at index 0 and value length field is at index 12. // key length field is at index 0 and value length field is at index 12.
const std::set<size_t> index_not_to_corrupt{0, 12}; const std::set<size_t> index_not_to_corrupt{0, kValueLenOffset};
void SkipNotToCorruptEntry() { void SkipNotToCorruptEntry() {
if (index_not_to_corrupt.find(corrupt_byte_offset_) != if (index_not_to_corrupt.find(corrupt_byte_offset_) !=
@ -737,6 +738,8 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
buf[corrupt_byte_offset_] += corrupt_byte_addend_; buf[corrupt_byte_offset_] += corrupt_byte_addend_;
++corrupt_byte_offset_; ++corrupt_byte_offset_;
}); });
// Corrupt value only so that MultiGet below can find the key.
corrupt_byte_offset_ = kValueLenOffset + 1;
SyncPoint::GetInstance()->EnableProcessing(); SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions(); Options options = CurrentOptions();
options.memtable_protection_bytes_per_key = options.memtable_protection_bytes_per_key =
@ -745,12 +748,17 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
options.merge_operator = MergeOperators::CreateStringAppendOperator(); options.merge_operator = MergeOperators::CreateStringAppendOperator();
} }
std::string key = "key";
SkipNotToCorruptEntry(); SkipNotToCorruptEntry();
while (MoreBytesToCorrupt()) { while (MoreBytesToCorrupt()) {
Reopen(options); Reopen(options);
ASSERT_OK(ExecuteWrite(nullptr)); ASSERT_OK(ExecuteWrite(nullptr));
std::string val; std::string val;
ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption()); ASSERT_TRUE(db_->Get(ReadOptions(), key, &val).IsCorruption());
std::vector<std::string> vals = {val};
std::vector<Status> statuses = db_->MultiGet(
ReadOptions(), {db_->DefaultColumnFamily()}, {key}, &vals, nullptr);
ASSERT_TRUE(statuses[0].IsCorruption());
Destroy(options); Destroy(options);
SkipNotToCorruptEntry(); SkipNotToCorruptEntry();
} }

View File

@ -339,6 +339,91 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
} }
} }
TEST_F(DBMemTableTest, IntegrityChecks) {
// We insert keys key000000, key000001 and key000002 into skiplist at fixed
// height 1 (smallest height). Then we corrupt the second key to aey000001 to
// make it smaller. With `paranoid_memory_checks` set to true, if the
// skip list sees key000000 and then aey000001, then it will report out of
// order keys with corruption status. With `paranoid_memory_checks` set
// to false, read/scan may return wrong results.
for (bool allow_data_in_error : {false, true}) {
Options options = CurrentOptions();
options.allow_data_in_errors = allow_data_in_error;
options.paranoid_memory_checks = true;
DestroyAndReopen(options);
SyncPoint::GetInstance()->SetCallBack(
"InlineSkipList::RandomHeight::height", [](void* h) {
auto height_ptr = static_cast<int*>(h);
*height_ptr = 1;
});
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(Put(Key(0), "val0"));
ASSERT_OK(Put(Key(2), "val2"));
// p will point to the buffer for encoded key000001
char* p = nullptr;
SyncPoint::GetInstance()->SetCallBack(
"MemTable::Add:BeforeReturn:Encoded", [&](void* encoded) {
p = const_cast<char*>(static_cast<Slice*>(encoded)->data());
});
ASSERT_OK(Put(Key(1), "val1"));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
ASSERT_TRUE(p);
// Offset 0 is key size, key bytes start at offset 1.
// "key000001 -> aey000001"
p[1] = 'a';
ReadOptions rops;
std::string val;
Status s = db_->Get(rops, Key(1), &val);
ASSERT_TRUE(s.IsCorruption());
std::string key0 = Slice(Key(0)).ToString(true);
ASSERT_EQ(s.ToString().find(key0) != std::string::npos,
allow_data_in_error);
// Without `paranoid_memory_checks`, NotFound will be returned.
// This would fail an assertion in InlineSkipList::FindGreaterOrEqual().
// If we remove the assertion, this passes.
// ASSERT_TRUE(db_->Get(ReadOptions(), Key(1), &val).IsNotFound());
std::vector<std::string> vals;
std::vector<Status> statuses = db_->MultiGet(
rops, {db_->DefaultColumnFamily()}, {Key(1)}, &vals, nullptr);
ASSERT_TRUE(statuses[0].IsCorruption());
ASSERT_EQ(statuses[0].ToString().find(key0) != std::string::npos,
allow_data_in_error);
std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
ASSERT_OK(iter->status());
iter->Seek(Key(1));
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);
iter->Seek(Key(0));
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
// iterating through skip list at height at 1 should catch out-of-order keys
iter->Next();
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);
ASSERT_FALSE(iter->Valid());
iter->SeekForPrev(Key(2));
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);
// Internally DB Iter will iterate backwards (call Prev()) after
// SeekToLast() to find the correct internal key with the last user key.
// Prev() will do integrity checks and catch corruption.
iter->SeekToLast();
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);
ASSERT_FALSE(iter->Valid());
}
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) { int main(int argc, char** argv) {

View File

@ -56,6 +56,11 @@ class DBOptionsTest : public DBTestBase {
EXPECT_OK(GetStringFromMutableCFOptions( EXPECT_OK(GetStringFromMutableCFOptions(
config_options, MutableCFOptions(options), &options_str)); config_options, MutableCFOptions(options), &options_str));
EXPECT_OK(StringToMap(options_str, &mutable_map)); EXPECT_OK(StringToMap(options_str, &mutable_map));
for (auto& opt : TEST_GetImmutableInMutableCFOptions()) {
// Not yet mutable but migrated to MutableCFOptions in preparation for
// being mutable
mutable_map.erase(opt);
}
return mutable_map; return mutable_map;
} }
@ -231,21 +236,33 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) {
ASSERT_OK(dbfull()->SetOptions( ASSERT_OK(dbfull()->SetOptions(
cfh, {{"table_factory.block_size", "16384"}, cfh, {{"table_factory.block_size", "16384"},
{"table_factory.block_restart_interval", "11"}})); {"table_factory.block_restart_interval", "11"}}));
// Old c_bbto
ASSERT_EQ(c_bbto->block_size, 8192);
ASSERT_EQ(c_bbto->block_restart_interval, 7);
// New c_bbto
c_opts = dbfull()->GetOptions(cfh);
c_bbto = c_opts.table_factory->GetOptions<BlockBasedTableOptions>();
ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 11); ASSERT_EQ(c_bbto->block_restart_interval, 11);
// Now set an option that is not mutable - options should not change // Now set an option that is not mutable - options should not change
ASSERT_NOK( // FIXME: find a way to make this fail again
dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}})); // ASSERT_NOK(
// dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}}));
c_opts = dbfull()->GetOptions(cfh);
ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>());
ASSERT_EQ(c_bbto->no_block_cache, true); ASSERT_EQ(c_bbto->no_block_cache, true);
ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 11); ASSERT_EQ(c_bbto->block_restart_interval, 11);
// Set some that are mutable and some that are not - options should not change // Set some that are mutable and some that are not - options should not change
ASSERT_NOK(dbfull()->SetOptions( // FIXME: find a way to make this fail again
cfh, {{"table_factory.no_block_cache", "false"}, // ASSERT_NOK(dbfull()->SetOptions(
{"table_factory.block_size", "8192"}, // cfh, {{"table_factory.no_block_cache", "false"},
{"table_factory.block_restart_interval", "7"}})); // {"table_factory.block_size", "8192"},
// {"table_factory.block_restart_interval", "7"}}));
c_opts = dbfull()->GetOptions(cfh);
ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>());
ASSERT_EQ(c_bbto->no_block_cache, true); ASSERT_EQ(c_bbto->no_block_cache, true);
ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 11); ASSERT_EQ(c_bbto->block_restart_interval, 11);
@ -256,6 +273,8 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) {
cfh, {{"table_factory.block_size", "8192"}, cfh, {{"table_factory.block_size", "8192"},
{"table_factory.does_not_exist", "true"}, {"table_factory.does_not_exist", "true"},
{"table_factory.block_restart_interval", "7"}})); {"table_factory.block_restart_interval", "7"}}));
c_opts = dbfull()->GetOptions(cfh);
ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>());
ASSERT_EQ(c_bbto->no_block_cache, true); ASSERT_EQ(c_bbto->no_block_cache, true);
ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 11); ASSERT_EQ(c_bbto->block_restart_interval, 11);
@ -271,6 +290,7 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) {
{"table_factory.block_restart_interval", "13"}})); {"table_factory.block_restart_interval", "13"}}));
c_opts = dbfull()->GetOptions(cfh); c_opts = dbfull()->GetOptions(cfh);
ASSERT_EQ(c_opts.blob_file_size, 32768); ASSERT_EQ(c_opts.blob_file_size, 32768);
c_bbto = c_opts.table_factory->GetOptions<BlockBasedTableOptions>();
ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 13); ASSERT_EQ(c_bbto->block_restart_interval, 13);
// Set some on the table and a bad one on the ColumnFamily - options should // Set some on the table and a bad one on the ColumnFamily - options should
@ -279,6 +299,7 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) {
cfh, {{"table_factory.block_size", "1024"}, cfh, {{"table_factory.block_size", "1024"},
{"no_such_option", "32768"}, {"no_such_option", "32768"},
{"table_factory.block_restart_interval", "7"}})); {"table_factory.block_restart_interval", "7"}}));
ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>());
ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 13); ASSERT_EQ(c_bbto->block_restart_interval, 13);
} }

View File

@ -244,7 +244,7 @@ TEST_F(DBSecondaryTest, SimpleInternalCompaction) {
ASSERT_EQ(largest.user_key().ToString(), "foo"); ASSERT_EQ(largest.user_key().ToString(), "foo");
ASSERT_EQ(result.output_level, 1); ASSERT_EQ(result.output_level, 1);
ASSERT_EQ(result.output_path, this->secondary_path_); ASSERT_EQ(result.output_path, this->secondary_path_);
ASSERT_EQ(result.num_output_records, 2); ASSERT_EQ(result.stats.num_output_records, 2);
ASSERT_GT(result.bytes_written, 0); ASSERT_GT(result.bytes_written, 0);
ASSERT_OK(result.status); ASSERT_OK(result.status);
} }

View File

@ -383,12 +383,16 @@ TEST_F(DBSSTTest, DBWithSstFileManager) {
ASSERT_EQ(files_moved, 0); ASSERT_EQ(files_moved, 0);
Close(); Close();
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
Reopen(options); Reopen(options);
ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
ASSERT_EQ(sfm->GetTotalSize(), total_files_size); ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
// Verify that we track all the files again after the DB is closed and opened // Verify that we track all the files again after the DB is closed and opened
Close(); Close();
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
sst_file_manager.reset(NewSstFileManager(env_)); sst_file_manager.reset(NewSstFileManager(env_));
options.sst_file_manager = sst_file_manager; options.sst_file_manager = sst_file_manager;
sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get()); sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
@ -439,6 +443,11 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
int64_t untracked_files = 0;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::OnUntrackFile",
[&](void* /*arg*/) { ++untracked_files; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions(); Options options = CurrentOptions();
@ -485,6 +494,10 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
} }
ASSERT_EQ(sfm->GetTotalSize(), total_files_size); ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
Close(); Close();
ASSERT_EQ(untracked_files, files_in_db.size());
untracked_files = 0;
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
Reopen(options); Reopen(options);
ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
@ -492,6 +505,10 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
// Verify that we track all the files again after the DB is closed and opened. // Verify that we track all the files again after the DB is closed and opened.
Close(); Close();
ASSERT_EQ(untracked_files, files_in_db.size());
untracked_files = 0;
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
sst_file_manager.reset(NewSstFileManager(env_)); sst_file_manager.reset(NewSstFileManager(env_));
options.sst_file_manager = sst_file_manager; options.sst_file_manager = sst_file_manager;
@ -507,6 +524,27 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
ASSERT_EQ(files_deleted, 0); ASSERT_EQ(files_deleted, 0);
ASSERT_EQ(files_scheduled_to_delete, 0); ASSERT_EQ(files_scheduled_to_delete, 0);
Close(); Close();
ASSERT_EQ(untracked_files, files_in_db.size());
untracked_files = 0;
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_OK(DestroyDB(dbname_, options));
ASSERT_EQ(files_deleted, blob_files.size()); ASSERT_EQ(files_deleted, blob_files.size());
ASSERT_EQ(files_scheduled_to_delete, blob_files.size()); ASSERT_EQ(files_scheduled_to_delete, blob_files.size());
@ -649,6 +687,26 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {
} }
Close(); Close();
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_OK(DestroyDB(dbname_, options));
sfm->WaitForEmptyTrash(); sfm->WaitForEmptyTrash();
ASSERT_EQ(files_deleted, 5); ASSERT_EQ(files_deleted, 5);
@ -883,8 +941,9 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
// Create 4 files in L0 // Create 4 files in L0
for (char v = 'a'; v <= 'd'; v++) { for (char v = 'a'; v <= 'd'; v++) {
if (v == 'c') { if (v == 'c') {
// Maximize the change that the last log file will be preserved in trash // Maximize the chance that the last log file will be preserved in trash
// before restarting the DB. // before restarting the DB. (Enable slow deletion but at a very slow
// deletion rate)
// We have to set this on the 2nd to last file for it to delay deletion // We have to set this on the 2nd to last file for it to delay deletion
// on the last file. (Quirk of DeleteScheduler::BackgroundEmptyTrash()) // on the last file. (Quirk of DeleteScheduler::BackgroundEmptyTrash())
options.sst_file_manager->SetDeleteRateBytesPerSecond(1); options.sst_file_manager->SetDeleteRateBytesPerSecond(1);
@ -1902,6 +1961,24 @@ TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
ASSERT_EQ(files_deleted, 1); ASSERT_EQ(files_deleted, 1);
Close(); Close();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options)); ASSERT_OK(DestroyDB(dbname_, options));
ASSERT_EQ(files_scheduled_to_delete, 4); ASSERT_EQ(files_scheduled_to_delete, 4);

View File

@ -1826,21 +1826,30 @@ TEST_F(DBTest, GetApproximateMemTableStats) {
uint64_t count; uint64_t count;
uint64_t size; uint64_t size;
// Because Random::GetTLSInstance() seed is reset in DBTestBase,
// this test is deterministic.
std::string start = Key(50); std::string start = Key(50);
std::string end = Key(60); std::string end = Key(60);
Range r(start, end); Range r(start, end);
db_->GetApproximateMemTableStats(r, &count, &size); db_->GetApproximateMemTableStats(r, &count, &size);
ASSERT_GT(count, 0); // When actual count is <= 10, it returns that as the minimum
ASSERT_LE(count, N); EXPECT_EQ(count, 10);
ASSERT_GT(size, 6000); EXPECT_EQ(size, 10440);
ASSERT_LT(size, 204800);
start = Key(20);
end = Key(100);
r = Range(start, end);
db_->GetApproximateMemTableStats(r, &count, &size);
EXPECT_EQ(count, 72);
EXPECT_EQ(size, 75168);
start = Key(500); start = Key(500);
end = Key(600); end = Key(600);
r = Range(start, end); r = Range(start, end);
db_->GetApproximateMemTableStats(r, &count, &size); db_->GetApproximateMemTableStats(r, &count, &size);
ASSERT_EQ(count, 0); EXPECT_EQ(count, 0);
ASSERT_EQ(size, 0); EXPECT_EQ(size, 0);
ASSERT_OK(Flush()); ASSERT_OK(Flush());
@ -1848,8 +1857,8 @@ TEST_F(DBTest, GetApproximateMemTableStats) {
end = Key(60); end = Key(60);
r = Range(start, end); r = Range(start, end);
db_->GetApproximateMemTableStats(r, &count, &size); db_->GetApproximateMemTableStats(r, &count, &size);
ASSERT_EQ(count, 0); EXPECT_EQ(count, 0);
ASSERT_EQ(size, 0); EXPECT_EQ(size, 0);
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024))); ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024)));
@ -1857,10 +1866,11 @@ TEST_F(DBTest, GetApproximateMemTableStats) {
start = Key(100); start = Key(100);
end = Key(1020); end = Key(1020);
// Actually 20 keys in the range ^^
r = Range(start, end); r = Range(start, end);
db_->GetApproximateMemTableStats(r, &count, &size); db_->GetApproximateMemTableStats(r, &count, &size);
ASSERT_GT(count, 20); EXPECT_EQ(count, 20);
ASSERT_GT(size, 6000); EXPECT_EQ(size, 20880);
} }
TEST_F(DBTest, ApproximateSizes) { TEST_F(DBTest, ApproximateSizes) {
@ -5169,10 +5179,14 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
options.max_bytes_for_level_multiplier = 4; options.max_bytes_for_level_multiplier = 4;
options.max_background_compactions = 1; options.max_background_compactions = 1;
options.num_levels = 5; options.num_levels = 5;
options.statistics = CreateDBStatistics();
options.compression_per_level.resize(3); options.compression_per_level.resize(3);
// No compression for L0
options.compression_per_level[0] = kNoCompression; options.compression_per_level[0] = kNoCompression;
// No compression for the Ln whre L0 is compacted to
options.compression_per_level[1] = kNoCompression; options.compression_per_level[1] = kNoCompression;
// Snpapy compression for Ln+1
options.compression_per_level[2] = kSnappyCompression; options.compression_per_level[2] = kSnappyCompression;
OnFileDeletionListener* listener = new OnFileDeletionListener(); OnFileDeletionListener* listener = new OnFileDeletionListener();
@ -5181,7 +5195,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
DestroyAndReopen(options); DestroyAndReopen(options);
// Insert more than 80K. L4 should be base level. Neither L0 nor L4 should // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
// be compressed, so total data size should be more than 80K. // be compressed, so there shouldn't be any compression.
for (int i = 0; i < 20; i++) { for (int i = 0; i < 20; i++) {
ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
} }
@ -5191,10 +5205,17 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_EQ(NumTableFilesAtLevel(1), 0);
ASSERT_EQ(NumTableFilesAtLevel(2), 0); ASSERT_EQ(NumTableFilesAtLevel(2), 0);
ASSERT_EQ(NumTableFilesAtLevel(3), 0); ASSERT_EQ(NumTableFilesAtLevel(3), 0);
// Assuming each files' metadata is at least 50 bytes/ ASSERT_TRUE(NumTableFilesAtLevel(0) > 0 || NumTableFilesAtLevel(4) > 0);
ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4);
// Insert 400KB. Some data will be compressed // Verify there was no compression
auto num_block_compressed =
options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
ASSERT_EQ(num_block_compressed, 0);
// Insert 400KB and there will be some files end up in L3. According to the
// above compression settings for each level, there will be some compression.
ASSERT_OK(options.statistics->Reset());
ASSERT_EQ(num_block_compressed, 0);
for (int i = 21; i < 120; i++) { for (int i = 21; i < 120; i++) {
ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000))); ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
} }
@ -5202,9 +5223,14 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_EQ(NumTableFilesAtLevel(1), 0);
ASSERT_EQ(NumTableFilesAtLevel(2), 0); ASSERT_EQ(NumTableFilesAtLevel(2), 0);
ASSERT_GE(NumTableFilesAtLevel(3), 1);
ASSERT_GE(NumTableFilesAtLevel(4), 1);
// Verify there was compression
num_block_compressed =
options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
ASSERT_GT(num_block_compressed, 0);
ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4),
120U * 4000U + 50U * 24);
// Make sure data in files in L3 is not compacted by removing all files // Make sure data in files in L3 is not compacted by removing all files
// in L4 and calculate number of rows // in L4 and calculate number of rows
ASSERT_OK(dbfull()->SetOptions({ ASSERT_OK(dbfull()->SetOptions({
@ -5224,6 +5250,12 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
num_keys++; num_keys++;
} }
ASSERT_OK(iter->status()); ASSERT_OK(iter->status());
ASSERT_EQ(NumTableFilesAtLevel(1), 0);
ASSERT_EQ(NumTableFilesAtLevel(2), 0);
ASSERT_GE(NumTableFilesAtLevel(3), 1);
ASSERT_EQ(NumTableFilesAtLevel(4), 0);
ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U); ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
} }

View File

@ -10,6 +10,7 @@
#include <atomic> #include <atomic>
#include <cstdlib> #include <cstdlib>
#include <functional> #include <functional>
#include <iostream>
#include <memory> #include <memory>
#include "db/db_test_util.h" #include "db/db_test_util.h"
@ -26,6 +27,7 @@
#include "rocksdb/utilities/replayer.h" #include "rocksdb/utilities/replayer.h"
#include "rocksdb/wal_filter.h" #include "rocksdb/wal_filter.h"
#include "test_util/testutil.h" #include "test_util/testutil.h"
#include "util/defer.h"
#include "util/random.h" #include "util/random.h"
#include "utilities/fault_injection_env.h" #include "utilities/fault_injection_env.h"
@ -34,18 +36,6 @@ namespace ROCKSDB_NAMESPACE {
class DBTest2 : public DBTestBase { class DBTest2 : public DBTestBase {
public: public:
DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {} DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
std::vector<FileMetaData*> GetLevelFileMetadatas(int level, int cf = 0) {
VersionSet* const versions = dbfull()->GetVersionSet();
assert(versions);
ColumnFamilyData* const cfd =
versions->GetColumnFamilySet()->GetColumnFamily(cf);
assert(cfd);
Version* const current = cfd->current();
assert(current);
VersionStorageInfo* const storage_info = current->storage_info();
assert(storage_info);
return storage_info->LevelFiles(level);
}
}; };
TEST_F(DBTest2, OpenForReadOnly) { TEST_F(DBTest2, OpenForReadOnly) {
@ -5595,32 +5585,45 @@ TEST_F(DBTest2, PrefixBloomFilteredOut) {
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
bbto.whole_key_filtering = false; bbto.whole_key_filtering = false;
options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.table_factory.reset(NewBlockBasedTableFactory(bbto));
DestroyAndReopen(options);
// Construct two L1 files with keys: // This test is also the primary test for prefix_seek_opt_in_only
// f1:[aaa1 ccc1] f2:[ddd0] for (bool opt_in : {false, true}) {
ASSERT_OK(Put("aaa1", "")); options.prefix_seek_opt_in_only = opt_in;
ASSERT_OK(Put("ccc1", "")); DestroyAndReopen(options);
ASSERT_OK(Flush());
ASSERT_OK(Put("ddd0", ""));
ASSERT_OK(Flush());
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
Iterator* iter = db_->NewIterator(ReadOptions()); // Construct two L1 files with keys:
ASSERT_OK(iter->status()); // f1:[aaa1 ccc1] f2:[ddd0]
ASSERT_OK(Put("aaa1", ""));
ASSERT_OK(Put("ccc1", ""));
ASSERT_OK(Flush());
ASSERT_OK(Put("ddd0", ""));
ASSERT_OK(Flush());
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// Bloom filter is filterd out by f1. ReadOptions ropts;
// This is just one of several valid position following the contract. for (bool same : {false, true}) {
// Postioning to ccc1 or ddd0 is also valid. This is just to validate ropts.prefix_same_as_start = same;
// the behavior of the current implementation. If underlying implementation std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
// changes, the test might fail here. ASSERT_OK(iter->status());
iter->Seek("bbb1");
ASSERT_OK(iter->status());
ASSERT_FALSE(iter->Valid());
delete iter; iter->Seek("bbb1");
ASSERT_OK(iter->status());
if (opt_in && !same) {
// Unbounded total order seek
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(iter->key(), "ccc1");
} else {
// Bloom filter is filterd out by f1. When same == false, this is just
// one valid position following the contract. Postioning to ccc1 or ddd0
// is also valid. This is just to validate the behavior of the current
// implementation. If underlying implementation changes, the test might
// fail here.
ASSERT_FALSE(iter->Valid());
}
}
}
} }
TEST_F(DBTest2, RowCacheSnapshot) { TEST_F(DBTest2, RowCacheSnapshot) {
@ -5985,6 +5988,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) {
// create a DB with block prefix index // create a DB with block prefix index
BlockBasedTableOptions table_options; BlockBasedTableOptions table_options;
Options options = CurrentOptions(); Options options = CurrentOptions();
options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
// Sometimes filter is checked based on upper bound. Assert counters // Sometimes filter is checked based on upper bound. Assert counters
// for that case. Otherwise, only check data correctness. // for that case. Otherwise, only check data correctness.
@ -6544,6 +6548,235 @@ TEST_P(RenameCurrentTest, Compaction) {
ASSERT_EQ("d_value", Get("d")); ASSERT_EQ("d_value", Get("d"));
} }
TEST_F(DBTest2, VariousFileTemperatures) {
constexpr size_t kNumberFileTypes = static_cast<size_t>(kBlobFile) + 1U;
struct MyTestFS : public FileTemperatureTestFS {
explicit MyTestFS(const std::shared_ptr<FileSystem>& fs)
: FileTemperatureTestFS(fs) {
Reset();
}
IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
std::unique_ptr<FSWritableFile>* result,
IODebugContext* dbg) override {
IOStatus ios =
FileTemperatureTestFS::NewWritableFile(fname, opts, result, dbg);
if (ios.ok()) {
uint64_t number;
FileType type;
if (ParseFileName(GetFileName(fname), &number, "LOG", &type)) {
if (type == kTableFile) {
// Not checked here
} else if (type == kWalFile) {
if (opts.temperature != expected_wal_temperature) {
std::cerr << "Attempt to open " << fname << " with temperature "
<< temperature_to_string[opts.temperature]
<< " rather than "
<< temperature_to_string[expected_wal_temperature]
<< std::endl;
assert(false);
}
} else if (type == kDescriptorFile) {
if (opts.temperature != expected_manifest_temperature) {
std::cerr << "Attempt to open " << fname << " with temperature "
<< temperature_to_string[opts.temperature]
<< " rather than "
<< temperature_to_string[expected_wal_temperature]
<< std::endl;
assert(false);
}
} else if (opts.temperature != expected_other_metadata_temperature) {
std::cerr << "Attempt to open " << fname << " with temperature "
<< temperature_to_string[opts.temperature]
<< " rather than "
<< temperature_to_string[expected_wal_temperature]
<< std::endl;
assert(false);
}
UpdateCount(type, 1);
}
}
return ios;
}
IOStatus RenameFile(const std::string& src, const std::string& dst,
const IOOptions& options,
IODebugContext* dbg) override {
IOStatus ios = FileTemperatureTestFS::RenameFile(src, dst, options, dbg);
if (ios.ok()) {
uint64_t number;
FileType src_type;
FileType dst_type;
assert(ParseFileName(GetFileName(src), &number, "LOG", &src_type));
assert(ParseFileName(GetFileName(dst), &number, "LOG", &dst_type));
UpdateCount(src_type, -1);
UpdateCount(dst_type, 1);
}
return ios;
}
void UpdateCount(FileType type, int delta) {
size_t i = static_cast<size_t>(type);
assert(i < kNumberFileTypes);
counts[i].FetchAddRelaxed(delta);
}
std::map<FileType, size_t> PopCounts() {
std::map<FileType, size_t> ret;
for (size_t i = 0; i < kNumberFileTypes; ++i) {
int c = counts[i].ExchangeRelaxed(0);
if (c > 0) {
ret[static_cast<FileType>(i)] = c;
}
}
return ret;
}
FileOptions OptimizeForLogWrite(
const FileOptions& file_options,
const DBOptions& /*db_options*/) const override {
FileOptions opts = file_options;
if (optimize_wal_temperature != Temperature::kUnknown) {
opts.temperature = optimize_wal_temperature;
}
return opts;
}
FileOptions OptimizeForManifestWrite(
const FileOptions& file_options) const override {
FileOptions opts = file_options;
if (optimize_manifest_temperature != Temperature::kUnknown) {
opts.temperature = optimize_manifest_temperature;
}
return opts;
}
void Reset() {
optimize_manifest_temperature = Temperature::kUnknown;
optimize_wal_temperature = Temperature::kUnknown;
expected_manifest_temperature = Temperature::kUnknown;
expected_other_metadata_temperature = Temperature::kUnknown;
expected_wal_temperature = Temperature::kUnknown;
for (auto& c : counts) {
c.StoreRelaxed(0);
}
}
Temperature optimize_manifest_temperature;
Temperature optimize_wal_temperature;
Temperature expected_manifest_temperature;
Temperature expected_other_metadata_temperature;
Temperature expected_wal_temperature;
std::array<RelaxedAtomic<int>, kNumberFileTypes> counts;
};
// We don't have enough non-unknown temps to confidently distinguish that
// a specific setting caused a specific outcome, in a single run. This is a
// reasonable work-around without blowing up test time. Only returns
// non-unknown temperatures.
auto RandomTemp = [] {
static std::vector<Temperature> temps = {
Temperature::kHot, Temperature::kWarm, Temperature::kCold};
return temps[Random::GetTLSInstance()->Uniform(
static_cast<int>(temps.size()))];
};
auto test_fs = std::make_shared<MyTestFS>(env_->GetFileSystem());
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
for (bool use_optimize : {false, true}) {
std::cerr << "use_optimize: " << std::to_string(use_optimize) << std::endl;
for (bool use_temp_options : {false, true}) {
std::cerr << "use_temp_options: " << std::to_string(use_temp_options)
<< std::endl;
Options options = CurrentOptions();
// Currently require for last level temperature
options.compaction_style = kCompactionStyleUniversal;
options.env = env.get();
test_fs->Reset();
if (use_optimize) {
test_fs->optimize_manifest_temperature = RandomTemp();
test_fs->expected_manifest_temperature =
test_fs->optimize_manifest_temperature;
test_fs->optimize_wal_temperature = RandomTemp();
test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature;
}
if (use_temp_options) {
options.metadata_write_temperature = RandomTemp();
test_fs->expected_manifest_temperature =
options.metadata_write_temperature;
test_fs->expected_other_metadata_temperature =
options.metadata_write_temperature;
options.wal_write_temperature = RandomTemp();
test_fs->expected_wal_temperature = options.wal_write_temperature;
options.last_level_temperature = RandomTemp();
options.default_write_temperature = RandomTemp();
}
DestroyAndReopen(options);
Defer closer([&] { Close(); });
using FTC = std::map<FileType, size_t>;
// Files on DB startup
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1},
{kDescriptorFile, 2},
{kCurrentFile, 2},
{kIdentityFile, 1},
{kOptionsFile, 1}}));
// Temperature count map
using TCM = std::map<Temperature, size_t>;
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), TCM({}));
ASSERT_OK(Put("foo", "1"));
ASSERT_OK(Put("bar", "1"));
ASSERT_OK(Flush());
ASSERT_OK(Put("foo", "2"));
ASSERT_OK(Put("bar", "2"));
ASSERT_OK(Flush());
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(),
TCM({{options.default_write_temperature, 2}}));
ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(),
TCM({{options.last_level_temperature, 1}}));
ASSERT_OK(Put("foo", "3"));
ASSERT_OK(Put("bar", "3"));
ASSERT_OK(Flush());
// Just in memtable/WAL
ASSERT_OK(Put("dog", "3"));
{
TCM expected;
expected[options.default_write_temperature] += 1;
expected[options.last_level_temperature] += 1;
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), expected);
}
// New files during operation
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 3}, {kTableFile, 4}}));
Reopen(options);
// New files during re-open/recovery
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1},
{kTableFile, 1},
{kDescriptorFile, 1},
{kCurrentFile, 1},
{kOptionsFile, 1}}));
Destroy(options);
}
}
}
TEST_F(DBTest2, LastLevelTemperature) { TEST_F(DBTest2, LastLevelTemperature) {
class TestListener : public EventListener { class TestListener : public EventListener {
public: public:

View File

@ -366,6 +366,11 @@ Options DBTestBase::GetOptions(
table_options.block_cache = NewLRUCache(/* too small */ 1); table_options.block_cache = NewLRUCache(/* too small */ 1);
} }
// Test anticipated new default as much as reasonably possible (and remove
// this code when obsolete)
assert(!table_options.decouple_partitioned_filters);
table_options.decouple_partitioned_filters = true;
bool can_allow_mmap = IsMemoryMappedAccessSupported(); bool can_allow_mmap = IsMemoryMappedAccessSupported();
switch (option_config) { switch (option_config) {
case kHashSkipList: case kHashSkipList:
@ -1258,6 +1263,20 @@ Status DBTestBase::CountFiles(size_t* count) {
return Status::OK(); return Status::OK();
} }
std::vector<FileMetaData*> DBTestBase::GetLevelFileMetadatas(int level,
int cf) {
VersionSet* const versions = dbfull()->GetVersionSet();
assert(versions);
ColumnFamilyData* const cfd =
versions->GetColumnFamilySet()->GetColumnFamily(cf);
assert(cfd);
Version* const current = cfd->current();
assert(current);
VersionStorageInfo* const storage_info = current->storage_info();
assert(storage_info);
return storage_info->LevelFiles(level);
}
Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf, Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf,
uint64_t* size) { uint64_t* size) {
Range r(start, limit); Range r(start, limit);

View File

@ -831,6 +831,15 @@ class FileTemperatureTestFS : public FileSystemWrapper {
return count; return count;
} }
std::map<Temperature, size_t> CountCurrentSstFilesByTemp() {
MutexLock lock(&mu_);
std::map<Temperature, size_t> ret;
for (const auto& e : current_sst_file_temperatures_) {
ret[e.second]++;
}
return ret;
}
void OverrideSstFileTemperature(uint64_t number, Temperature temp) { void OverrideSstFileTemperature(uint64_t number, Temperature temp) {
MutexLock lock(&mu_); MutexLock lock(&mu_);
current_sst_file_temperatures_[number] = temp; current_sst_file_temperatures_[number] = temp;
@ -842,7 +851,7 @@ class FileTemperatureTestFS : public FileSystemWrapper {
requested_sst_file_temperatures_; requested_sst_file_temperatures_;
std::map<uint64_t, Temperature> current_sst_file_temperatures_; std::map<uint64_t, Temperature> current_sst_file_temperatures_;
std::string GetFileName(const std::string& fname) { static std::string GetFileName(const std::string& fname) {
auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1); auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
// workaround only for Windows that the file path could contain both Windows // workaround only for Windows that the file path could contain both Windows
// FilePathSeparator and '/' // FilePathSeparator and '/'
@ -1264,6 +1273,8 @@ class DBTestBase : public testing::Test {
Status CountFiles(size_t* count); Status CountFiles(size_t* count);
std::vector<FileMetaData*> GetLevelFileMetadatas(int level, int cf = 0);
Status Size(const Slice& start, const Slice& limit, uint64_t* size) { Status Size(const Slice& start, const Slice& limit, uint64_t* size) {
return Size(start, limit, 0, size); return Size(start, limit, 0, size);
} }

View File

@ -213,7 +213,6 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
options.num_levels = num_levels_; options.num_levels = num_levels_;
options.write_buffer_size = 105 << 10; // 105KB options.write_buffer_size = 105 << 10; // 105KB
options.arena_block_size = 4 << 10; options.arena_block_size = 4 << 10;
options.target_file_size_base = 32 << 10; // 32KB
// trigger compaction if there are >= 4 files // trigger compaction if there are >= 4 files
options.level0_file_num_compaction_trigger = 4; options.level0_file_num_compaction_trigger = 4;
KeepFilterFactory* filter = new KeepFilterFactory(true); KeepFilterFactory* filter = new KeepFilterFactory(true);

View File

@ -1472,6 +1472,126 @@ TEST_F(DBWALTest, SyncMultipleLogs) {
ASSERT_OK(dbfull()->SyncWAL()); ASSERT_OK(dbfull()->SyncWAL());
} }
TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
Options options = CurrentOptions();
options.max_write_buffer_number = 5;
options.track_and_verify_wals_in_manifest = true;
options.max_bgerror_resume_count = 0; // manual resume
options.recycle_log_file_num = 3;
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
// Disable truncating recycled WALs to new size in posix env
// (approximating a crash)
SyncPoint::GetInstance()->SetCallBack(
"PosixWritableFile::Close",
[](void* arg) { *(static_cast<size_t*>(arg)) = 0; });
SyncPoint::GetInstance()->EnableProcessing();
// Re-open with desired options
DestroyAndReopen(options);
Defer closer([this]() { Close(); });
// Ensure WAL recycling wasn't sanitized away
ASSERT_EQ(db_->GetOptions().recycle_log_file_num,
options.recycle_log_file_num);
// Prepare external files for later ingestion
std::string sst_files_dir = dbname_ + "/sst_files/";
ASSERT_OK(DestroyDir(env_, sst_files_dir));
ASSERT_OK(env_->CreateDir(sst_files_dir));
std::string external_file1 = sst_files_dir + "file1.sst";
{
SstFileWriter sst_file_writer(EnvOptions(), options);
ASSERT_OK(sst_file_writer.Open(external_file1));
ASSERT_OK(sst_file_writer.Put("external1", "ex1"));
ExternalSstFileInfo file_info;
ASSERT_OK(sst_file_writer.Finish(&file_info));
}
std::string external_file2 = sst_files_dir + "file2.sst";
{
SstFileWriter sst_file_writer(EnvOptions(), options);
ASSERT_OK(sst_file_writer.Open(external_file2));
ASSERT_OK(sst_file_writer.Put("external2", "ex2"));
ExternalSstFileInfo file_info;
ASSERT_OK(sst_file_writer.Finish(&file_info));
}
// Populate some WALs to be recycled such that there will be extra data
// from an old incarnation of the WAL on recovery
ASSERT_OK(db_->PauseBackgroundWork());
ASSERT_OK(Put("ignore1", Random::GetTLSInstance()->RandomString(500)));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(Put("ignore2", Random::GetTLSInstance()->RandomString(500)));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(db_->ContinueBackgroundWork());
ASSERT_OK(Flush());
ASSERT_OK(Put("ignore3", Random::GetTLSInstance()->RandomString(500)));
ASSERT_OK(Flush());
// Verify expected log files (still there for recycling)
std::vector<FileAttributes> files;
int log_count = 0;
ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files));
for (const auto& f : files) {
if (EndsWith(f.name, ".log")) {
EXPECT_GT(f.size_bytes, 500);
++log_count;
}
}
EXPECT_EQ(log_count, 3);
// (Re-used recipe) Generate two inactive WALs and one active WAL, with a
// gap in sequence numbers to interfere with recovery
ASSERT_OK(db_->PauseBackgroundWork());
ASSERT_OK(Put("key1", "val1"));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(Put("key2", "val2"));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
// Need a gap in sequence numbers, so e.g. ingest external file
// with an open snapshot
{
ManagedSnapshot snapshot(db_);
ASSERT_OK(
db_->IngestExternalFile({external_file1}, IngestExternalFileOptions()));
}
ASSERT_OK(Put("key3", "val3"));
ASSERT_OK(db_->SyncWAL());
// Need an SST file that is logically after that WAL, so that dropping WAL
// data is not a valid point in time.
{
ManagedSnapshot snapshot(db_);
ASSERT_OK(
db_->IngestExternalFile({external_file2}, IngestExternalFileOptions()));
}
// Approximate a crash, with respect to recycled WAL data extending past
// the end of the current WAL data (see SyncPoint callback above)
Close();
// Verify recycled log files haven't been truncated
files.clear();
log_count = 0;
ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files));
for (const auto& f : files) {
if (EndsWith(f.name, ".log")) {
EXPECT_GT(f.size_bytes, 500);
++log_count;
}
}
EXPECT_EQ(log_count, 3);
// Verify no data loss after reopen.
Reopen(options);
EXPECT_EQ("val1", Get("key1"));
EXPECT_EQ("val2", Get("key2")); // Passes because of adjacent seqnos
EXPECT_EQ("ex1", Get("external1"));
EXPECT_EQ("val3", Get("key3")); // <- ONLY FAILURE! (Not a point in time)
EXPECT_EQ("ex2", Get("external2"));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(DBWALTest, SyncWalPartialFailure) { TEST_F(DBWALTest, SyncWalPartialFailure) {
class MyTestFileSystem : public FileSystemWrapper { class MyTestFileSystem : public FileSystemWrapper {
public: public:
@ -1532,7 +1652,7 @@ TEST_F(DBWALTest, SyncWalPartialFailure) {
// * one inactive WAL, not synced, and // * one inactive WAL, not synced, and
// * one active WAL, not synced // * one active WAL, not synced
// with a single thread, to exercise as much logic as we reasonably can. // with a single thread, to exercise as much logic as we reasonably can.
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->PauseBackgroundWork()); ASSERT_OK(db_->PauseBackgroundWork());
ASSERT_OK(Put("key1", "val1")); ASSERT_OK(Put("key1", "val1"));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable()); ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(db_->SyncWAL()); ASSERT_OK(db_->SyncWAL());
@ -2811,6 +2931,29 @@ TEST_F(DBWALTest, RecoveryFlushSwitchWALOnEmptyMemtable) {
ASSERT_EQ("new_v", Get("k")); ASSERT_EQ("new_v", Get("k"));
Destroy(options); Destroy(options);
} }
TEST_F(DBWALTest, WALWriteErrorNoRecovery) {
Options options = CurrentOptions();
auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
options.env = fault_fs_env.get();
options.manual_wal_flush = true;
DestroyAndReopen(options);
fault_fs->SetThreadLocalErrorContext(
FaultInjectionIOType::kWrite, 7 /* seed*/, 1 /* one_in */,
true /* retryable */, false /* has_data_loss*/);
fault_fs->EnableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);
ASSERT_OK(Put("k", "v"));
Status s;
s = db_->FlushWAL(false);
ASSERT_TRUE(s.IsIOError());
s = dbfull()->TEST_GetBGError();
ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
ASSERT_FALSE(dbfull()->TEST_IsRecoveryInProgress());
fault_fs->DisableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);
Destroy(options);
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) { int main(int argc, char** argv) {

View File

@ -172,6 +172,70 @@ TEST_F(DBBasicTestWithTimestamp, MixedCfs) {
Close(); Close();
} }
TEST_F(DBBasicTestWithTimestamp, MultiGetMultipleCfs) {
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
options.avoid_flush_during_shutdown = true;
options.comparator = &test_cmp;
DestroyAndReopen(options);
Options options1 = CurrentOptions();
options1.env = env_;
options1.comparator = &test_cmp;
ColumnFamilyHandle* handle = nullptr;
Status s = db_->CreateColumnFamily(options1, "data", &handle);
ASSERT_OK(s);
std::string ts = Timestamp(1, 0);
WriteBatch wb(0, 0, 0, kTimestampSize);
ASSERT_OK(wb.Put("a", "value"));
ASSERT_OK(wb.Put(handle, "a", "value"));
const auto ts_sz_func = [kTimestampSize](uint32_t /*cf_id*/) {
return kTimestampSize;
};
ASSERT_OK(wb.UpdateTimestamps(ts, ts_sz_func));
ASSERT_OK(db_->Write(WriteOptions(), &wb));
int num_keys = 2;
std::vector<Slice> keys;
std::vector<std::string> expected_values;
for (int i = 0; i < num_keys; i++) {
keys.push_back("a");
expected_values.push_back("value");
}
std::vector<ColumnFamilyHandle*> handles;
handles.push_back(db_->DefaultColumnFamily());
handles.push_back(handle);
{
Slice read_ts_slice(ts);
ReadOptions read_opts;
read_opts.timestamp = &read_ts_slice;
std::vector<PinnableSlice> values;
values.resize(num_keys);
std::vector<Status> statuses;
statuses.resize(num_keys);
std::vector<std::string> timestamps;
timestamps.resize(num_keys);
db_->MultiGet(read_opts, num_keys, handles.data(), keys.data(),
values.data(), timestamps.data(), statuses.data());
for (int i = 0; i < num_keys; i++) {
ASSERT_OK(statuses[i]);
ASSERT_EQ(expected_values[i], values[i].ToString());
ASSERT_EQ(ts, timestamps[i]);
}
}
delete handle;
Close();
}
TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) { TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.env = env_; options.env = env_;
@ -768,6 +832,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) {
TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) { TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
options.env = env_; options.env = env_;
options.create_if_missing = true; options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(3)); options.prefix_extractor.reset(NewFixedPrefixTransform(3));
@ -945,6 +1010,7 @@ TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) {
TestComparator test_cmp(kTimestampSize); TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp; options.comparator = &test_cmp;
options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
DestroyAndReopen(options); DestroyAndReopen(options);
const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2), const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),

View File

@ -272,11 +272,23 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
void IterKey::EnlargeBuffer(size_t key_size) { void IterKey::EnlargeBuffer(size_t key_size) {
// If size is smaller than buffer size, continue using current buffer, // If size is smaller than buffer size, continue using current buffer,
// or the static allocated one, as default // or the inline one, as default
assert(key_size > buf_size_); assert(key_size > buf_size_);
// Need to enlarge the buffer. // Need to enlarge the buffer.
ResetBuffer(); ResetBuffer();
buf_ = new char[key_size]; buf_ = new char[key_size];
buf_size_ = key_size; buf_size_ = key_size;
} }
void IterKey::EnlargeSecondaryBufferIfNeeded(size_t key_size) {
// If size is smaller than buffer size, continue using current buffer,
// or the inline one, as default
if (key_size <= secondary_buf_size_) {
return;
}
// Need to enlarge the secondary buffer.
ResetSecondaryBuffer();
secondary_buf_ = new char[key_size];
secondary_buf_size_ = key_size;
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -10,6 +10,7 @@
#pragma once #pragma once
#include <stdio.h> #include <stdio.h>
#include <array>
#include <memory> #include <memory>
#include <optional> #include <optional>
#include <string> #include <string>
@ -330,17 +331,16 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
// output : <user_provided_key> // output : <user_provided_key>
inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
size_t ts_sz) { size_t ts_sz) {
Slice ret = internal_key; assert(internal_key.size() >= kNumInternalBytes + ts_sz);
ret.remove_suffix(kNumInternalBytes + ts_sz); return Slice(internal_key.data(),
return ret; internal_key.size() - (kNumInternalBytes + ts_sz));
} }
// input [user key]: <user_provided_key | ts> // input [user key]: <user_provided_key | ts>
// output: <user_provided_key> // output: <user_provided_key>
inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
Slice ret = user_key; assert(user_key.size() >= ts_sz);
ret.remove_suffix(ts_sz); return Slice(user_key.data(), user_key.size() - ts_sz);
return ret;
} }
// input [user key]: <user_provided_key | ts> // input [user key]: <user_provided_key | ts>
@ -563,18 +563,28 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
// allocation for smaller keys. // allocation for smaller keys.
// 3. It tracks user key or internal key, and allow conversion between them. // 3. It tracks user key or internal key, and allow conversion between them.
class IterKey { class IterKey {
static constexpr size_t kInlineBufferSize = 39;
// This is only used by user-defined timestamps in MemTable only feature,
// which only supports uint64_t timestamps.
static constexpr char kTsMin[] = "\x00\x00\x00\x00\x00\x00\x00\x00";
public: public:
IterKey() IterKey()
: buf_(space_), : buf_(space_),
key_(buf_), key_(buf_),
key_size_(0), key_size_(0),
buf_size_(sizeof(space_)), buf_size_(kInlineBufferSize),
is_user_key_(true) {} is_user_key_(true),
secondary_buf_(space_for_secondary_buf_),
secondary_buf_size_(kInlineBufferSize) {}
// No copying allowed // No copying allowed
IterKey(const IterKey&) = delete; IterKey(const IterKey&) = delete;
void operator=(const IterKey&) = delete; void operator=(const IterKey&) = delete;
~IterKey() { ResetBuffer(); } ~IterKey() {
ResetBuffer();
ResetSecondaryBuffer();
}
// The bool will be picked up by the next calls to SetKey // The bool will be picked up by the next calls to SetKey
void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; } void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
@ -642,13 +652,15 @@ class IterKey {
const char* non_shared_data, const char* non_shared_data,
const size_t non_shared_len, const size_t non_shared_len,
const size_t ts_sz) { const size_t ts_sz) {
std::string kTsMin(ts_sz, static_cast<unsigned char>(0)); // This function is only used by the UDT in memtable feature, which only
std::string key_with_ts; // support built in comparators with uint64 timestamps.
std::vector<Slice> key_parts_with_ts; assert(ts_sz == sizeof(uint64_t));
size_t next_key_slice_index = 0;
if (IsUserKey()) { if (IsUserKey()) {
key_parts_with_ts = {Slice(key_, shared_len), key_slices_[next_key_slice_index++] = Slice(key_, shared_len);
Slice(non_shared_data, non_shared_len), key_slices_[next_key_slice_index++] =
Slice(kTsMin)}; Slice(non_shared_data, non_shared_len);
key_slices_[next_key_slice_index++] = Slice(kTsMin, ts_sz);
} else { } else {
assert(shared_len + non_shared_len >= kNumInternalBytes); assert(shared_len + non_shared_len >= kNumInternalBytes);
// Invaraint: shared_user_key_len + shared_internal_bytes_len = shared_len // Invaraint: shared_user_key_len + shared_internal_bytes_len = shared_len
@ -665,30 +677,46 @@ class IterKey {
// One Slice among the three Slices will get split into two Slices, plus // One Slice among the three Slices will get split into two Slices, plus
// a timestamp slice. // a timestamp slice.
key_parts_with_ts.reserve(5);
bool ts_added = false; bool ts_added = false;
// Add slice parts and find the right location to add the min timestamp. // Add slice parts and find the right location to add the min timestamp.
MaybeAddKeyPartsWithTimestamp( MaybeAddKeyPartsWithTimestamp(
key_, shared_user_key_len, key_, shared_user_key_len,
shared_internal_bytes_len + non_shared_len < kNumInternalBytes, shared_internal_bytes_len + non_shared_len < kNumInternalBytes,
shared_len + non_shared_len - kNumInternalBytes, kTsMin, shared_len + non_shared_len - kNumInternalBytes, ts_sz,
key_parts_with_ts, &ts_added); &next_key_slice_index, &ts_added);
MaybeAddKeyPartsWithTimestamp( MaybeAddKeyPartsWithTimestamp(
key_ + user_key_len, shared_internal_bytes_len, key_ + user_key_len, shared_internal_bytes_len,
non_shared_len < kNumInternalBytes, non_shared_len < kNumInternalBytes,
shared_internal_bytes_len + non_shared_len - kNumInternalBytes, shared_internal_bytes_len + non_shared_len - kNumInternalBytes, ts_sz,
kTsMin, key_parts_with_ts, &ts_added); &next_key_slice_index, &ts_added);
MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len, MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len,
non_shared_len >= kNumInternalBytes, non_shared_len >= kNumInternalBytes,
non_shared_len - kNumInternalBytes, kTsMin, non_shared_len - kNumInternalBytes, ts_sz,
key_parts_with_ts, &ts_added); &next_key_slice_index, &ts_added);
assert(ts_added); assert(ts_added);
} }
SetKeyImpl(next_key_slice_index,
/* total_bytes= */ shared_len + non_shared_len + ts_sz);
}
Slice new_key(SliceParts(&key_parts_with_ts.front(), Slice SetKeyWithPaddedMinTimestamp(const Slice& key, size_t ts_sz) {
static_cast<int>(key_parts_with_ts.size())), // This function is only used by the UDT in memtable feature, which only
&key_with_ts); // support built in comparators with uint64 timestamps.
SetKey(new_key); assert(ts_sz == sizeof(uint64_t));
size_t num_key_slices = 0;
if (is_user_key_) {
key_slices_[0] = key;
key_slices_[1] = Slice(kTsMin, ts_sz);
num_key_slices = 2;
} else {
assert(key.size() >= kNumInternalBytes);
size_t user_key_size = key.size() - kNumInternalBytes;
key_slices_[0] = Slice(key.data(), user_key_size);
key_slices_[1] = Slice(kTsMin, ts_sz);
key_slices_[2] = Slice(key.data() + user_key_size, kNumInternalBytes);
num_key_slices = 3;
}
return SetKeyImpl(num_key_slices, key.size() + ts_sz);
} }
Slice SetKey(const Slice& key, bool copy = true) { Slice SetKey(const Slice& key, bool copy = true) {
@ -719,15 +747,6 @@ class IterKey {
return Slice(key_, key_n); return Slice(key_, key_n);
} }
// Copy the key into IterKey own buf_
void OwnKey() {
assert(IsKeyPinned() == true);
Reserve(key_size_);
memcpy(buf_, key_, key_size_);
key_ = buf_;
}
// Update the sequence number in the internal key. Guarantees not to // Update the sequence number in the internal key. Guarantees not to
// invalidate slices to the key (and the user key). // invalidate slices to the key (and the user key).
void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) { void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
@ -739,10 +758,15 @@ class IterKey {
ts->size()); ts->size());
} }
uint64_t newval = (seq << 8) | t; uint64_t newval = (seq << 8) | t;
EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval); if (key_ == buf_) {
EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
} else {
assert(key_ == secondary_buf_);
EncodeFixed64(&secondary_buf_[key_size_ - kNumInternalBytes], newval);
}
} }
bool IsKeyPinned() const { return (key_ != buf_); } bool IsKeyPinned() const { return key_ != buf_ && key_ != secondary_buf_; }
// If `ts` is provided, user_key should not contain timestamp, // If `ts` is provided, user_key should not contain timestamp,
// and `ts` is appended after user_key. // and `ts` is appended after user_key.
@ -807,8 +831,24 @@ class IterKey {
const char* key_; const char* key_;
size_t key_size_; size_t key_size_;
size_t buf_size_; size_t buf_size_;
char space_[39]; // Avoid allocation for short keys char space_[kInlineBufferSize]; // Avoid allocation for short keys
bool is_user_key_; bool is_user_key_;
// Below variables are only used by user-defined timestamps in MemTable only
// feature for iterating keys in an index block or a data block.
//
// We will alternate between buf_ and secondary_buf_ to hold the key. key_
// will be modified in accordance to point to the right one. This is to avoid
// an extra copy when we need to copy some shared bytes from previous key
// (delta encoding), and we need to pad a min timestamp at the right location.
char space_for_secondary_buf_[kInlineBufferSize]; // Avoid allocation for
// short keys
char* secondary_buf_;
size_t secondary_buf_size_;
// Use to track the pieces that together make the whole key. We then copy
// these pieces in order either into buf_ or secondary_buf_ depending on where
// the previous key is held.
std::array<Slice, 5> key_slices_;
// End of variables used by user-defined timestamps in MemTable only feature.
Slice SetKeyImpl(const Slice& key, bool copy) { Slice SetKeyImpl(const Slice& key, bool copy) {
size_t size = key.size(); size_t size = key.size();
@ -825,18 +865,64 @@ class IterKey {
return Slice(key_, key_size_); return Slice(key_, key_size_);
} }
Slice SetKeyImpl(size_t num_key_slices, size_t total_bytes) {
assert(num_key_slices <= 5);
char* buf_start = nullptr;
if (key_ == buf_) {
// If the previous key is in buf_, we copy key_slices_ in order into
// secondary_buf_.
EnlargeSecondaryBufferIfNeeded(total_bytes);
buf_start = secondary_buf_;
key_ = secondary_buf_;
} else {
// Copy key_slices_ in order into buf_.
EnlargeBufferIfNeeded(total_bytes);
buf_start = buf_;
key_ = buf_;
}
#ifndef NDEBUG
size_t actual_total_bytes = 0;
#endif // NDEBUG
for (size_t i = 0; i < num_key_slices; i++) {
size_t key_slice_size = key_slices_[i].size();
memcpy(buf_start, key_slices_[i].data(), key_slice_size);
buf_start += key_slice_size;
#ifndef NDEBUG
actual_total_bytes += key_slice_size;
#endif // NDEBUG
}
#ifndef NDEBUG
assert(actual_total_bytes == total_bytes);
#endif // NDEBUG
key_size_ = total_bytes;
return Slice(key_, key_size_);
}
void ResetBuffer() { void ResetBuffer() {
if (key_ == buf_) {
key_size_ = 0;
}
if (buf_ != space_) { if (buf_ != space_) {
delete[] buf_; delete[] buf_;
buf_ = space_; buf_ = space_;
} }
buf_size_ = sizeof(space_); buf_size_ = kInlineBufferSize;
key_size_ = 0; }
void ResetSecondaryBuffer() {
if (key_ == secondary_buf_) {
key_size_ = 0;
}
if (secondary_buf_ != space_for_secondary_buf_) {
delete[] secondary_buf_;
secondary_buf_ = space_for_secondary_buf_;
}
secondary_buf_size_ = kInlineBufferSize;
} }
// Enlarge the buffer size if needed based on key_size. // Enlarge the buffer size if needed based on key_size.
// By default, static allocated buffer is used. Once there is a key // By default, inline buffer is used. Once there is a key
// larger than the static allocated buffer, another buffer is dynamically // larger than the inline buffer, another buffer is dynamically
// allocated, until a larger key buffer is requested. In that case, we // allocated, until a larger key buffer is requested. In that case, we
// reallocate buffer and delete the old one. // reallocate buffer and delete the old one.
void EnlargeBufferIfNeeded(size_t key_size) { void EnlargeBufferIfNeeded(size_t key_size) {
@ -847,23 +933,27 @@ class IterKey {
} }
} }
void EnlargeSecondaryBufferIfNeeded(size_t key_size);
void EnlargeBuffer(size_t key_size); void EnlargeBuffer(size_t key_size);
void MaybeAddKeyPartsWithTimestamp(const char* slice_data, void MaybeAddKeyPartsWithTimestamp(const char* slice_data,
const size_t slice_sz, bool add_timestamp, const size_t slice_sz, bool add_timestamp,
const size_t left_sz, const size_t left_sz, const size_t ts_sz,
const std::string& min_timestamp, size_t* next_key_slice_idx,
std::vector<Slice>& key_parts,
bool* ts_added) { bool* ts_added) {
assert(next_key_slice_idx);
if (add_timestamp && !*ts_added) { if (add_timestamp && !*ts_added) {
assert(slice_sz >= left_sz); assert(slice_sz >= left_sz);
key_parts.emplace_back(slice_data, left_sz); key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, left_sz);
key_parts.emplace_back(min_timestamp); key_slices_[(*next_key_slice_idx)++] = Slice(kTsMin, ts_sz);
key_parts.emplace_back(slice_data + left_sz, slice_sz - left_sz); key_slices_[(*next_key_slice_idx)++] =
Slice(slice_data + left_sz, slice_sz - left_sz);
*ts_added = true; *ts_added = true;
} else { } else {
key_parts.emplace_back(slice_data, slice_sz); key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, slice_sz);
} }
assert(*next_key_slice_idx <= 5);
} }
}; };
@ -937,22 +1027,13 @@ struct RangeTombstone {
// User-defined timestamp is enabled, `sk` and `ek` should be user key // User-defined timestamp is enabled, `sk` and `ek` should be user key
// with timestamp, `ts` will replace the timestamps in `sk` and // with timestamp, `ts` will replace the timestamps in `sk` and
// `ek`. // `ek`.
// When `logical_strip_timestamp` is true, the timestamps in `sk` and `ek` RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts) : seq_(sn) {
// will be replaced with min timestamp.
RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts,
bool logical_strip_timestamp)
: seq_(sn) {
const size_t ts_sz = ts.size(); const size_t ts_sz = ts.size();
assert(ts_sz > 0); assert(ts_sz > 0);
pinned_start_key_.reserve(sk.size()); pinned_start_key_.reserve(sk.size());
pinned_end_key_.reserve(ek.size()); pinned_end_key_.reserve(ek.size());
if (logical_strip_timestamp) { AppendUserKeyWithDifferentTimestamp(&pinned_start_key_, sk, ts);
AppendUserKeyWithMinTimestamp(&pinned_start_key_, sk, ts_sz); AppendUserKeyWithDifferentTimestamp(&pinned_end_key_, ek, ts);
AppendUserKeyWithMinTimestamp(&pinned_end_key_, ek, ts_sz);
} else {
AppendUserKeyWithDifferentTimestamp(&pinned_start_key_, sk, ts);
AppendUserKeyWithDifferentTimestamp(&pinned_end_key_, ek, ts);
}
start_key_ = pinned_start_key_; start_key_ = pinned_start_key_;
end_key_ = pinned_end_key_; end_key_ = pinned_end_key_;
ts_ = Slice(pinned_start_key_.data() + sk.size() - ts_sz, ts_sz); ts_ = Slice(pinned_start_key_.data() + sk.size() - ts_sz, ts_sz);

View File

@ -381,7 +381,7 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
// BackgroundErrorReason reason) will be called to handle other error cases // BackgroundErrorReason reason) will be called to handle other error cases
// such as delegating to SstFileManager to handle no space error. // such as delegating to SstFileManager to handle no space error.
void ErrorHandler::SetBGError(const Status& bg_status, void ErrorHandler::SetBGError(const Status& bg_status,
BackgroundErrorReason reason) { BackgroundErrorReason reason, bool wal_related) {
db_mutex_->AssertHeld(); db_mutex_->AssertHeld();
Status tmp_status = bg_status; Status tmp_status = bg_status;
IOStatus bg_io_err = status_to_io_status(std::move(tmp_status)); IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
@ -389,8 +389,8 @@ void ErrorHandler::SetBGError(const Status& bg_status,
if (bg_io_err.ok()) { if (bg_io_err.ok()) {
return; return;
} }
ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s", ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s, reason %d",
bg_io_err.ToString().c_str()); bg_io_err.ToString().c_str(), static_cast<int>(reason));
RecordStats({ERROR_HANDLER_BG_ERROR_COUNT, ERROR_HANDLER_BG_IO_ERROR_COUNT}, RecordStats({ERROR_HANDLER_BG_ERROR_COUNT, ERROR_HANDLER_BG_IO_ERROR_COUNT},
{} /* int_histograms */); {} /* int_histograms */);
@ -412,6 +412,31 @@ void ErrorHandler::SetBGError(const Status& bg_status,
recover_context_ = context; recover_context_ = context;
return; return;
} }
if (wal_related) {
assert(reason == BackgroundErrorReason::kWriteCallback ||
reason == BackgroundErrorReason::kMemTable ||
reason == BackgroundErrorReason::kFlush);
}
if (db_options_.manual_wal_flush && wal_related && bg_io_err.IsIOError()) {
// With manual_wal_flush, a WAL write failure can drop buffered WAL writes.
// Memtables and WAL then become inconsistent. A successful memtable flush
// on one CF can cause CFs to be inconsistent upon restart. Before we fix
// the bug in auto recovery from WAL write failures that can flush one CF
// at a time, we set the error severity to fatal to disallow auto recovery.
// TODO: remove parameter `wal_related` once we can automatically recover
// from WAL write failures.
bool auto_recovery = false;
Status bg_err(new_bg_io_err, Status::Severity::kFatalError);
CheckAndSetRecoveryAndBGError(bg_err);
ROCKS_LOG_WARN(db_options_.info_log,
"ErrorHandler: A potentially WAL error happened, set "
"background IO error as fatal error\n");
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
&bg_err, db_mutex_, &auto_recovery);
recover_context_ = context;
return;
}
if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace && if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
(bg_io_err.GetScope() == IOStatus::IOErrorScope::kIOErrorScopeFile || (bg_io_err.GetScope() == IOStatus::IOErrorScope::kIOErrorScopeFile ||
bg_io_err.GetRetryable())) { bg_io_err.GetRetryable())) {

Some files were not shown because too many files have changed in this diff Show More