Merge branch 'main' into fixPessimisticTransactionReplication

commit ff0c57a1f7
Author: KuDeSnik33ra, 2024-11-09 17:45:10 +03:00 (committed by GitHub)
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in the database)
324 changed files with 17775 additions and 6189 deletions

View File

@ -1,13 +1,13 @@
name: facebook/rocksdb/benchmark-linux
on: workflow_dispatch
jobs:
# FIXME: when this job is fixed, it should be given a cron schedule like
permissions: {}
# FIXME: Disabled temporarily
# schedule:
# - cron: 0 * * * *
# workflow_dispatch:
# - cron: 7 */2 * * * # At minute 7 past every 2nd hour
jobs:
benchmark-linux:
if: ${{ github.repository_owner == 'facebook' }}
runs-on: ubuntu-latest
runs-on: ubuntu-latest # FIXME: change this back to self-hosted when ready
steps:
- uses: actions/checkout@v4.1.0
- uses: "./.github/actions/build-for-benchmarks"

View File

@ -1,5 +1,6 @@
name: facebook/rocksdb/nightly
on: workflow_dispatch
permissions: {}
jobs:
# These jobs would be in nightly but are failing or otherwise broken for
# some reason.

View File

@ -3,6 +3,7 @@ on:
schedule:
- cron: 0 9 * * *
workflow_dispatch:
permissions: {}
jobs:
build-format-compatible:
if: ${{ github.repository_owner == 'facebook' }}
@ -59,12 +60,15 @@ jobs:
container:
image: zjay437/rocksdb:0.6
options: --shm-size=16gb
env:
CC: clang-13
CXX: clang++-13
steps:
- uses: actions/checkout@v4.1.0
- uses: "./.github/actions/pre-steps"
- uses: "./.github/actions/setup-folly"
- uses: "./.github/actions/build-folly"
- run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
- run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
- uses: "./.github/actions/post-steps"
build-linux-valgrind:
if: ${{ github.repository_owner == 'facebook' }}
@ -76,7 +80,7 @@ jobs:
steps:
- uses: actions/checkout@v4.1.0
- uses: "./.github/actions/pre-steps"
- run: PORTABLE=1 make V=1 -j32 valgrind_test
- run: make V=1 -j32 valgrind_test
- uses: "./.github/actions/post-steps"
build-windows-vs2022-avx2:
if: ${{ github.repository_owner == 'facebook' }}

View File

@ -1,5 +1,6 @@
name: facebook/rocksdb/pr-jobs-candidate
on: workflow_dispatch
permissions: {}
jobs:
# These jobs would be in pr-jobs but are failing or otherwise broken for
# some reason.

View File

@ -1,5 +1,6 @@
name: facebook/rocksdb/pr-jobs
on: [push, pull_request]
permissions: {}
jobs:
# NOTE: multiple workflows would be recommended, but the current GHA UI in
# PRs doesn't make it clear when there's an overall error with a workflow,

View File

@ -32,7 +32,7 @@
# 3. cmake ..
# 4. make -j
cmake_minimum_required(VERSION 3.10)
cmake_minimum_required(VERSION 3.12)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/")
include(ReadVersion)

View File

@ -1,6 +1,70 @@
# Rocksdb Change Log
> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
## 9.8.0 (10/25/2024)
### New Features
* All non-`block_cache` options in `BlockBasedTableOptions` are now mutable with `DB::SetOptions()`. See also Bug Fixes below.
* When using iterators with BlobDB, it is now possible to load large values on an on-demand basis, i.e. only if they are actually needed by the application. This can save I/O in use cases where the values associated with certain keys are not needed. For more details, see the new read option `allow_unprepared_value` and the iterator API `PrepareValue` (a usage sketch follows this list).
* Add a new file ingestion option `IngestExternalFileOptions::fill_cache` to support not adding blocks from ingested files into block cache during file ingestion.
* The option `allow_unprepared_value` is now also supported for multi-column-family iterators (i.e. `CoalescingIterator` and `AttributeGroupIterator`).
* When a file with just one range deletion (standalone range deletion file) is ingested via bulk loading, it will be marked for compaction. During compaction, files of this type can be used to directly filter out input files that are not protected by any snapshot and are completely deleted by the standalone range deletion file.
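For illustration, here is a minimal sketch of the on-demand value loading described in the `allow_unprepared_value` entry above. It assumes an already-open `DB*` whose column family has `enable_blob_files=true`; the `"interesting_"` key-prefix filter is purely illustrative and not part of the feature.

```cpp
#include <memory>

#include "rocksdb/db.h"

// Sketch only: scan with deferred (lazy) value loading.
void ScanWithLazyValues(rocksdb::DB* db) {
  rocksdb::ReadOptions read_options;
  read_options.allow_unprepared_value = true;  // defer loading blob values
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_options));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // For blob references, value() stays empty until PrepareValue() is called.
    if (it->key().starts_with("interesting_")) {
      if (!it->PrepareValue()) {
        break;  // the read failed; the error is reported via it->status()
      }
      // it->value() now contains the actual (possibly large) value.
    }
  }
  // As usual, check it->status() after the scan.
}
```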
### Behavior Changes
* During file ingestion, level assignment for overlapping files is done in multiple batches, so that they can potentially be assigned to lower levels instead of always landing on L0.
* The OPTIONS file to be loaded by a remote worker is now preserved so that it does not get purged by the primary host, using a technique similar to the one that protects new SST files from getting purged: `min_options_file_numbers_` is tracked the same way `pending_outputs_` is tracked.
* Trim `readahead_size` during scans so that data blocks containing keys that are not in the same prefix as the seek key in `Seek()` are not prefetched when `ReadOptions::auto_readahead_size=true` (default value) and `ReadOptions::prefix_same_as_start=true`.
* Assigning levels for external files is done in the same way for universal compaction and leveled compaction. The old behavior tends to assign files to L0 while the new behavior will assign the files to the lowest level possible.
### Bug Fixes
* Fix a longstanding race condition in SetOptions for `block_based_table_factory` options. The fix has some subtle behavior changes because of copying and replacing the TableFactory on a change with SetOptions, including requiring an Iterator::Refresh() for an existing Iterator to use the latest options (a usage sketch follows this list).
* Fix undercounting of allocated memory in the compressed secondary cache due to looking at the compressed block size rather than the actual memory allocated, which could be larger due to internal fragmentation.
* `GetApproximateMemTableStats()` could return disastrously bad estimates 5-25% of the time. The function has been re-engineered to return much better estimates with similar CPU cost.
* Skip insertion of compressed blocks in the secondary cache if the lowest_used_cache_tier DB option is kVolatileTier.
* Fix an issue in level compaction where a small CF with small compaction debt can cause the DB to allow parallel compactions. (#13054)
* Several DB option settings could be lost through `GetOptionsFromString()`, possibly elsewhere as well. Affected options, now fixed: `background_close_inactive_wals`, `write_dbid_to_manifest`, `write_identity_file`, `prefix_seek_opt_in_only`.
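As a companion to the `block_based_table_factory` items above, here is a minimal sketch of changing a `BlockBasedTableOptions` field at runtime and refreshing an existing iterator so it uses the replaced table factory. It assumes an open `DB*` backed by a block-based table factory; the `block_size` value is arbitrary.

```cpp
#include <unordered_map>

#include "rocksdb/db.h"

// Sketch only: mutate a BlockBasedTableOptions field via SetOptions(), then
// refresh an existing iterator so it picks up the new table options.
rocksdb::Status BumpBlockSize(rocksdb::DB* db, rocksdb::Iterator* iter) {
  rocksdb::Status s =
      db->SetOptions({{"block_based_table_factory", "{block_size=16384;}"}});
  if (s.ok() && iter != nullptr) {
    // Existing iterators keep the old table factory until refreshed.
    s = iter->Refresh();
  }
  return s;
}
```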
## 9.7.0 (09/20/2024)
### New Features
* Make Cache a customizable class that can be instantiated by the object registry.
* Add new option `prefix_seek_opt_in_only` that makes iterators generally safer when you might set a `prefix_extractor`. When `prefix_seek_opt_in_only=true`, which is expected to be the future default, prefix seek is only used when `prefix_same_as_start` or `auto_prefix_mode` are set. Also, `prefix_same_as_start` and `auto_prefix_mode` now allow prefix filtering even with `total_order_seek=true` (a usage sketch follows this list).
* Add a new table property "rocksdb.key.largest.seqno" which records the largest sequence number of all keys in the file. It is verified to be zero during SST file ingestion.
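A minimal sketch of the opt-in prefix seek behavior described above, assuming a fresh DB opened at `path` with a fixed 4-byte prefix extractor; the `"user"` seek target is purely illustrative.

```cpp
#include <memory>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/slice_transform.h"

// Sketch only: with prefix_seek_opt_in_only=true, prefix-based filtering is
// used only when the read explicitly opts in (e.g. prefix_same_as_start).
void OptInPrefixSeek(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  options.prefix_seek_opt_in_only = true;  // expected future default

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
  if (!s.ok()) {
    return;
  }

  rocksdb::ReadOptions read_options;
  read_options.prefix_same_as_start = true;  // opt in to prefix seek
  {
    std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_options));
    for (it->Seek("user"); it->Valid(); it->Next()) {
      // Only keys sharing the 4-byte prefix "user" are visited.
    }
  }
  delete db;
}
```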
### Behavior Changes
* Changed the semantics of the BlobDB configuration option `blob_garbage_collection_force_threshold` to define a threshold for the overall garbage ratio of all blob files currently eligible for garbage collection (according to `blob_garbage_collection_age_cutoff`). This can provide better control over space amplification at the cost of slightly higher write amplification.
* Set `write_dbid_to_manifest=true` by default. This means the DB ID will now be preserved through backups, checkpoints, etc. by default. Also add a `write_identity_file` option which can be set to false for anticipated future behavior (a usage sketch follows this list).
* In FIFO compaction, compactions for changing file temperature (configured by option `file_temperature_age_thresholds`) will compact one file at a time, instead of merging multiple eligible files together (#13018).
* Support ingesting DB-generated files using hard links, i.e. `IngestExternalFileOptions::move_files/link_files` and `IngestExternalFileOptions::allow_db_generated_files`.
* Add a new file ingestion option `IngestExternalFileOptions::link_files` to hard link input files and preserve original files links after ingestion.
* DB::Close now untracks files in SstFileManager, making available any space used
by them. Prior to this change they would be orphaned until the DB is re-opened.
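A minimal sketch of the DB ID persistence options mentioned above, showing the anticipated future configuration (DB ID kept in the MANIFEST only, no IDENTITY file); both fields are plain `DBOptions` booleans per the entry above.

```cpp
#include "rocksdb/options.h"

// Sketch only: keep the DB ID in the MANIFEST and skip the IDENTITY file.
rocksdb::Options MakeIdentityInManifestOptions() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.write_dbid_to_manifest = true;  // default as of 9.7.0
  options.write_identity_file = false;    // anticipated future behavior
  return options;
}
```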
### Bug Fixes
* Fix a bug in CompactRange() where result files may not be compacted in any future compaction. This can only happen when users configure CompactRangeOptions::change_level to true and the change level step of manual compaction fails (#13009).
* Fix handling of dynamic change of `prefix_extractor` with memtable prefix filter. Previously, prefix seek could mix different prefix interpretations between memtable and SST files. Now the latest `prefix_extractor` at the time of iterator creation or refresh is respected.
* Fix a bug with manual_wal_flush and auto error recovery from WAL failure that may cause CFs to be inconsistent (#12995). The fix sets potential WAL write failures as fatal errors when manual_wal_flush is true, and disables auto error recovery from these errors.
## 9.6.0 (08/19/2024)
### New Features
* *Best efforts recovery supports recovering to incomplete Version with a clean seqno cut that presents a valid point in time view from the user's perspective, if versioning history doesn't include atomic flush.
* New option `BlockBasedTableOptions::decouple_partitioned_filters` should improve efficiency in serving read queries because filter and index partitions can consistently target the configured `metadata_block_size`. This option is currently opt-in.
* Introduce a new mutable CF option `paranoid_memory_checks`. It enables additional validation on data integrity during reads/scanning. Currently, the skip list based memtable will validate key ordering during lookups and scans (a usage sketch follows this list).
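A minimal sketch that enables the two opt-in features above; `decouple_partitioned_filters` is a `BlockBasedTableOptions` field and `paranoid_memory_checks` is a mutable column family option, both off by default per the entries above. The Bloom filter configuration is just context for partitioned filters.

```cpp
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Sketch only: opt in to decoupled partitioned filters and paranoid
// memory checks.
rocksdb::Options MakeOptInOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10));
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
  table_options.partition_filters = true;              // partitioned filters
  table_options.decouple_partitioned_filters = true;   // new 9.6.0 opt-in

  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  options.paranoid_memory_checks = true;  // extra ordering validation on reads
  return options;
}
```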
### Public API Changes
* Add ticker stats to count file read retries due to checksum mismatch
* Adds optional installation callback function for remote compaction
### Behavior Changes
* There may be less intra-L0 compaction triggered by total L0 size being too small. We now use compensated file size (tombstones are assigned some value size) when calculating L0 size and reduce the threshold for L0 size limit. This is to avoid accumulating too much data/tombstones in L0.
### Bug Fixes
* *Make DestroyDB support slow deletion when it's configured in `SstFileManager`. The slow deletion is subject to the configured `rate_bytes_per_sec`, but not subject to the `max_trash_db_ratio`.
* Fixed a bug where we set unprep_seqs_ even when WriteImpl() fails. This was caught by stress test write fault injection in WriteImpl(). This may have incorrectly caused iterator creation failure for unvalidated writes or returned a wrong result for WriteUnpreparedTxn::GetUnpreparedSequenceNumbers().
* Fixed a bug where a successful write right after error recovery for the last failed write finishes could cause duplicate WAL entries.
* Fixed a data race involving the background error status in `unordered_write` mode.
* *Fix a bug where file snapshot functions like backup, checkpoint may attempt to copy a non-existing manifest file. #12882
* Fix a bug where per kv checksum corruption may be ignored in MultiGet().
* Fix a race condition in pessimistic transactions that could allow multiple transactions with the same name to be registered simultaneously, resulting in a crash or other unpredictable behavior.
## 9.5.0 (07/19/2024)
### Public API Changes
* Introduced new C API function rocksdb_writebatch_iterate_cf for column family-aware iteration over the contents of a WriteBatch

View File

@ -630,6 +630,11 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
# Not yet supported: --show-leak-kinds=definite,possible,reachable --errors-for-leak-kinds=definite,possible,reachable
# Work around valgrind hanging on systems with limited internet access
ifneq ($(shell which git 2>/dev/null && git config --get https.proxy),)
export DEBUGINFOD_URLS=
endif
TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST)
BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES))
CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES))
@ -1164,16 +1169,16 @@ ubsan_crash_test_with_best_efforts_recovery: clean
$(MAKE) clean
full_valgrind_test:
ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check
ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check
full_valgrind_test_some:
ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some
ROCKSDB_FULL_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check_some
valgrind_test:
ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check
ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check
valgrind_test_some:
ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check_some
ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 PORTABLE=1 $(MAKE) valgrind_check_some
valgrind_check: $(TESTS)
$(MAKE) DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" gen_parallel_tests
@ -2484,7 +2489,7 @@ checkout_folly:
fi
@# Pin to a particular version for public CI, so that PR authors don't
@# need to worry about folly breaking our integration. Update periodically
cd third-party/folly && git reset --hard c48fdd205c1c291651749d532b8055fe822bba25
cd third-party/folly && git reset --hard 03041f014b6e6ebb6119ffae8b7a37308f52e913
@# NOTE: this hack is required for clang in some cases
perl -pi -e 's/int rv = syscall/int rv = (int)syscall/' third-party/folly/folly/detail/Futex.cpp
@# NOTE: this hack is required for gcc in some cases

TARGETS (16 changes)
View File

@ -362,9 +362,9 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task",
"//folly/synchronization:distributed_mutex",
], headers=None, link_whole=False, extra_test_libs=False)
], headers=glob(["**/*.h"]), link_whole=False, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[], deps=[":rocksdb_lib"], headers=None, link_whole=True, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[], deps=[":rocksdb_lib"], headers=[], link_whole=True, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
"db/db_test_util.cc",
@ -378,7 +378,7 @@ cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
"tools/trace_analyzer_tool.cc",
"utilities/agg_merge/test_agg_merge.cc",
"utilities/cassandra/test_utils.cc",
], deps=[":rocksdb_lib"], headers=None, link_whole=False, extra_test_libs=True)
], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=True)
cpp_library_wrapper(name="rocksdb_tools_lib", srcs=[
"test_util/testutil.cc",
@ -386,9 +386,9 @@ cpp_library_wrapper(name="rocksdb_tools_lib", srcs=[
"tools/db_bench_tool.cc",
"tools/simulated_hybrid_file_system.cc",
"tools/trace_analyzer_tool.cc",
], deps=[":rocksdb_lib"], headers=None, link_whole=False, extra_test_libs=False)
], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_cache_bench_tools_lib", srcs=["cache/cache_bench_tool.cc"], deps=[":rocksdb_lib"], headers=None, link_whole=False, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_cache_bench_tools_lib", srcs=["cache/cache_bench_tool.cc"], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=False)
rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
"db_stress_tool/batched_ops_stress.cc",
@ -410,13 +410,15 @@ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
"test_util/testutil.cc",
"tools/block_cache_analyzer/block_cache_trace_analyzer.cc",
"tools/trace_analyzer_tool.cc",
], headers=None)
], headers=[])
cpp_binary_wrapper(name="ldb", srcs=["tools/ldb.cc"], deps=[":rocksdb_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
cpp_binary_wrapper(name="db_stress", srcs=["db_stress_tool/db_stress.cc"], deps=[":rocksdb_stress_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
cpp_binary_wrapper(name="db_bench", srcs=["tools/db_bench.cc"], deps=[":rocksdb_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
cpp_binary_wrapper(name="cache_bench", srcs=["cache/cache_bench.cc"], deps=[":rocksdb_cache_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
cpp_binary_wrapper(name="ribbon_bench", srcs=["microbench/ribbon_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True)
@ -5024,7 +5026,7 @@ cpp_unittest_wrapper(name="dynamic_bloom_test",
extra_compiler_flags=[])
cpp_library_wrapper(name="env_basic_test_lib", srcs=["env/env_basic_test.cc"], deps=[":rocksdb_test_lib"], headers=None, link_whole=False, extra_test_libs=True)
cpp_library_wrapper(name="env_basic_test_lib", srcs=["env/env_basic_test.cc"], deps=[":rocksdb_test_lib"], headers=[], link_whole=False, extra_test_libs=True)
cpp_unittest_wrapper(name="env_basic_test",
srcs=["env/env_basic_test.cc"],

View File

@ -1,6 +1,5 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals
try:
from builtins import str
@ -11,7 +10,7 @@ import json
import os
import sys
from targets_builder import TARGETSBuilder
from targets_builder import TARGETSBuilder, LiteralValue
from util import ColorString
@ -132,7 +131,7 @@ def generate_targets(repo_path, deps_map):
if len(sys.argv) >= 2:
# Heuristically quote and canonicalize whitespace for inclusion
# in how the file was generated.
extra_argv = " '{0}'".format(" ".join(sys.argv[1].split()))
extra_argv = " '{}'".format(" ".join(sys.argv[1].split()))
TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path, extra_argv)
@ -150,6 +149,7 @@ def generate_targets(repo_path, deps_map):
"//folly/experimental/coro:task",
"//folly/synchronization:distributed_mutex",
],
headers=LiteralValue("glob([\"**/*.h\"])")
)
# rocksdb_whole_archive_lib
TARGETS.add_library(
@ -158,7 +158,6 @@ def generate_targets(repo_path, deps_map):
deps=[
":rocksdb_lib",
],
headers=None,
extra_external_deps="",
link_whole=True,
)
@ -201,6 +200,10 @@ def generate_targets(repo_path, deps_map):
TARGETS.add_binary(
"db_stress", ["db_stress_tool/db_stress.cc"], [":rocksdb_stress_lib"]
)
# db_bench binary
TARGETS.add_binary(
"db_bench", ["tools/db_bench.cc"], [":rocksdb_tools_lib"]
)
# cache_bench binary
TARGETS.add_binary(
"cache_bench", ["cache/cache_bench.cc"], [":rocksdb_cache_bench_tools_lib"]
@ -209,7 +212,7 @@ def generate_targets(repo_path, deps_map):
for src in src_mk.get("MICROBENCH_SOURCES", []):
name = src.rsplit("/", 1)[1].split(".")[0] if "/" in src else src.split(".")[0]
TARGETS.add_binary(name, [src], [], extra_bench_libs=True)
print("Extra dependencies:\n{0}".format(json.dumps(deps_map)))
print(f"Extra dependencies:\n{json.dumps(deps_map)}")
# Dictionary test executable name -> relative source file path
test_source_map = {}

View File

@ -1,5 +1,4 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals
try:
from builtins import object, str
@ -9,17 +8,28 @@ import pprint
import targets_cfg
class LiteralValue:
def __init__(self, value):
self.value = value
def __str__(self):
return str(self.value)
def smart_quote_value(val):
if isinstance(val, LiteralValue):
return str(val)
return '"%s"' % val
def pretty_list(lst, indent=8):
if lst is None or len(lst) == 0:
return ""
if len(lst) == 1:
return '"%s"' % lst[0]
return smart_quote_value(lst[0])
separator = '",\n%s"' % (" " * indent)
res = separator.join(sorted(lst))
res = "\n" + (" " * indent) + '"' + res + '",\n' + (" " * (indent - 4))
separator = ',\n%s' % (" " * indent)
res = separator.join(sorted(map(smart_quote_value, lst)))
res = "\n" + (" " * indent) + res + ',\n' + (" " * (indent - 4))
return res
@ -48,7 +58,12 @@ class TARGETSBuilder:
extra_test_libs=False,
):
if headers is not None:
headers = "[" + pretty_list(headers) + "]"
if isinstance(headers, LiteralValue):
headers = str(headers)
else:
headers = "[" + pretty_list(headers) + "]"
else:
headers = "[]"
with open(self.path, "ab") as targets_file:
targets_file.write(
targets_cfg.library_template.format(
@ -65,8 +80,7 @@ class TARGETSBuilder:
self.total_lib = self.total_lib + 1
def add_rocksdb_library(self, name, srcs, headers=None, external_dependencies=None):
if headers is not None:
headers = "[" + pretty_list(headers) + "]"
headers = "[" + pretty_list(headers) + "]"
with open(self.path, "ab") as targets_file:
targets_file.write(
targets_cfg.rocksdb_library_template.format(

View File

@ -1,5 +1,4 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals
rocksdb_target_header_template = """# This file \100generated by:
#$ python3 buckifier/buckify_rocksdb.py{extra_argv}

View File

@ -2,7 +2,6 @@
"""
This module keeps commonly used components.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
try:
from builtins import object

View File

@ -25,7 +25,6 @@
#
# The solution is to move the include out of the #ifdef.
from __future__ import print_function
import argparse
import re
@ -62,7 +61,7 @@ def expand_include(
included.add(include_path)
with open(include_path) as f:
print('#line 1 "{}"'.format(include_path), file=source_out)
print(f'#line 1 "{include_path}"', file=source_out)
process_file(
f, include_path, source_out, header_out, include_paths, public_include_paths
)
@ -118,7 +117,7 @@ def process_file(
)
if expanded:
print('#line {} "{}"'.format(line + 1, abs_path), file=source_out)
print(f'#line {line + 1} "{abs_path}"', file=source_out)
elif text != "#pragma once\n":
source_out.write(text)
@ -157,8 +156,8 @@ def main():
with open(filename) as f, open(args.source_out, "w") as source_out, open(
args.header_out, "w"
) as header_out:
print('#line 1 "{}"'.format(filename), file=source_out)
print('#include "{}"'.format(header_out.name), file=source_out)
print(f'#line 1 "{filename}"', file=source_out)
print(f'#include "{header_out.name}"', file=source_out)
process_file(
f, abs_path, source_out, header_out, include_paths, public_include_paths
)

View File

@ -102,7 +102,7 @@ class BenchmarkUtils:
class ResultParser:
def __init__(self, field="(\w|[+-:.%])+", intrafield="(\s)+", separator="\t"):
def __init__(self, field=r"(\w|[+-:.%])+", intrafield=r"(\s)+", separator="\t"):
self.field = re.compile(field)
self.intra = re.compile(intrafield)
self.sep = re.compile(separator)
@ -159,7 +159,7 @@ class ResultParser:
def load_report_from_tsv(filename: str):
file = open(filename, "r")
file = open(filename)
contents = file.readlines()
file.close()
parser = ResultParser()

View File

@ -9,7 +9,6 @@
- Prints those error messages to stdout
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import re
import sys
@ -43,7 +42,7 @@ class GTestErrorParser(ErrorParserBase):
return None
gtest_fail_match = self._GTEST_FAIL_PATTERN.match(line)
if gtest_fail_match:
return "%s failed: %s" % (self._last_gtest_name, gtest_fail_match.group(1))
return "{} failed: {}".format(self._last_gtest_name, gtest_fail_match.group(1))
return None
@ -66,52 +65,52 @@ class CompilerErrorParser(MatchErrorParser):
# format (link error):
# '<filename>:<line #>: error: <error msg>'
# The below regex catches both
super(CompilerErrorParser, self).__init__(r"\S+:\d+: error:")
super().__init__(r"\S+:\d+: error:")
class ScanBuildErrorParser(MatchErrorParser):
def __init__(self):
super(ScanBuildErrorParser, self).__init__(r"scan-build: \d+ bugs found.$")
super().__init__(r"scan-build: \d+ bugs found.$")
class DbCrashErrorParser(MatchErrorParser):
def __init__(self):
super(DbCrashErrorParser, self).__init__(r"\*\*\*.*\^$|TEST FAILED.")
super().__init__(r"\*\*\*.*\^$|TEST FAILED.")
class WriteStressErrorParser(MatchErrorParser):
def __init__(self):
super(WriteStressErrorParser, self).__init__(
super().__init__(
r"ERROR: write_stress died with exitcode=\d+"
)
class AsanErrorParser(MatchErrorParser):
def __init__(self):
super(AsanErrorParser, self).__init__(r"==\d+==ERROR: AddressSanitizer:")
super().__init__(r"==\d+==ERROR: AddressSanitizer:")
class UbsanErrorParser(MatchErrorParser):
def __init__(self):
# format: '<filename>:<line #>:<column #>: runtime error: <error msg>'
super(UbsanErrorParser, self).__init__(r"\S+:\d+:\d+: runtime error:")
super().__init__(r"\S+:\d+:\d+: runtime error:")
class ValgrindErrorParser(MatchErrorParser):
def __init__(self):
# just grab the summary, valgrind doesn't clearly distinguish errors
# from other log messages.
super(ValgrindErrorParser, self).__init__(r"==\d+== ERROR SUMMARY:")
super().__init__(r"==\d+== ERROR SUMMARY:")
class CompatErrorParser(MatchErrorParser):
def __init__(self):
super(CompatErrorParser, self).__init__(r"==== .*[Ee]rror.* ====$")
super().__init__(r"==== .*[Ee]rror.* ====$")
class TsanErrorParser(MatchErrorParser):
def __init__(self):
super(TsanErrorParser, self).__init__(r"WARNING: ThreadSanitizer:")
super().__init__(r"WARNING: ThreadSanitizer:")
_TEST_NAME_TO_PARSERS = {

cache/cache.cc vendored (30 changes)
View File

@ -133,19 +133,25 @@ Status Cache::CreateFromString(const ConfigOptions& config_options,
std::shared_ptr<Cache>* result) {
Status status;
std::shared_ptr<Cache> cache;
if (value.find('=') == std::string::npos) {
cache = NewLRUCache(ParseSizeT(value));
} else {
LRUCacheOptions cache_opts;
status = OptionTypeInfo::ParseStruct(config_options, "",
&lru_cache_options_type_info, "",
value, &cache_opts);
if (status.ok()) {
cache = NewLRUCache(cache_opts);
if (StartsWith(value, "null")) {
cache = nullptr;
} else if (value.find("://") == std::string::npos) {
if (value.find('=') == std::string::npos) {
cache = NewLRUCache(ParseSizeT(value));
} else {
LRUCacheOptions cache_opts;
status = OptionTypeInfo::ParseStruct(config_options, "",
&lru_cache_options_type_info, "",
value, &cache_opts);
if (status.ok()) {
cache = NewLRUCache(cache_opts);
}
}
}
if (status.ok()) {
result->swap(cache);
if (status.ok()) {
result->swap(cache);
}
} else {
status = LoadSharedObject<Cache>(config_options, value, result);
}
return status;
}

cache/cache_test.cc vendored (26 changes)
View File

@ -886,6 +886,32 @@ TEST_P(CacheTest, ApplyToAllEntriesDuringResize) {
ASSERT_EQ(special_count, kSpecialCount);
}
TEST_P(CacheTest, ApplyToHandleTest) {
std::string callback_state;
const auto callback = [&](const Slice& key, Cache::ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper) {
callback_state = std::to_string(DecodeKey(key)) + "," +
std::to_string(DecodeValue(value)) + "," +
std::to_string(charge);
assert(helper == &CacheTest::kHelper);
};
std::vector<std::string> inserted;
for (int i = 0; i < 10; ++i) {
Insert(i, i * 2, i + 1);
inserted.push_back(std::to_string(i) + "," + std::to_string(i * 2) + "," +
std::to_string(i + 1));
}
for (int i = 0; i < 10; ++i) {
Cache::Handle* handle = cache_->Lookup(EncodeKey(i));
cache_->ApplyToHandle(cache_.get(), handle, callback);
EXPECT_EQ(inserted[i], callback_state);
cache_->Release(handle);
}
}
TEST_P(CacheTest, DefaultShardBits) {
// Prevent excessive allocation (to save time & space)
estimated_value_size_ = 100000;

cache/clock_cache.cc vendored (16 changes)
View File

@ -1444,6 +1444,22 @@ const Cache::CacheItemHelper* BaseHyperClockCache<Table>::GetCacheItemHelper(
return h->helper;
}
template <class Table>
void BaseHyperClockCache<Table>::ApplyToHandle(
Cache* cache, Handle* handle,
const std::function<void(const Slice& key, Cache::ObjectPtr value,
size_t charge, const CacheItemHelper* helper)>&
callback) {
BaseHyperClockCache<Table>* cache_ptr =
static_cast<BaseHyperClockCache<Table>*>(cache);
auto h = static_cast<const typename Table::HandleImpl*>(handle);
UniqueId64x2 unhashed;
auto hash_seed = cache_ptr->GetShard(h->GetHash()).GetTable().GetHashSeed();
callback(
ClockCacheShard<Table>::ReverseHash(h->hashed_key, &unhashed, hash_seed),
h->value, h->GetTotalCharge(), h->helper);
}
namespace {
// For each cache shard, estimate what the table load factor would be if

cache/clock_cache.h vendored (6 changes)
View File

@ -1128,6 +1128,12 @@ class BaseHyperClockCache : public ShardedCache<ClockCacheShard<Table>> {
const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
void ApplyToHandle(
Cache* cache, Handle* handle,
const std::function<void(const Slice& key, Cache::ObjectPtr obj,
size_t charge, const CacheItemHelper* helper)>&
callback) override;
void ReportProblems(
const std::shared_ptr<Logger>& /*info_log*/) const override;
};

View File

@ -79,7 +79,11 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1,
static_cast<uint32_t*>(&source_32));
source = static_cast<CacheTier>(source_32);
handle_value_charge -= (data_ptr - ptr->get());
uint64_t data_size = 0;
data_ptr = GetVarint64Ptr(data_ptr, ptr->get() + handle_value_charge,
static_cast<uint64_t*>(&data_size));
assert(handle_value_charge > data_size);
handle_value_charge = data_size;
}
MemoryAllocator* allocator = cache_options_.memory_allocator.get();
@ -169,13 +173,15 @@ Status CompressedSecondaryCache::InsertInternal(
}
auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge);
char header[10];
char header[20];
char* payload = header;
payload = EncodeVarint32(payload, static_cast<uint32_t>(type));
payload = EncodeVarint32(payload, static_cast<uint32_t>(source));
size_t data_size = (*helper->size_cb)(value);
char* data_size_ptr = payload;
payload = EncodeVarint64(payload, data_size);
size_t header_size = payload - header;
size_t data_size = (*helper->size_cb)(value);
size_t total_size = data_size + header_size;
CacheAllocationPtr ptr =
AllocateBlock(total_size, cache_options_.memory_allocator.get());
@ -210,6 +216,8 @@ Status CompressedSecondaryCache::InsertInternal(
val = Slice(compressed_val);
data_size = compressed_val.size();
payload = EncodeVarint64(data_size_ptr, data_size);
header_size = payload - header;
total_size = header_size + data_size;
PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, data_size);
@ -222,14 +230,21 @@ Status CompressedSecondaryCache::InsertInternal(
PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1);
if (cache_options_.enable_custom_split_merge) {
size_t charge{0};
CacheValueChunk* value_chunks_head =
SplitValueIntoChunks(val, cache_options_.compression_type, charge);
return cache_->Insert(key, value_chunks_head, internal_helper, charge);
size_t split_charge{0};
CacheValueChunk* value_chunks_head = SplitValueIntoChunks(
val, cache_options_.compression_type, split_charge);
return cache_->Insert(key, value_chunks_head, internal_helper,
split_charge);
} else {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
size_t charge = malloc_usable_size(ptr.get());
#else
size_t charge = total_size;
#endif
std::memcpy(ptr.get(), header, header_size);
CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
return cache_->Insert(key, buf, internal_helper, total_size);
charge += sizeof(CacheAllocationPtr);
return cache_->Insert(key, buf, internal_helper, charge);
}
}
@ -398,6 +413,21 @@ const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
}
}
size_t CompressedSecondaryCache::TEST_GetCharge(const Slice& key) {
Cache::Handle* lru_handle = cache_->Lookup(key);
if (lru_handle == nullptr) {
return 0;
}
size_t charge = cache_->GetCharge(lru_handle);
if (cache_->Value(lru_handle) != nullptr &&
!cache_options_.enable_custom_split_merge) {
charge -= 10;
}
cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
return charge;
}
std::shared_ptr<SecondaryCache>
CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const {
return std::make_shared<CompressedSecondaryCache>(*this);

View File

@ -139,6 +139,8 @@ class CompressedSecondaryCache : public SecondaryCache {
const Cache::CacheItemHelper* helper,
CompressionType type, CacheTier source);
size_t TEST_GetCharge(const Slice& key);
// TODO: clean up to use cleaner interfaces in typed_cache.h
const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const;
std::shared_ptr<Cache> cache_;

View File

@ -39,6 +39,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
protected:
void BasicTestHelper(std::shared_ptr<SecondaryCache> sec_cache,
bool sec_cache_is_compressed) {
CompressedSecondaryCache* comp_sec_cache =
static_cast<CompressedSecondaryCache*>(sec_cache.get());
get_perf_context()->Reset();
bool kept_in_sec_cache{true};
// Lookup a non-existent key.
@ -66,6 +68,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
ASSERT_GT(comp_sec_cache->TEST_GetCharge(key1), 1000);
std::unique_ptr<SecondaryCacheResultHandle> handle1_2 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true,
/*stats=*/nullptr, kept_in_sec_cache);

cache/lru_cache.cc vendored (11 changes)
View File

@ -677,6 +677,17 @@ const Cache::CacheItemHelper* LRUCache::GetCacheItemHelper(
return h->helper;
}
void LRUCache::ApplyToHandle(
Cache* cache, Handle* handle,
const std::function<void(const Slice& key, ObjectPtr value, size_t charge,
const CacheItemHelper* helper)>& callback) {
auto cache_ptr = static_cast<LRUCache*>(cache);
auto h = static_cast<const LRUHandle*>(handle);
callback(h->key(), h->value,
h->GetCharge(cache_ptr->GetShard(0).metadata_charge_policy_),
h->helper);
}
size_t LRUCache::TEST_GetLRUSize() {
return SumOverShards([](LRUCacheShard& cs) { return cs.TEST_GetLRUSize(); });
}

cache/lru_cache.h vendored (6 changes)
View File

@ -452,6 +452,12 @@ class LRUCache
size_t GetCharge(Handle* handle) const override;
const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
void ApplyToHandle(
Cache* cache, Handle* handle,
const std::function<void(const Slice& key, ObjectPtr obj, size_t charge,
const CacheItemHelper* helper)>& callback)
override;
// Retrieves number of elements in LRU, for unit test purpose only.
size_t TEST_GetLRUSize();
// Retrieves high pri pool ratio.

View File

@ -271,7 +271,8 @@ Status CacheWithSecondaryAdapter::Insert(const Slice& key, ObjectPtr value,
// Warm up the secondary cache with the compressed block. The secondary
// cache may choose to ignore it based on the admission policy.
if (value != nullptr && !compressed_value.empty() &&
adm_policy_ == TieredAdmissionPolicy::kAdmPolicyThreeQueue) {
adm_policy_ == TieredAdmissionPolicy::kAdmPolicyThreeQueue &&
helper->IsSecondaryCacheCompatible()) {
Status status = secondary_cache_->InsertSaved(key, compressed_value, type);
assert(status.ok() || status.IsNotSupported());
}

View File

@ -253,6 +253,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicTest) {
table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
// Disable paranoid_file_checks so that flush will not read back the newly
@ -364,6 +365,7 @@ TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) {
table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.paranoid_file_checks = false;
@ -506,6 +508,7 @@ TEST_F(DBTieredSecondaryCacheTest, WaitAllTest) {
table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.paranoid_file_checks = false;
@ -606,6 +609,7 @@ TEST_F(DBTieredSecondaryCacheTest, ReadyBeforeWaitAllTest) {
table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.statistics = CreateDBStatistics();
@ -717,6 +721,7 @@ TEST_F(DBTieredSecondaryCacheTest, IterateTest) {
table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.paranoid_file_checks = false;
@ -760,6 +765,54 @@ TEST_F(DBTieredSecondaryCacheTest, IterateTest) {
Destroy(options);
}
TEST_F(DBTieredSecondaryCacheTest, VolatileTierTest) {
if (!LZ4_Supported()) {
ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
return;
}
BlockBasedTableOptions table_options;
// We want a block cache of size 5KB, and a compressed secondary cache of
// size 5KB. However, we specify a block cache size of 256KB here in order
// to take into account the cache reservation in the block cache on
// behalf of the compressed cache. The unit of cache reservation is 256KB.
// The effective block cache capacity will be calculated as 256 + 5 = 261KB,
// and 256KB will be reserved for the compressed cache, leaving 5KB for
// the primary block cache. We only have to worry about this here because
// the cache size is so small.
table_options.block_cache = NewCache(256 * 1024, 5 * 1024, 256 * 1024);
table_options.block_size = 4 * 1024;
table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
// Disable paranoid_file_checks so that flush will not read back the newly
// written file
options.paranoid_file_checks = false;
options.lowest_used_cache_tier = CacheTier::kVolatileTier;
DestroyAndReopen(options);
Random rnd(301);
const int N = 256;
for (int i = 0; i < N; i++) {
std::string p_v;
test::CompressibleString(&rnd, 0.5, 1007, &p_v);
ASSERT_OK(Put(Key(i), p_v));
}
ASSERT_OK(Flush());
// Since lowest_used_cache_tier is the volatile tier, nothing should be
// inserted in the secondary cache.
std::string v = Get(Key(0));
ASSERT_EQ(1007, v.size());
ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 0u);
ASSERT_EQ(nvm_sec_cache()->num_misses(), 0u);
Destroy(options);
}
class DBTieredAdmPolicyTest
: public DBTieredSecondaryCacheTest,
public testing::WithParamInterface<TieredAdmissionPolicy> {};
@ -784,6 +837,7 @@ TEST_P(DBTieredAdmPolicyTest, CompressedOnlyTest) {
table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
size_t comp_cache_usage = compressed_secondary_cache()->TEST_GetUsage();
@ -836,6 +890,7 @@ TEST_P(DBTieredAdmPolicyTest, CompressedCacheAdmission) {
table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
size_t comp_cache_usage = compressed_secondary_cache()->TEST_GetUsage();
@ -937,6 +992,7 @@ TEST_F(DBTieredSecondaryCacheTest, FSBufferTest) {
table_options.cache_index_and_filter_blocks = false;
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.compression = kLZ4Compression;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.statistics = CreateDBStatistics();
options.env = wrap_env.get();

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import print_function
import optparse
import re
@ -109,11 +108,11 @@ def report_coverage():
# Check if we need to display coverage info for interested files.
if len(interested_files):
per_file_coverage = dict(
(fname, per_file_coverage[fname])
per_file_coverage = {
fname: per_file_coverage[fname]
for fname in interested_files
if fname in per_file_coverage
)
}
# If we are only interested in several files, it makes no sense to report
# the total_coverage
total_coverage = None

View File

@ -45,20 +45,23 @@ void ArenaWrappedDBIter::Init(
const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
uint64_t version_number, ReadCallback* read_callback,
ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) {
auto mem = arena_.AllocateAligned(sizeof(DBIter));
db_iter_ = new (mem) DBIter(
env, read_options, ioptions, mutable_cf_options, ioptions.user_comparator,
/* iter */ nullptr, version, sequence, true,
max_sequential_skip_in_iteration, read_callback, cfh, expose_blob_index);
sv_number_ = version_number;
read_options_ = read_options;
allow_refresh_ = allow_refresh;
memtable_range_tombstone_iter_ = nullptr;
if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
FSSupportedOps::kAsyncIO)) {
read_options_.async_io = false;
}
read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only;
auto mem = arena_.AllocateAligned(sizeof(DBIter));
db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options,
ioptions.user_comparator,
/* iter */ nullptr, version, sequence, true,
max_sequential_skip_in_iteration, read_callback,
cfh, expose_blob_index);
sv_number_ = version_number;
allow_refresh_ = allow_refresh;
memtable_range_tombstone_iter_ = nullptr;
}
Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); }

View File

@ -83,6 +83,8 @@ class ArenaWrappedDBIter : public Iterator {
Status Refresh() override;
Status Refresh(const Snapshot*) override;
bool PrepareValue() override { return db_iter_->PrepareValue(); }
void Init(Env* env, const ReadOptions& read_options,
const ImmutableOptions& ioptions,
const MutableCFOptions& mutable_cf_options, const Version* version,

View File

@ -13,14 +13,11 @@ namespace ROCKSDB_NAMESPACE {
class AttributeGroupIteratorImpl : public AttributeGroupIterator {
public:
AttributeGroupIteratorImpl(
const Comparator* comparator,
const std::vector<ColumnFamilyHandle*>& column_families,
const std::vector<Iterator*>& child_iterators)
: impl_(
comparator, column_families, child_iterators, [this]() { Reset(); },
[this](const autovector<MultiCfIteratorInfo>& items) {
AddToAttributeGroups(items);
}) {}
const ReadOptions& read_options, const Comparator* comparator,
std::vector<std::pair<ColumnFamilyHandle*, std::unique_ptr<Iterator>>>&&
cfh_iter_pairs)
: impl_(read_options, comparator, std::move(cfh_iter_pairs),
ResetFunc(this), PopulateFunc(this)) {}
~AttributeGroupIteratorImpl() override {}
// No copy allowed
@ -45,8 +42,36 @@ class AttributeGroupIteratorImpl : public AttributeGroupIterator {
void Reset() { attribute_groups_.clear(); }
bool PrepareValue() override { return impl_.PrepareValue(); }
private:
MultiCfIteratorImpl impl_;
class ResetFunc {
public:
explicit ResetFunc(AttributeGroupIteratorImpl* iter) : iter_(iter) {}
void operator()() const {
assert(iter_);
iter_->Reset();
}
private:
AttributeGroupIteratorImpl* iter_;
};
class PopulateFunc {
public:
explicit PopulateFunc(AttributeGroupIteratorImpl* iter) : iter_(iter) {}
void operator()(const autovector<MultiCfIteratorInfo>& items) const {
assert(iter_);
iter_->AddToAttributeGroups(items);
}
private:
AttributeGroupIteratorImpl* iter_;
};
MultiCfIteratorImpl<ResetFunc, PopulateFunc> impl_;
IteratorAttributeGroups attribute_groups_;
void AddToAttributeGroups(const autovector<MultiCfIteratorInfo>& items);
};

View File

@ -42,6 +42,7 @@ Status BlobFileCache::GetBlobFileReader(
assert(blob_file_reader);
assert(blob_file_reader->IsEmpty());
// NOTE: sharing same Cache with table_cache
const Slice key = GetSliceForKey(&blob_file_number);
assert(cache_);
@ -98,4 +99,13 @@ Status BlobFileCache::GetBlobFileReader(
return Status::OK();
}
void BlobFileCache::Evict(uint64_t blob_file_number) {
// NOTE: sharing same Cache with table_cache
const Slice key = GetSliceForKey(&blob_file_number);
assert(cache_);
cache_.get()->Erase(key);
}
} // namespace ROCKSDB_NAMESPACE

View File

@ -36,6 +36,15 @@ class BlobFileCache {
uint64_t blob_file_number,
CacheHandleGuard<BlobFileReader>* blob_file_reader);
// Called when a blob file is obsolete to ensure it is removed from the cache
// to avoid effectively leaking the open file and associated memory
void Evict(uint64_t blob_file_number);
// Used to identify cache entries for blob files (not normally useful)
static const Cache::CacheItemHelper* GetHelper() {
return CacheInterface::GetBasicHelper();
}
private:
using CacheInterface =
BasicTypedCacheInterface<BlobFileReader, CacheEntryRole::kMisc>;

View File

@ -20,23 +20,24 @@
namespace ROCKSDB_NAMESPACE {
BlobSource::BlobSource(const ImmutableOptions* immutable_options,
BlobSource::BlobSource(const ImmutableOptions& immutable_options,
const MutableCFOptions& mutable_cf_options,
const std::string& db_id,
const std::string& db_session_id,
BlobFileCache* blob_file_cache)
: db_id_(db_id),
db_session_id_(db_session_id),
statistics_(immutable_options->statistics.get()),
statistics_(immutable_options.statistics.get()),
blob_file_cache_(blob_file_cache),
blob_cache_(immutable_options->blob_cache),
lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) {
blob_cache_(immutable_options.blob_cache),
lowest_used_cache_tier_(immutable_options.lowest_used_cache_tier) {
auto bbto =
immutable_options->table_factory->GetOptions<BlockBasedTableOptions>();
mutable_cf_options.table_factory->GetOptions<BlockBasedTableOptions>();
if (bbto &&
bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
.charged == CacheEntryRoleOptions::Decision::kEnabled) {
blob_cache_ = SharedCacheInterface{std::make_shared<ChargedCache>(
immutable_options->blob_cache, bbto->block_cache)};
immutable_options.blob_cache, bbto->block_cache)};
}
}

View File

@ -21,6 +21,7 @@
namespace ROCKSDB_NAMESPACE {
struct ImmutableOptions;
struct MutableCFOptions;
class Status;
class FilePrefetchBuffer;
class Slice;
@ -31,7 +32,10 @@ class Slice;
// storage with minimal cost.
class BlobSource {
public:
BlobSource(const ImmutableOptions* immutable_options,
// NOTE: db_id, db_session_id, and blob_file_cache are saved by reference or
// pointer.
BlobSource(const ImmutableOptions& immutable_options,
const MutableCFOptions& mutable_cf_options,
const std::string& db_id, const std::string& db_session_id,
BlobFileCache* blob_file_cache);

View File

@ -148,6 +148,7 @@ TEST_F(BlobSourceTest, GetBlobsFromCache) {
DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr uint32_t column_family_id = 1;
constexpr bool has_ttl = false;
@ -193,8 +194,8 @@ TEST_F(BlobSourceTest, GetBlobsFromCache) {
backing_cache.get(), &immutable_options, &file_options,
column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
blob_file_cache.get());
BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
db_session_id_, blob_file_cache.get());
ReadOptions read_options;
read_options.verify_checksums = true;
@ -464,6 +465,7 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) {
DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr uint32_t column_family_id = 1;
constexpr bool has_ttl = false;
@ -498,8 +500,8 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) {
backing_cache.get(), &immutable_options, &file_options,
column_family_id, nullptr /*HistogramImpl*/, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
blob_file_cache.get());
BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
db_session_id_, blob_file_cache.get());
ReadOptions read_options;
read_options.verify_checksums = true;
@ -589,6 +591,7 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) {
DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr uint32_t column_family_id = 1;
constexpr bool has_ttl = false;
@ -644,8 +647,8 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) {
backing_cache.get(), &immutable_options, &file_options,
column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
blob_file_cache.get());
BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
db_session_id_, blob_file_cache.get());
ReadOptions read_options;
read_options.verify_checksums = true;
@ -782,6 +785,7 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromCache) {
DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr uint32_t column_family_id = 1;
constexpr bool has_ttl = false;
@ -827,8 +831,8 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromCache) {
backing_cache.get(), &immutable_options, &file_options,
column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
blob_file_cache.get());
BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
db_session_id_, blob_file_cache.get());
ReadOptions read_options;
read_options.verify_checksums = true;
@ -1105,6 +1109,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr uint32_t column_family_id = 1;
constexpr bool has_ttl = false;
@ -1137,8 +1142,8 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
backing_cache.get(), &immutable_options, &file_options, column_family_id,
blob_file_read_hist, nullptr /*IOTracer*/));
BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
blob_file_cache.get());
BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
db_session_id_, blob_file_cache.get());
CacheHandleGuard<BlobFileReader> file_reader;
ReadOptions read_options;
@ -1405,6 +1410,7 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr ExpirationRange expiration_range;
@ -1426,8 +1432,8 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
backing_cache.get(), &immutable_options, &file_options,
kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
blob_file_cache.get());
BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
db_session_id_, blob_file_cache.get());
ConcurrentCacheReservationManager* cache_res_mgr =
static_cast<ChargedCache*>(blob_source.GetBlobCache())
@ -1519,6 +1525,8 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) {
DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_);
MutableCFOptions mutable_cf_options(options_);
constexpr size_t blob_size = 24 << 10; // 24KB
for (size_t i = 0; i < kNumBlobs; ++i) {
blob_file_size_ -= blobs_[i].size(); // old blob size
@ -1546,8 +1554,8 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) {
backing_cache.get(), &immutable_options, &file_options,
kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
blob_file_cache.get());
BlobSource blob_source(immutable_options, mutable_cf_options, db_id_,
db_session_id_, blob_file_cache.get());
ConcurrentCacheReservationManager* cache_res_mgr =
static_cast<ChargedCache*>(blob_source.GetBlobCache())

View File

@ -374,6 +374,115 @@ TEST_F(DBBlobBasicTest, IterateBlobsFromCachePinning) {
}
}
TEST_F(DBBlobBasicTest, IterateBlobsAllowUnpreparedValue) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
Reopen(options);
constexpr size_t num_blobs = 5;
std::vector<std::string> keys;
std::vector<std::string> blobs;
for (size_t i = 0; i < num_blobs; ++i) {
keys.emplace_back("key" + std::to_string(i));
blobs.emplace_back("blob" + std::to_string(i));
ASSERT_OK(Put(keys[i], blobs[i]));
}
ASSERT_OK(Flush());
ReadOptions read_options;
read_options.allow_unprepared_value = true;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
{
size_t i = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ASSERT_EQ(iter->key(), keys[i]);
ASSERT_TRUE(iter->value().empty());
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->PrepareValue());
ASSERT_EQ(iter->key(), keys[i]);
ASSERT_EQ(iter->value(), blobs[i]);
ASSERT_OK(iter->status());
++i;
}
ASSERT_OK(iter->status());
ASSERT_EQ(i, num_blobs);
}
{
size_t i = 0;
for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]);
ASSERT_TRUE(iter->value().empty());
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->PrepareValue());
ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]);
ASSERT_EQ(iter->value(), blobs[num_blobs - 1 - i]);
ASSERT_OK(iter->status());
++i;
}
ASSERT_OK(iter->status());
ASSERT_EQ(i, num_blobs);
}
{
size_t i = 1;
for (iter->Seek(keys[i]); iter->Valid(); iter->Next()) {
ASSERT_EQ(iter->key(), keys[i]);
ASSERT_TRUE(iter->value().empty());
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->PrepareValue());
ASSERT_EQ(iter->key(), keys[i]);
ASSERT_EQ(iter->value(), blobs[i]);
ASSERT_OK(iter->status());
++i;
}
ASSERT_OK(iter->status());
ASSERT_EQ(i, num_blobs);
}
{
size_t i = 1;
for (iter->SeekForPrev(keys[num_blobs - 1 - i]); iter->Valid();
iter->Prev()) {
ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]);
ASSERT_TRUE(iter->value().empty());
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->PrepareValue());
ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]);
ASSERT_EQ(iter->value(), blobs[num_blobs - 1 - i]);
ASSERT_OK(iter->status());
++i;
}
ASSERT_OK(iter->status());
ASSERT_EQ(i, num_blobs);
}
}
TEST_F(DBBlobBasicTest, MultiGetBlobs) {
constexpr size_t min_blob_size = 6;
@ -1655,6 +1764,46 @@ TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) {
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_P(DBBlobBasicIOErrorTest, IterateBlobsAllowUnpreparedValue_IOError) {
Options options;
options.env = fault_injection_env_.get();
options.enable_blob_files = true;
Reopen(options);
constexpr char key[] = "key";
constexpr char blob_value[] = "blob_value";
ASSERT_OK(Put(key, blob_value));
ASSERT_OK(Flush());
SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
fault_injection_env_->SetFilesystemActive(false,
Status::IOError(sync_point_));
});
SyncPoint::GetInstance()->EnableProcessing();
ReadOptions read_options;
read_options.allow_unprepared_value = true;
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(iter->key(), key);
ASSERT_TRUE(iter->value().empty());
ASSERT_OK(iter->status());
ASSERT_FALSE(iter->PrepareValue());
ASSERT_FALSE(iter->Valid());
ASSERT_TRUE(iter->status().IsIOError());
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) {
Options options = GetDefaultOptions();

View File

@ -53,7 +53,7 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
assert((tboptions.column_family_id ==
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
tboptions.column_family_name.empty());
return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file);
return tboptions.moptions.table_factory->NewTableBuilder(tboptions, file);
}
Status BuildTable(
@ -206,10 +206,6 @@ Status BuildTable(
/*compaction=*/nullptr, compaction_filter.get(),
/*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low);
const size_t ts_sz = ucmp->timestamp_size();
const bool logical_strip_timestamp =
ts_sz > 0 && !ioptions.persist_user_defined_timestamps;
SequenceNumber smallest_preferred_seqno = kMaxSequenceNumber;
std::string key_after_flush_buf;
std::string value_buf;
@ -222,16 +218,6 @@ Status BuildTable(
Slice key_after_flush = key_after_flush_buf;
Slice value_after_flush = value;
// If user defined timestamps will be stripped from user key after flush,
// the in memory version of the key act logically the same as one with a
// minimum timestamp. We update the timestamp here so file boundary and
// output validator, block builder all see the effect of the stripping.
if (logical_strip_timestamp) {
key_after_flush_buf.clear();
ReplaceInternalKeyWithMinTimestamp(&key_after_flush_buf, key, ts_sz);
key_after_flush = key_after_flush_buf;
}
if (ikey.type == kTypeValuePreferredSeqno) {
auto [unpacked_value, unix_write_time] =
ParsePackedValueWithWriteTime(value);
@ -291,11 +277,7 @@ Status BuildTable(
Slice last_tombstone_start_user_key{};
for (range_del_it->SeekToFirst(); range_del_it->Valid();
range_del_it->Next()) {
// When user timestamp should not be persisted, we logically strip a
// range tombstone's start and end key's timestamp (replace it with min
// timestamp) before passing them along to table builder and to update
// file boundaries.
auto tombstone = range_del_it->Tombstone(logical_strip_timestamp);
auto tombstone = range_del_it->Tombstone();
std::pair<InternalKey, Slice> kv = tombstone.Serialize();
builder->Add(kv.first.Encode(), kv.second);
InternalKey tombstone_end = tombstone.SerializeEndKey();
@ -438,8 +420,7 @@ Status BuildTable(
// the goal is to cache it here for further user reads.
std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
tboptions.read_options, file_options, tboptions.internal_comparator,
*meta, nullptr /* range_del_agg */,
mutable_cf_options.prefix_extractor, nullptr,
*meta, nullptr /* range_del_agg */, mutable_cf_options, nullptr,
(internal_stats == nullptr) ? nullptr
: internal_stats->GetFileReadHist(0),
TableReaderCaller::kFlush, /*arena=*/nullptr,
@ -447,8 +428,7 @@ Status BuildTable(
MaxFileSizeForL0MetaPin(mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key*/ nullptr,
/*allow_unprepared_value*/ false,
mutable_cf_options.block_protection_bytes_per_key));
/*allow_unprepared_value*/ false));
s = it->status();
if (s.ok() && paranoid_file_checks) {
OutputValidator file_validator(tboptions.internal_comparator,
@ -480,9 +460,18 @@ Status BuildTable(
Status prepare =
WritableFileWriter::PrepareIOOptions(tboptions.write_options, opts);
if (prepare.ok()) {
// FIXME: track file for "slow" deletion, e.g. into the
// VersionSet::obsolete_files_ pipeline
Status ignored = fs->DeleteFile(fname, opts, dbg);
ignored.PermitUncheckedError();
}
// Ensure we don't leak table cache entries when throwing away output
// files. (The usual logic in PurgeObsoleteFiles is not applicable because
// this function deletes the obsolete file itself, whereas such files should
// probably go into the VersionSet::obsolete_files_ pipeline.)
TableCache::ReleaseObsolete(table_cache->get_cache().get(),
meta->fd.GetNumber(), nullptr /*handle*/,
mutable_cf_options.uncache_aggressiveness);
}
assert(blob_file_additions || blob_file_paths.empty());

View File

@ -4075,6 +4075,15 @@ void rocksdb_options_set_write_dbid_to_manifest(
opt->rep.write_dbid_to_manifest = write_dbid_to_manifest;
}
unsigned char rocksdb_options_get_write_identity_file(rocksdb_options_t* opt) {
return opt->rep.write_identity_file;
}
void rocksdb_options_set_write_identity_file(
rocksdb_options_t* opt, unsigned char write_identity_file) {
opt->rep.write_identity_file = write_identity_file;
}
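A hedged sketch (not part of this change) of how a caller might use the new C API pair; the semantics of write_identity_file are assumed to mirror the C++ option of the same name (skip writing the IDENTITY file and rely on the DB ID recorded in the MANIFEST), which this hunk does not spell out:
rocksdb_options_t* opts = rocksdb_options_create();
rocksdb_options_set_write_dbid_to_manifest(opts, 1);  // keep the DB ID in the MANIFEST
rocksdb_options_set_write_identity_file(opts, 0);     // assumed: no IDENTITY file is written
assert(rocksdb_options_get_write_identity_file(opts) == 0);
rocksdb_options_destroy(opts);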
unsigned char rocksdb_options_get_track_and_verify_wals_in_manifest(
rocksdb_options_t* opt) {
return opt->rep.track_and_verify_wals_in_manifest;

View File

@ -772,6 +772,8 @@ int main(int argc, char** argv) {
rocksdb_options_set_write_buffer_size(options, 100000);
rocksdb_options_set_paranoid_checks(options, 1);
rocksdb_options_set_max_open_files(options, 10);
/* Compatibility with how the test was written */
rocksdb_options_set_write_dbid_to_manifest(options, 0);
table_options = rocksdb_block_based_options_create();
rocksdb_block_based_options_set_block_cache(table_options, cache);
@ -962,15 +964,24 @@ int main(int argc, char** argv) {
rocksdb_options_t* options_dbid_in_manifest = rocksdb_options_create();
rocksdb_options_set_create_if_missing(options_dbid_in_manifest, 1);
rocksdb_options_set_write_dbid_to_manifest(options_dbid_in_manifest, false);
unsigned char write_to_manifest =
rocksdb_options_get_write_dbid_to_manifest(options_dbid_in_manifest);
CheckCondition(!write_to_manifest);
rocksdb_options_set_write_dbid_to_manifest(options_dbid_in_manifest, true);
CheckCondition(!write_to_manifest);
write_to_manifest =
rocksdb_options_get_write_dbid_to_manifest(options_dbid_in_manifest);
CheckCondition(write_to_manifest);
rocksdb_options_set_write_identity_file(options_dbid_in_manifest, true);
unsigned char write_identity_file =
rocksdb_options_get_write_identity_file(options_dbid_in_manifest);
CheckCondition(write_identity_file);
rocksdb_options_set_write_identity_file(options_dbid_in_manifest, false);
write_identity_file =
rocksdb_options_get_write_identity_file(options_dbid_in_manifest);
CheckCondition(!write_identity_file);
db = rocksdb_open(options_dbid_in_manifest, dbbackupname, &err);
CheckNoError(err);

View File

@ -12,14 +12,12 @@ namespace ROCKSDB_NAMESPACE {
// EXPERIMENTAL
class CoalescingIterator : public Iterator {
public:
CoalescingIterator(const Comparator* comparator,
const std::vector<ColumnFamilyHandle*>& column_families,
const std::vector<Iterator*>& child_iterators)
: impl_(
comparator, column_families, child_iterators, [this]() { Reset(); },
[this](const autovector<MultiCfIteratorInfo>& items) {
Coalesce(items);
}) {}
CoalescingIterator(
const ReadOptions& read_options, const Comparator* comparator,
std::vector<std::pair<ColumnFamilyHandle*, std::unique_ptr<Iterator>>>&&
cfh_iter_pairs)
: impl_(read_options, comparator, std::move(cfh_iter_pairs),
ResetFunc(this), PopulateFunc(this)) {}
~CoalescingIterator() override {}
// No copy allowed
@ -50,8 +48,36 @@ class CoalescingIterator : public Iterator {
wide_columns_.clear();
}
bool PrepareValue() override { return impl_.PrepareValue(); }
private:
MultiCfIteratorImpl impl_;
class ResetFunc {
public:
explicit ResetFunc(CoalescingIterator* iter) : iter_(iter) {}
void operator()() const {
assert(iter_);
iter_->Reset();
}
private:
CoalescingIterator* iter_;
};
class PopulateFunc {
public:
explicit PopulateFunc(CoalescingIterator* iter) : iter_(iter) {}
void operator()(const autovector<MultiCfIteratorInfo>& items) const {
assert(iter_);
iter_->Coalesce(items);
}
private:
CoalescingIterator* iter_;
};
MultiCfIteratorImpl<ResetFunc, PopulateFunc> impl_;
Slice value_;
WideColumns wide_columns_;
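The lambdas previously passed to MultiCfIteratorImpl are replaced here by the named ResetFunc/PopulateFunc functors, and MultiCfIteratorImpl becomes a template over those callback types. A plausible reading (not stated in the change) is that this avoids type-erasing the callbacks, so the calls can be dispatched and inlined directly; a generic sketch of the pattern with made-up names:
#include <utility>
#include <vector>

template <typename ResetFunc, typename PopulateFunc>
class MultiIterSketch {  // hypothetical type, not RocksDB API
 public:
  MultiIterSketch(ResetFunc reset, PopulateFunc populate)
      : reset_(std::move(reset)), populate_(std::move(populate)) {}

  void AdvanceAndCollect(const std::vector<int>& items) {
    reset_();          // direct call on the functor, no std::function involved
    populate_(items);  // a candidate for inlining by the compiler
  }

 private:
  ResetFunc reset_;
  PopulateFunc populate_;
};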

View File

@ -466,7 +466,7 @@ void SuperVersion::Cleanup() {
// decrement reference to the immutable MemtableList
// this SV object was pointing to.
imm->Unref(&to_delete);
MemTable* m = mem->Unref();
ReadOnlyMemTable* m = mem->Unref();
if (m != nullptr) {
auto* memory_usage = current->cfd()->imm()->current_memory_usage();
assert(*memory_usage >= m->ApproximateMemoryUsage());
@ -595,8 +595,8 @@ ColumnFamilyData::ColumnFamilyData(
blob_file_cache_.reset(
new BlobFileCache(_table_cache, ioptions(), soptions(), id_,
internal_stats_->GetBlobFileReadHist(), io_tracer));
blob_source_.reset(new BlobSource(ioptions(), db_id, db_session_id,
blob_file_cache_.get()));
blob_source_.reset(new BlobSource(ioptions_, mutable_cf_options_, db_id,
db_session_id, blob_file_cache_.get()));
if (ioptions_.compaction_style == kCompactionStyleLevel) {
compaction_picker_.reset(
@ -693,9 +693,9 @@ ColumnFamilyData::~ColumnFamilyData() {
if (mem_ != nullptr) {
delete mem_->Unref();
}
autovector<MemTable*> to_delete;
autovector<ReadOnlyMemTable*> to_delete;
imm_.current()->Unref(&to_delete);
for (MemTable* m : to_delete) {
for (auto* m : to_delete) {
delete m;
}
@ -901,7 +901,11 @@ uint64_t GetPendingCompactionBytesForCompactionSpeedup(
return slowdown_threshold;
}
uint64_t size_threshold = bottommost_files_size / kBottommostSizeDivisor;
// Prevent a small CF from triggering parallel compactions for other CFs.
// Require compaction debt to be more than a full L0 to Lbase compaction.
const uint64_t kMinDebtSize = 2 * mutable_cf_options.max_bytes_for_level_base;
uint64_t size_threshold =
std::max(bottommost_files_size / kBottommostSizeDivisor, kMinDebtSize);
return std::min(size_threshold, slowdown_threshold);
}
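Illustrative arithmetic for the threshold computed above (the sizes are made up and the divisor constant is not shown in this hunk, so a placeholder is used):
#include <algorithm>
#include <cstdint>

const uint64_t kBottommostSizeDivisor = 8;                   // placeholder value
const uint64_t max_bytes_for_level_base = 256ull << 20;      // 256 MB (example)
const uint64_t kMinDebtSize = 2 * max_bytes_for_level_base;  // 512 MB
const uint64_t bottommost_files_size = 64ull << 20;          // 64 MB (example)
const uint64_t slowdown_threshold = 2ull << 30;              // 2 GB (example)
const uint64_t size_threshold =
    std::max(bottommost_files_size / kBottommostSizeDivisor, kMinDebtSize);
const uint64_t speedup_threshold = std::min(size_threshold, slowdown_threshold);
// speedup_threshold == 512 MB here, so a small CF with only 1 MB of
// compaction debt no longer triggers parallel compactions; the updated
// CompactionSpeedupForCompactionDebt test below asserts exactly this.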
@ -1172,10 +1176,12 @@ bool ColumnFamilyData::NeedsCompaction() const {
Compaction* ColumnFamilyData::PickCompaction(
const MutableCFOptions& mutable_options,
const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) {
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer) {
auto* result = compaction_picker_->PickCompaction(
GetName(), mutable_options, mutable_db_options, current_->storage_info(),
log_buffer);
GetName(), mutable_options, mutable_db_options, existing_snapshots,
snapshot_checker, current_->storage_info(), log_buffer);
if (result != nullptr) {
result->FinalizeInputInfo(current_);
}
@ -1201,8 +1207,10 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
read_opts.total_order_seek = true;
MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
merge_iter_builder.AddIterator(super_version->mem->NewIterator(
read_opts, /*seqno_to_time_mapping=*/nullptr, &arena));
read_opts, /*seqno_to_time_mapping=*/nullptr, &arena,
/*prefix_extractor=*/nullptr));
super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr,
/*prefix_extractor=*/nullptr,
&merge_iter_builder,
false /* add_range_tombstone_iter */);
ScopedArenaPtr<InternalIterator> memtable_iter(merge_iter_builder.Finish());
@ -1565,28 +1573,6 @@ Status ColumnFamilyData::SetOptions(
return s;
}
// REQUIRES: DB mutex held
Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
return Env::WLTH_NOT_SET;
}
if (level == 0) {
return Env::WLTH_MEDIUM;
}
int base_level = current_->storage_info()->base_level();
// L1: medium, L2: long, ...
if (level - base_level >= 2) {
return Env::WLTH_EXTREME;
} else if (level < base_level) {
// There is no restriction which prevents level passed in to be smaller
// than base_level.
return Env::WLTH_MEDIUM;
}
return static_cast<Env::WriteLifeTimeHint>(
level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
}
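The removed CalculateSSTWriteHint is replaced elsewhere in this change (see the CompactionJob::Prepare hunk, which now calls storage_info->CalculateSSTWriteHint). A sketch restating the removed body to make the level-to-hint mapping explicit, for level-style compaction only and assuming the usual Env::WriteLifeTimeHint ordering (WLTH_MEDIUM < WLTH_LONG < WLTH_EXTREME):
Env::WriteLifeTimeHint HintForLevel(int level, int base_level) {  // hypothetical helper
  if (level == 0 || level < base_level) {
    return Env::WLTH_MEDIUM;
  }
  if (level - base_level >= 2) {
    return Env::WLTH_EXTREME;
  }
  // base_level -> WLTH_MEDIUM, base_level + 1 -> WLTH_LONG
  return static_cast<Env::WriteLifeTimeHint>(
      level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
}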
Status ColumnFamilyData::AddDirectories(
std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs) {
Status s;
@ -1652,6 +1638,9 @@ bool ColumnFamilyData::ShouldPostponeFlushToRetainUDT(
}
for (const Slice& table_newest_udt :
imm()->GetTablesNewestUDT(max_memtable_id)) {
if (table_newest_udt.empty()) {
continue;
}
assert(table_newest_udt.size() == full_history_ts_low.size());
// Checking the newest UDT contained in MemTable with ascending ID up to
// `max_memtable_id`. Return immediately on finding the first MemTable that

View File

@ -16,6 +16,7 @@
#include "cache/cache_reservation_manager.h"
#include "db/memtable_list.h"
#include "db/snapshot_checker.h"
#include "db/table_cache.h"
#include "db/table_properties_collector.h"
#include "db/write_batch_internal.h"
@ -206,7 +207,7 @@ struct SuperVersion {
// Accessing members of this class is not thread-safe and requires external
// synchronization (ie db mutex held or on write thread).
ColumnFamilyData* cfd;
MemTable* mem;
ReadOnlyMemTable* mem;
MemTableListVersion* imm;
Version* current;
MutableCFOptions mutable_cf_options;
@ -268,7 +269,7 @@ struct SuperVersion {
// We need to_delete because during Cleanup(), imm->Unref() returns
// all memtables that we need to free through this vector. We then
// delete all those memtables outside of mutex, during destruction
autovector<MemTable*> to_delete;
autovector<ReadOnlyMemTable*> to_delete;
};
Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
@ -385,9 +386,9 @@ class ColumnFamilyData {
uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held
uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held
uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held
// REQUIRE: DB mutex held
void SetMemtable(MemTable* new_mem) {
uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
new_mem->SetID(memtable_id);
new_mem->SetID(++last_memtable_id_);
mem_ = new_mem;
}
@ -401,15 +402,18 @@ class ColumnFamilyData {
SequenceNumber earliest_seq);
TableCache* table_cache() const { return table_cache_.get(); }
BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); }
BlobSource* blob_source() const { return blob_source_.get(); }
// See documentation in compaction_picker.h
// REQUIRES: DB mutex held
bool NeedsCompaction() const;
// REQUIRES: DB mutex held
Compaction* PickCompaction(const MutableCFOptions& mutable_options,
const MutableDBOptions& mutable_db_options,
LogBuffer* log_buffer);
Compaction* PickCompaction(
const MutableCFOptions& mutable_options,
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer);
// Check if the passed range overlap with any running compactions.
// REQUIRES: DB mutex held
@ -511,8 +515,6 @@ class ColumnFamilyData {
return initial_cf_options_;
}
Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
// created_dirs remembers directory created, so that we don't need to call
// the same data creation operation again.
Status AddDirectories(
@ -671,7 +673,7 @@ class ColumnFamilyData {
bool allow_2pc_;
// Memtable id to track flush.
std::atomic<uint64_t> last_memtable_id_;
uint64_t last_memtable_id_;
// Directories corresponding to cf_paths.
std::vector<std::shared_ptr<FSDirectory>> data_dirs_;

View File

@ -3012,19 +3012,25 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForCompactionDebt) {
ASSERT_OK(db_->Flush(FlushOptions()));
{
// 1MB debt is way bigger than bottommost data so definitely triggers
// speedup.
VersionStorageInfo* vstorage = cfd->current()->storage_info();
vstorage->TEST_set_estimated_compaction_needed_bytes(1048576 /* 1MB */,
dbmu);
RecalculateWriteStallConditions(cfd, mutable_cf_options);
ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
// Eight bytes is way smaller than bottommost data so definitely does not
// trigger speedup.
vstorage->TEST_set_estimated_compaction_needed_bytes(8, dbmu);
RecalculateWriteStallConditions(cfd, mutable_cf_options);
ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
// 1MB is much larger than bottommost level size. However, since it's too
// small in terms of absolute size, it does not trigger parallel compaction
// in this case (see GetPendingCompactionBytesForCompactionSpeedup()).
vstorage->TEST_set_estimated_compaction_needed_bytes(1048576 /* 1MB */,
dbmu);
RecalculateWriteStallConditions(cfd, mutable_cf_options);
ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
vstorage->TEST_set_estimated_compaction_needed_bytes(
2 * mutable_cf_options.max_bytes_for_level_base, dbmu);
RecalculateWriteStallConditions(cfd, mutable_cf_options);
ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
}
}
@ -3067,12 +3073,20 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
WaitForCompaction();
AssertFilesPerLevel("0,1", 0 /* cf */);
// Calculate the limit from the env's number of LOW-priority background
// threads: the current test case shares the same env with other cases that
// may have already increased the number of background threads beyond
// kParallelismLimit.
const auto limit = env_->GetBackgroundThreads(Env::Priority::LOW);
// Block the compaction thread pool so marked files accumulate in L0.
test::SleepingBackgroundTask sleeping_tasks[kParallelismLimit];
for (int i = 0; i < kParallelismLimit; i++) {
std::vector<std::shared_ptr<test::SleepingBackgroundTask>> sleeping_tasks;
for (int i = 0; i < limit; i++) {
sleeping_tasks.emplace_back(
std::make_shared<test::SleepingBackgroundTask>());
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
&sleeping_tasks[i], Env::Priority::LOW);
sleeping_tasks[i].WaitUntilSleeping();
sleeping_tasks[i].get(), Env::Priority::LOW);
sleeping_tasks[i]->WaitUntilSleeping();
}
// Zero marked upper-level files. No speedup.
@ -3091,9 +3105,9 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
ASSERT_EQ(kParallelismLimit, dbfull()->TEST_BGCompactionsAllowed());
AssertFilesPerLevel("2,1", 0 /* cf */);
for (int i = 0; i < kParallelismLimit; i++) {
sleeping_tasks[i].WakeUp();
sleeping_tasks[i].WaitUntilDone();
for (int i = 0; i < limit; i++) {
sleeping_tasks[i]->WakeUp();
sleeping_tasks[i]->WaitUntilDone();
}
}
@ -3862,6 +3876,91 @@ TEST_F(ManualFlushSkipRetainUDTTest, ManualFlush) {
Close();
}
TEST_F(ManualFlushSkipRetainUDTTest, FlushRemovesStaleEntries) {
column_family_options_.max_write_buffer_number = 4;
Open();
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(0)));
ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
ColumnFamilyData* cfd =
static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
for (int version = 0; version < 100; version++) {
if (version == 50) {
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable(cfd));
}
ASSERT_OK(
Put(0, "foo", EncodeAsUint64(version), "v" + std::to_string(version)));
}
ASSERT_OK(Flush(0));
TablePropertiesCollection tables_properties;
ASSERT_OK(db_->GetPropertiesOfAllTables(&tables_properties));
ASSERT_EQ(1, tables_properties.size());
std::shared_ptr<const TableProperties> table_properties =
tables_properties.begin()->second;
ASSERT_EQ(1, table_properties->num_entries);
ASSERT_EQ(0, table_properties->num_deletions);
ASSERT_EQ(0, table_properties->num_range_deletions);
CheckEffectiveCutoffTime(100);
CheckAutomaticFlushRetainUDT(101);
Close();
}
TEST_F(ManualFlushSkipRetainUDTTest, RangeDeletionFlushRemovesStaleEntries) {
column_family_options_.max_write_buffer_number = 4;
Open();
// TODO(yuzhangyu): a non-zero full history ts low is needed for this garbage
// collection to kick in. This doesn't work well for the very first flush of
// the column family. Not a big issue, but it would be nice to improve this.
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(9)));
for (int i = 10; i < 100; i++) {
ASSERT_OK(Put(0, "foo" + std::to_string(i), EncodeAsUint64(i),
"val" + std::to_string(i)));
if (i % 2 == 1) {
ASSERT_OK(db_->DeleteRange(WriteOptions(), "foo" + std::to_string(i - 1),
"foo" + std::to_string(i), EncodeAsUint64(i)));
}
}
ASSERT_OK(Flush(0));
CheckEffectiveCutoffTime(100);
std::string read_ts = EncodeAsUint64(100);
std::string min_ts = EncodeAsUint64(0);
ReadOptions ropts;
Slice read_ts_slice = read_ts;
std::string value;
ropts.timestamp = &read_ts_slice;
{
Iterator* iter = db_->NewIterator(ropts);
iter->SeekToFirst();
int i = 11;
while (iter->Valid()) {
ASSERT_TRUE(iter->Valid());
ASSERT_EQ("foo" + std::to_string(i), iter->key());
ASSERT_EQ("val" + std::to_string(i), iter->value());
ASSERT_EQ(min_ts, iter->timestamp());
iter->Next();
i += 2;
}
ASSERT_OK(iter->status());
delete iter;
}
TablePropertiesCollection tables_properties;
ASSERT_OK(db_->GetPropertiesOfAllTables(&tables_properties));
ASSERT_EQ(1, tables_properties.size());
std::shared_ptr<const TableProperties> table_properties =
tables_properties.begin()->second;
// 45 point data + 45 range deletions. 45 obsolete point data are garbage
// collected.
ASSERT_EQ(90, table_properties->num_entries);
ASSERT_EQ(45, table_properties->num_deletions);
ASSERT_EQ(45, table_properties->num_range_deletions);
Close();
}
TEST_F(ManualFlushSkipRetainUDTTest, ManualCompaction) {
Open();
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(0)));

View File

@ -283,9 +283,10 @@ Compaction::Compaction(
uint32_t _output_path_id, CompressionType _compression,
CompressionOptions _compression_opts, Temperature _output_temperature,
uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
bool _manual_compaction, const std::string& _trim_ts, double _score,
bool _deletion_compaction, bool l0_files_might_overlap,
CompactionReason _compaction_reason,
std::optional<SequenceNumber> _earliest_snapshot,
const SnapshotChecker* _snapshot_checker, bool _manual_compaction,
const std::string& _trim_ts, double _score, bool _deletion_compaction,
bool l0_files_might_overlap, CompactionReason _compaction_reason,
BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
double _blob_garbage_collection_age_cutoff)
: input_vstorage_(vstorage),
@ -307,6 +308,8 @@ Compaction::Compaction(
l0_files_might_overlap_(l0_files_might_overlap),
inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
grandparents_(std::move(_grandparents)),
earliest_snapshot_(_earliest_snapshot),
snapshot_checker_(_snapshot_checker),
score_(_score),
bottommost_level_(
// For simplicity, we don't support the concept of "bottommost level"
@ -342,8 +345,9 @@ Compaction::Compaction(
_compaction_reason == CompactionReason::kExternalSstIngestion ||
_compaction_reason == CompactionReason::kRefitLevel
? Compaction::kInvalidLevel
: EvaluatePenultimateLevel(vstorage, immutable_options_,
start_level_, output_level_)) {
: EvaluatePenultimateLevel(vstorage, mutable_cf_options_,
immutable_options_, start_level_,
output_level_)) {
MarkFilesBeingCompacted(true);
if (is_manual_compaction_) {
compaction_reason_ = CompactionReason::kManualCompaction;
@ -367,9 +371,13 @@ Compaction::Compaction(
// setup input_levels_
{
input_levels_.resize(num_input_levels());
for (size_t which = 0; which < num_input_levels(); which++) {
DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
&arena_);
if (earliest_snapshot_.has_value()) {
FilterInputsForCompactionIterator();
} else {
for (size_t which = 0; which < num_input_levels(); which++) {
DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
&arena_);
}
}
}
@ -686,12 +694,11 @@ bool Compaction::KeyRangeNotExistsBeyondOutputLevel(
};
// Mark (or clear) each file that is being compacted
void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
void Compaction::MarkFilesBeingCompacted(bool being_compacted) const {
for (size_t i = 0; i < num_input_levels(); i++) {
for (size_t j = 0; j < inputs_[i].size(); j++) {
assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
: inputs_[i][j]->being_compacted);
inputs_[i][j]->being_compacted = mark_as_compacted;
assert(being_compacted != inputs_[i][j]->being_compacted);
inputs_[i][j]->being_compacted = being_compacted;
}
}
}
@ -735,7 +742,7 @@ uint64_t Compaction::CalculateTotalInputSize() const {
return size;
}
void Compaction::ReleaseCompactionFiles(Status status) {
void Compaction::ReleaseCompactionFiles(const Status& status) {
MarkFilesBeingCompacted(false);
cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
}
@ -746,8 +753,10 @@ void Compaction::ResetNextCompactionIndex() {
}
namespace {
int InputSummary(const std::vector<FileMetaData*>& files, char* output,
int InputSummary(const std::vector<FileMetaData*>& files,
const std::vector<bool>& files_filtered, char* output,
int len) {
assert(files_filtered.empty() || (files.size() == files_filtered.size()));
*output = '\0';
int write = 0;
for (size_t i = 0; i < files.size(); i++) {
@ -755,8 +764,14 @@ int InputSummary(const std::vector<FileMetaData*>& files, char* output,
int ret;
char sztxt[16];
AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
files.at(i)->fd.GetNumber(), sztxt);
if (files_filtered.empty()) {
ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
files.at(i)->fd.GetNumber(), sztxt);
} else {
ret = snprintf(output + write, sz, "%" PRIu64 "(%s filtered:%s) ",
files.at(i)->fd.GetNumber(), sztxt,
files_filtered.at(i) ? "true" : "false");
}
if (ret < 0 || ret >= sz) {
break;
}
@ -782,8 +797,15 @@ void Compaction::Summary(char* output, int len) {
return;
}
}
write +=
InputSummary(inputs_[level_iter].files, output + write, len - write);
assert(non_start_level_input_files_filtered_.empty() ||
non_start_level_input_files_filtered_.size() == inputs_.size() - 1);
write += InputSummary(
inputs_[level_iter].files,
(level_iter == 0 || non_start_level_input_files_filtered_.empty())
? std::vector<bool>{}
: non_start_level_input_files_filtered_[level_iter - 1],
output + write, len - write);
if (write < 0 || write >= len) {
return;
}
@ -866,7 +888,7 @@ bool Compaction::ShouldFormSubcompactions() const {
return false;
}
if (cfd_->ioptions()->table_factory->Name() ==
if (mutable_cf_options_.table_factory->Name() ==
TableFactory::kPlainTableName()) {
return false;
}
@ -914,6 +936,25 @@ bool Compaction::DoesInputReferenceBlobFiles() const {
return false;
}
uint64_t Compaction::MaxInputFileNewestKeyTime(const InternalKey* start,
const InternalKey* end) const {
uint64_t newest_key_time = kUnknownNewestKeyTime;
const InternalKeyComparator& icmp =
column_family_data()->internal_comparator();
for (const auto& level_files : inputs_) {
for (const auto& file : level_files.files) {
if (start != nullptr && icmp.Compare(file->largest, *start) < 0) {
continue;
}
if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) {
continue;
}
newest_key_time = std::max(newest_key_time, file->TryGetNewestKeyTime());
}
}
return newest_key_time;
}
uint64_t Compaction::MinInputFileOldestAncesterTime(
const InternalKey* start, const InternalKey* end) const {
uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max();
@ -949,6 +990,7 @@ uint64_t Compaction::MinInputFileEpochNumber() const {
int Compaction::EvaluatePenultimateLevel(
const VersionStorageInfo* vstorage,
const MutableCFOptions& mutable_cf_options,
const ImmutableOptions& immutable_options, const int start_level,
const int output_level) {
// TODO: currently per_key_placement feature only support level and universal
@ -980,7 +1022,7 @@ int Compaction::EvaluatePenultimateLevel(
}
bool supports_per_key_placement =
immutable_options.preclude_last_level_data_seconds > 0;
mutable_cf_options.preclude_last_level_data_seconds > 0;
// it could be overridden by unittest
TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
@ -992,4 +1034,69 @@ int Compaction::EvaluatePenultimateLevel(
return penultimate_level;
}
void Compaction::FilterInputsForCompactionIterator() {
assert(earliest_snapshot_.has_value());
// cfd_ is not populated at Compaction construction time, so get the user
// comparator from VersionStorageInfo instead.
assert(input_vstorage_);
const auto* ucmp = input_vstorage_->user_comparator();
assert(ucmp);
// When user-defined timestamps are enabled, simply comparing file boundaries
// is not as safe: the timestamps would also need to be compared to know for
// sure, even though for the same user key (without timestamp), entries with a
// higher timestamp are also expected to have a higher sequence number.
assert(ucmp->timestamp_size() == 0);
size_t num_input_levels = inputs_.size();
// TODO(yuzhangyu): filtering of older L0 files by newer L0 files is not
// supported yet.
FileMetaData* rangedel_candidate = inputs_[0].level == 0
? inputs_[0].files.back()
: inputs_[0].files.front();
assert(rangedel_candidate);
if (!rangedel_candidate->FileIsStandAloneRangeTombstone() ||
!DataIsDefinitelyInSnapshot(rangedel_candidate->fd.smallest_seqno,
earliest_snapshot_.value(),
snapshot_checker_)) {
for (size_t level = 0; level < num_input_levels; level++) {
DoGenerateLevelFilesBrief(&input_levels_[level], inputs_[level].files,
&arena_);
}
return;
}
Slice rangedel_start_ukey = rangedel_candidate->smallest.user_key();
Slice rangedel_end_ukey = rangedel_candidate->largest.user_key();
SequenceNumber rangedel_seqno = rangedel_candidate->fd.smallest_seqno;
std::vector<std::vector<FileMetaData*>> non_start_level_input_files;
non_start_level_input_files.reserve(num_input_levels - 1);
non_start_level_input_files_filtered_.reserve(num_input_levels - 1);
for (size_t level = 1; level < num_input_levels; level++) {
non_start_level_input_files.emplace_back();
non_start_level_input_files_filtered_.emplace_back();
for (FileMetaData* file : inputs_[level].files) {
non_start_level_input_files_filtered_.back().push_back(false);
// When range data and point data have the same sequence number, point
// data wins. The range deletion end key is exclusive, so it must compare
// strictly greater than the file's right boundary user key.
if (rangedel_seqno > file->fd.largest_seqno &&
ucmp->CompareWithoutTimestamp(rangedel_start_ukey,
file->smallest.user_key()) <= 0 &&
ucmp->CompareWithoutTimestamp(rangedel_end_ukey,
file->largest.user_key()) > 0) {
non_start_level_input_files_filtered_.back().back() = true;
} else {
non_start_level_input_files.back().push_back(file);
}
}
}
DoGenerateLevelFilesBrief(&input_levels_[0], inputs_[0].files, &arena_);
assert(non_start_level_input_files.size() == num_input_levels - 1);
for (size_t level = 1; level < num_input_levels; level++) {
DoGenerateLevelFilesBrief(&input_levels_[level],
non_start_level_input_files[level - 1], &arena_);
}
}
} // namespace ROCKSDB_NAMESPACE
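A worked example of the filtering condition implemented above (values illustrative only): suppose the start level contains a standalone range tombstone file DeleteRange["b", "k") with sequence number 100 that is visible to the earliest snapshot. Then
  L2 file A, keys ["c".."f"], largest_seqno 40  -> filtered (100 > 40, "b" <= "c", "k" > "f")
  L2 file B, keys ["c".."k"], largest_seqno 40  -> kept (end key "k" is exclusive, so it does not cover "k")
  L2 file C, keys ["c".."f"], largest_seqno 150 -> kept (the file holds newer data than the tombstone)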

View File

@ -8,6 +8,8 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "db/snapshot_checker.h"
#include "db/version_set.h"
#include "memory/arena.h"
#include "options/cf_options.h"
@ -90,6 +92,8 @@ class Compaction {
CompressionOptions compression_opts,
Temperature output_temperature, uint32_t max_subcompactions,
std::vector<FileMetaData*> grandparents,
std::optional<SequenceNumber> earliest_snapshot,
const SnapshotChecker* snapshot_checker,
bool manual_compaction = false, const std::string& trim_ts = "",
double score = -1, bool deletion_compaction = false,
bool l0_files_might_overlap = true,
@ -230,7 +234,7 @@ class Compaction {
// Delete this compaction from the list of running compactions.
//
// Requirement: DB mutex held
void ReleaseCompactionFiles(Status status);
void ReleaseCompactionFiles(const Status& status);
// Returns the summary of the compaction in "output" with maximum "len"
// in bytes. The caller is responsible for the memory management of
@ -401,6 +405,12 @@ class Compaction {
return blob_garbage_collection_age_cutoff_;
}
// start and end are sub compact range. Null if no boundary.
// This is used to calculate the newest_key_time table property after
// compaction.
uint64_t MaxInputFileNewestKeyTime(const InternalKey* start,
const InternalKey* end) const;
// start and end are sub compact range. Null if no boundary.
// This is used to filter out some input files' ancester's time range.
uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
@ -430,18 +440,19 @@ class Compaction {
// penultimate level. The safe key range is populated by
// `PopulatePenultimateLevelOutputRange()`.
// Which could potentially disable all penultimate level output.
static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage,
const ImmutableOptions& immutable_options,
const int start_level,
const int output_level);
static int EvaluatePenultimateLevel(
const VersionStorageInfo* vstorage,
const MutableCFOptions& mutable_cf_options,
const ImmutableOptions& immutable_options, const int start_level,
const int output_level);
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool being_compacted) const;
private:
Status InitInputTableProperties();
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool mark_as_compacted);
// get the smallest and largest key present in files to be compacted
static void GetBoundaryKeys(VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs,
@ -460,6 +471,13 @@ class Compaction {
// `Compaction::WithinPenultimateLevelOutputRange()`.
void PopulatePenultimateLevelOutputRange();
// If the oldest snapshot is specified at Compaction construction time, we
// have an opportunity to optimize the inputs for the compaction iterator:
// when a standalone range deletion file on the start level is recognized and
// determined to completely shadow some input files on non-start levels, those
// files are filtered out and are not fed to the compaction iterator.
void FilterInputsForCompactionIterator();
// Get the atomic file boundaries for all files in the compaction. Necessary
// in order to avoid the scenario described in
// https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
@ -510,12 +528,27 @@ class Compaction {
// Compaction input files organized by level. Constant after construction
const std::vector<CompactionInputFiles> inputs_;
// A copy of inputs_, organized more closely in memory
// All files from inputs_ that are not filtered and will be fed to compaction
// iterator, organized more closely in memory.
autovector<LevelFilesBrief, 2> input_levels_;
// State used to check for number of overlapping grandparent files
// (grandparent == "output_level_ + 1")
std::vector<FileMetaData*> grandparents_;
// The earliest snapshot and snapshot checker at compaction picking time.
// These fields are only set for deletion-triggered compactions picked by
// universal compaction, and only when user-defined timestamps are not
// enabled. They may be used to filter out some non-start-level input files.
std::optional<SequenceNumber> earliest_snapshot_;
const SnapshotChecker* snapshot_checker_;
// Markers for which non-start-level input files are filtered out, if
// applicable. Only applicable if earliest_snapshot_ is provided and the
// start level input has a standalone range deletion file.
std::vector<std::vector<bool>> non_start_level_input_files_filtered_;
// bool standalone_range_tombstones_used_for_filtering_inputs_;
const double score_; // score that was used to pick this compaction.
// Is this compaction creating a file in the bottom most level?

View File

@ -540,18 +540,12 @@ class CompactionIterator {
inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq,
SequenceNumber snapshot) {
return ((seq) <= (snapshot) &&
(snapshot_checker_ == nullptr ||
LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
SnapshotCheckerResult::kInSnapshot)));
return DataIsDefinitelyInSnapshot(seq, snapshot, snapshot_checker_);
}
inline bool CompactionIterator::DefinitelyNotInSnapshot(
SequenceNumber seq, SequenceNumber snapshot) {
return ((seq) > (snapshot) ||
(snapshot_checker_ != nullptr &&
UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
SnapshotCheckerResult::kNotInSnapshot)));
return DataIsDefinitelyNotInSnapshot(seq, snapshot, snapshot_checker_);
}
} // namespace ROCKSDB_NAMESPACE
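The inline bodies removed above are folded into shared helpers; based on those removed bodies, their presumed shape is as follows (the actual definitions live elsewhere in this change):
inline bool DataIsDefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot,
                                       const SnapshotChecker* snapshot_checker) {
  return seq <= snapshot &&
         (snapshot_checker == nullptr ||
          LIKELY(snapshot_checker->CheckInSnapshot(seq, snapshot) ==
                 SnapshotCheckerResult::kInSnapshot));
}

inline bool DataIsDefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot,
                                          const SnapshotChecker* snapshot_checker) {
  return seq > snapshot ||
         (snapshot_checker != nullptr &&
          UNLIKELY(snapshot_checker->CheckInSnapshot(seq, snapshot) ==
                   SnapshotCheckerResult::kNotInSnapshot));
}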

View File

@ -251,12 +251,13 @@ void CompactionJob::Prepare() {
// Generate file_levels_ for compaction before making Iterator
auto* c = compact_->compaction;
ColumnFamilyData* cfd = c->column_family_data();
[[maybe_unused]] ColumnFamilyData* cfd = c->column_family_data();
assert(cfd != nullptr);
assert(cfd->current()->storage_info()->NumLevelFiles(
compact_->compaction->level()) > 0);
const VersionStorageInfo* storage_info = c->input_version()->storage_info();
assert(storage_info);
assert(storage_info->NumLevelFiles(compact_->compaction->level()) > 0);
write_hint_ = cfd->CalculateSSTWriteHint(c->output_level());
write_hint_ = storage_info->CalculateSSTWriteHint(c->output_level());
bottommost_level_ = c->bottommost_level();
if (c->ShouldFormSubcompactions()) {
@ -287,8 +288,8 @@ void CompactionJob::Prepare() {
// to encode seqno->time to the output files.
uint64_t preserve_time_duration =
std::max(c->immutable_options()->preserve_internal_time_seconds,
c->immutable_options()->preclude_last_level_data_seconds);
std::max(c->mutable_cf_options()->preserve_internal_time_seconds,
c->mutable_cf_options()->preclude_last_level_data_seconds);
if (preserve_time_duration > 0) {
const ReadOptions read_options(Env::IOActivity::kCompaction);
@ -297,8 +298,8 @@ void CompactionJob::Prepare() {
for (const auto& each_level : *c->inputs()) {
for (const auto& fmd : each_level.files) {
std::shared_ptr<const TableProperties> tp;
Status s =
cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr);
Status s = c->input_version()->GetTableProperties(read_options, &tp,
fmd, nullptr);
if (s.ok()) {
s = seqno_to_time_mapping_.DecodeFrom(tp->seqno_to_time_mapping);
}
@ -325,8 +326,8 @@ void CompactionJob::Prepare() {
seqno_to_time_mapping_.Enforce(_current_time);
seqno_to_time_mapping_.GetCurrentTieringCutoffSeqnos(
static_cast<uint64_t>(_current_time),
c->immutable_options()->preserve_internal_time_seconds,
c->immutable_options()->preclude_last_level_data_seconds,
c->mutable_cf_options()->preserve_internal_time_seconds,
c->mutable_cf_options()->preclude_last_level_data_seconds,
&preserve_time_min_seqno_, &preclude_last_level_min_seqno_);
}
// For accuracy of the GetProximalSeqnoBeforeTime queries above, we only
@ -468,7 +469,7 @@ void CompactionJob::GenSubcompactionBoundaries() {
ReadOptions read_options(Env::IOActivity::kCompaction);
read_options.rate_limiter_priority = GetRateLimiterPriority();
auto* c = compact_->compaction;
if (c->immutable_options()->table_factory->Name() ==
if (c->mutable_cf_options()->table_factory->Name() ==
TableFactory::kPlainTableName()) {
return;
}
@ -505,9 +506,7 @@ void CompactionJob::GenSubcompactionBoundaries() {
FileMetaData* f = flevel->files[i].file_metadata;
std::vector<TableReader::Anchor> my_anchors;
Status s = cfd->table_cache()->ApproximateKeyAnchors(
read_options, icomp, *f,
c->mutable_cf_options()->block_protection_bytes_per_key,
my_anchors);
read_options, icomp, *f, *c->mutable_cf_options(), my_anchors);
if (!s.ok() || my_anchors.empty()) {
my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
}
@ -710,8 +709,6 @@ Status CompactionJob::Run() {
}
}
ColumnFamilyData* cfd = compact_->compaction->column_family_data();
auto& prefix_extractor =
compact_->compaction->mutable_cf_options()->prefix_extractor;
std::atomic<size_t> next_file_idx(0);
auto verify_table = [&](Status& output_status) {
while (true) {
@ -732,7 +729,8 @@ Status CompactionJob::Run() {
InternalIterator* iter = cfd->table_cache()->NewIterator(
verify_table_read_options, file_options_,
cfd->internal_comparator(), files_output[file_idx]->meta,
/*range_del_agg=*/nullptr, prefix_extractor,
/*range_del_agg=*/nullptr,
*compact_->compaction->mutable_cf_options(),
/*table_reader_ptr=*/nullptr,
cfd->internal_stats()->GetFileReadHist(
compact_->compaction->output_level()),
@ -742,9 +740,7 @@ Status CompactionJob::Run() {
*compact_->compaction->mutable_cf_options()),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
/*allow_unprepared_value=*/false,
compact_->compaction->mutable_cf_options()
->block_protection_bytes_per_key);
/*allow_unprepared_value=*/false);
auto s = iter->status();
if (s.ok() && paranoid_file_checks_) {
@ -805,6 +801,12 @@ Status CompactionJob::Run() {
}
}
// Before the compaction starts, is_remote_compaction was set to true if
// compaction_service is set. We now know whether each sub_compaction was
// done remotely or not. Reset is_remote_compaction back to false and allow
// AggregateCompactionStats() to set the right value.
compaction_job_stats_->is_remote_compaction = false;
// Finish up all bookkeeping to unify the subcompaction results.
compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
uint64_t num_input_range_del = 0;
@ -1083,6 +1085,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
}
// fallback to local compaction
assert(comp_status == CompactionServiceJobStatus::kUseLocal);
sub_compact->compaction_job_stats.is_remote_compaction = false;
}
uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
@ -1911,6 +1914,10 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
oldest_ancester_time = current_time;
}
uint64_t newest_key_time = sub_compact->compaction->MaxInputFileNewestKeyTime(
sub_compact->start.has_value() ? &tmp_start : nullptr,
sub_compact->end.has_value() ? &tmp_end : nullptr);
// Initialize a SubcompactionState::Output and add it to sub_compact->outputs
uint64_t epoch_number = sub_compact->compaction->MinInputFileEpochNumber();
{
@ -1960,7 +1967,7 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
cfd->internal_tbl_prop_coll_factories(),
sub_compact->compaction->output_compression(),
sub_compact->compaction->output_compression_opts(), cfd->GetID(),
cfd->GetName(), sub_compact->compaction->output_level(),
cfd->GetName(), sub_compact->compaction->output_level(), newest_key_time,
bottommost_level_, TableFileCreationReason::kCompaction,
0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
sub_compact->compaction->max_output_file_size(), file_number,
@ -2000,10 +2007,12 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
bool has_error = false;
const ReadOptions read_options(Env::IOActivity::kCompaction);
const auto& input_table_properties = compaction->GetInputTableProperties();
// TODO(yuzhangyu): add dedicated stats for filtered files.
for (int input_level = 0;
input_level < static_cast<int>(compaction->num_input_levels());
++input_level) {
size_t num_input_files = compaction->num_input_files(input_level);
const LevelFilesBrief* flevel = compaction->input_levels(input_level);
size_t num_input_files = flevel->num_files;
uint64_t* bytes_read;
if (compaction->level(input_level) != compaction->output_level()) {
compaction_stats_.stats.num_input_files_in_non_output_levels +=
@ -2015,7 +2024,7 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
bytes_read = &compaction_stats_.stats.bytes_read_output_level;
}
for (size_t i = 0; i < num_input_files; ++i) {
const FileMetaData* file_meta = compaction->input(input_level, i);
const FileMetaData* file_meta = flevel->files[i].file_metadata;
*bytes_read += file_meta->fd.GetFileSize();
uint64_t file_input_entries = file_meta->num_entries;
uint64_t file_num_range_del = file_meta->num_range_deletions;

View File

@ -209,6 +209,8 @@ class CompactionJob {
// Returns true iff compaction_stats_.stats.num_input_records and
// num_input_range_del are calculated successfully.
bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr);
virtual void UpdateCompactionJobStats(
const InternalStats::CompactionStats& stats) const;
void LogCompaction();
virtual void RecordCompactionIOStats();
void CleanupCompaction();
@ -279,8 +281,7 @@ class CompactionJob {
bool* compaction_released);
Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
CompactionOutputs& outputs);
void UpdateCompactionJobStats(
const InternalStats::CompactionStats& stats) const;
void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
CompactionJobStats* compaction_job_stats = nullptr);
@ -377,9 +378,7 @@ class CompactionJob {
// doesn't contain the LSM tree information, which is passed though MANIFEST
// file.
struct CompactionServiceInput {
ColumnFamilyDescriptor column_family;
DBOptions db_options;
std::string cf_name;
std::vector<SequenceNumber> snapshots;
@ -387,7 +386,7 @@ struct CompactionServiceInput {
// files needed for this compaction, for both input level files and output
// level files.
std::vector<std::string> input_files;
int output_level;
int output_level = 0;
// db_id is used to generate unique id of sst on the remote compactor
std::string db_id;
@ -398,13 +397,12 @@ struct CompactionServiceInput {
bool has_end = false;
std::string end;
uint64_t options_file_number = 0;
// serialization interface to read and write the object
static Status Read(const std::string& data_str, CompactionServiceInput* obj);
Status Write(std::string* output);
// Initialize a dummy ColumnFamilyDescriptor
CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {}
#ifndef NDEBUG
bool TEST_Equals(CompactionServiceInput* other);
bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch);
@ -418,20 +416,25 @@ struct CompactionServiceOutputFile {
SequenceNumber largest_seqno;
std::string smallest_internal_key;
std::string largest_internal_key;
uint64_t oldest_ancester_time;
uint64_t file_creation_time;
uint64_t epoch_number;
uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
uint64_t file_creation_time = kUnknownFileCreationTime;
uint64_t epoch_number = kUnknownEpochNumber;
std::string file_checksum = kUnknownFileChecksum;
std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
uint64_t paranoid_hash;
bool marked_for_compaction;
UniqueId64x2 unique_id;
UniqueId64x2 unique_id{};
TableProperties table_properties;
CompactionServiceOutputFile() = default;
CompactionServiceOutputFile(
const std::string& name, SequenceNumber smallest, SequenceNumber largest,
std::string _smallest_internal_key, std::string _largest_internal_key,
uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
uint64_t _epoch_number, uint64_t _paranoid_hash,
bool _marked_for_compaction, UniqueId64x2 _unique_id)
uint64_t _epoch_number, const std::string& _file_checksum,
const std::string& _file_checksum_func_name, uint64_t _paranoid_hash,
bool _marked_for_compaction, UniqueId64x2 _unique_id,
const TableProperties& _table_properties)
: file_name(name),
smallest_seqno(smallest),
largest_seqno(largest),
@ -440,9 +443,12 @@ struct CompactionServiceOutputFile {
oldest_ancester_time(_oldest_ancester_time),
file_creation_time(_file_creation_time),
epoch_number(_epoch_number),
file_checksum(_file_checksum),
file_checksum_func_name(_file_checksum_func_name),
paranoid_hash(_paranoid_hash),
marked_for_compaction(_marked_for_compaction),
unique_id(std::move(_unique_id)) {}
unique_id(std::move(_unique_id)),
table_properties(_table_properties) {}
};
// CompactionServiceResult contains the compaction result from a different db
@ -451,14 +457,11 @@ struct CompactionServiceOutputFile {
struct CompactionServiceResult {
Status status;
std::vector<CompactionServiceOutputFile> output_files;
int output_level;
int output_level = 0;
// location of the output files
std::string output_path;
// some statistics about the compaction
uint64_t num_output_records = 0;
uint64_t total_bytes = 0;
uint64_t bytes_read = 0;
uint64_t bytes_written = 0;
CompactionJobStats stats;
@ -504,6 +507,9 @@ class CompactionServiceCompactionJob : private CompactionJob {
protected:
void RecordCompactionIOStats() override;
void UpdateCompactionJobStats(
const InternalStats::CompactionStats& stats) const override;
private:
// Get table file name in output_path
std::string GetTableFileName(uint64_t file_number) override;

View File

@ -50,7 +50,8 @@ void VerifyInitializationOfCompactionJobStats(
ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
ASSERT_EQ(compaction_job_stats.is_manual_compaction, true);
ASSERT_TRUE(compaction_job_stats.is_manual_compaction);
ASSERT_FALSE(compaction_job_stats.is_remote_compaction);
ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
@ -249,6 +250,7 @@ class CompactionJobTestBase : public testing::Test {
} else {
assert(false);
}
mutable_cf_options_.table_factory = cf_options_.table_factory;
}
std::string GenerateFileName(uint64_t file_number) {
@ -299,13 +301,13 @@ class CompactionJobTestBase : public testing::Test {
const WriteOptions write_options;
std::unique_ptr<TableBuilder> table_builder(
cf_options_.table_factory->NewTableBuilder(
TableBuilderOptions(*cfd_->ioptions(), mutable_cf_options_,
read_options, write_options,
cfd_->internal_comparator(),
cfd_->internal_tbl_prop_coll_factories(),
CompressionType::kNoCompression,
CompressionOptions(), 0 /* column_family_id */,
kDefaultColumnFamilyName, -1 /* level */),
TableBuilderOptions(
*cfd_->ioptions(), mutable_cf_options_, read_options,
write_options, cfd_->internal_comparator(),
cfd_->internal_tbl_prop_coll_factories(),
CompressionType::kNoCompression, CompressionOptions(),
0 /* column_family_id */, kDefaultColumnFamilyName,
-1 /* level */, kUnknownNewestKeyTime),
file_writer.get()));
// Build table.
for (const auto& kv : contents) {
@ -545,14 +547,14 @@ class CompactionJobTestBase : public testing::Test {
ASSERT_OK(s);
db_options_.info_log = info_log;
versions_.reset(new VersionSet(
dbname_, &db_options_, env_options_, table_cache_.get(),
&write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*read_only=*/false));
versions_.reset(
new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
&write_buffer_manager_, &write_controller_,
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
test::kUnitTestDbId, /*db_session_id=*/"",
/*daily_offpeak_time_utc=*/"",
/*error_handler=*/nullptr, /*read_only=*/false));
compaction_job_stats_.Reset();
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
VersionEdit new_db;
new_db.SetLogNumber(0);
@ -575,7 +577,8 @@ class CompactionJobTestBase : public testing::Test {
}
ASSERT_OK(s);
// Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
Temperature::kUnknown, nullptr);
ASSERT_OK(s);
@ -649,7 +652,8 @@ class CompactionJobTestBase : public testing::Test {
mutable_cf_options_.target_file_size_base,
mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
cfd->GetLatestMutableCFOptions()->compression_opts,
Temperature::kUnknown, max_subcompactions, grandparents, true);
Temperature::kUnknown, max_subcompactions, grandparents,
/*earliest_snapshot*/ std::nullopt, /*snapshot_checker*/ nullptr, true);
compaction.FinalizeInputInfo(cfd->current());
assert(db_options_.info_log);
@ -1567,17 +1571,7 @@ TEST_F(CompactionJobTest, InputSerialization) {
const int kStrMaxLen = 1000;
Random rnd(static_cast<uint32_t>(time(nullptr)));
Random64 rnd64(time(nullptr));
input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen));
input.column_family.options.comparator = ReverseBytewiseComparator();
input.column_family.options.max_bytes_for_level_base =
rnd64.Uniform(UINT64_MAX);
input.column_family.options.disable_auto_compactions = rnd.OneIn(2);
input.column_family.options.compression = kZSTD;
input.column_family.options.compression_opts.level = 4;
input.db_options.max_background_flushes = 10;
input.db_options.paranoid_checks = rnd.OneIn(2);
input.db_options.statistics = CreateDBStatistics();
input.db_options.env = env_;
input.cf_name = rnd.RandomString(rnd.Uniform(kStrMaxLen));
while (!rnd.OneIn(10)) {
input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX));
}
@ -1605,10 +1599,10 @@ TEST_F(CompactionJobTest, InputSerialization) {
ASSERT_TRUE(deserialized1.TEST_Equals(&input));
// Test mismatch
deserialized1.db_options.max_background_flushes += 10;
deserialized1.output_level += 10;
std::string mismatch;
ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch));
ASSERT_EQ(mismatch, "db_options.max_background_flushes");
ASSERT_EQ(mismatch, "output_level");
// Test unknown field
CompactionServiceInput deserialized2;
@ -1664,20 +1658,40 @@ TEST_F(CompactionJobTest, ResultSerialization) {
};
result.status =
status_list.at(rnd.Uniform(static_cast<int>(status_list.size())));
std::string file_checksum = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
std::string file_checksum_func_name = "MyAwesomeChecksumGenerator";
while (!rnd.OneIn(10)) {
TableProperties tp;
tp.user_collected_properties.emplace(
"UCP_Key1", rnd.RandomString(rnd.Uniform(kStrMaxLen)));
tp.user_collected_properties.emplace(
"UCP_Key2", rnd.RandomString(rnd.Uniform(kStrMaxLen)));
tp.readable_properties.emplace("RP_Key1",
rnd.RandomString(rnd.Uniform(kStrMaxLen)));
tp.readable_properties.emplace("RP_K2y2",
rnd.RandomString(rnd.Uniform(kStrMaxLen)));
UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)};
result.output_files.emplace_back(
rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX),
rnd64.Uniform(UINT64_MAX),
rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX),
rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
rnd.RandomString(rnd.Uniform(kStrMaxLen)) /* file_name */,
rnd64.Uniform(UINT64_MAX) /* smallest_seqno */,
rnd64.Uniform(UINT64_MAX) /* largest_seqno */,
rnd.RandomBinaryString(
rnd.Uniform(kStrMaxLen)) /* smallest_internal_key */,
rnd.RandomBinaryString(
rnd.Uniform(kStrMaxLen)) /* largest_internal_key */,
rnd64.Uniform(UINT64_MAX) /* oldest_ancester_time */,
rnd64.Uniform(UINT64_MAX) /* file_creation_time */,
rnd64.Uniform(UINT64_MAX) /* epoch_number */,
file_checksum /* file_checksum */,
file_checksum_func_name /* file_checksum_func_name */,
rnd64.Uniform(UINT64_MAX) /* paranoid_hash */,
rnd.OneIn(2) /* marked_for_compaction */, id /* unique_id */, tp);
}
result.output_level = rnd.Uniform(10);
result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
result.num_output_records = rnd64.Uniform(UINT64_MAX);
result.total_bytes = rnd64.Uniform(UINT64_MAX);
result.stats.num_output_records = rnd64.Uniform(UINT64_MAX);
result.bytes_read = 123;
result.bytes_written = rnd64.Uniform(UINT64_MAX);
result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX);
@ -1694,6 +1708,21 @@ TEST_F(CompactionJobTest, ResultSerialization) {
ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1));
ASSERT_TRUE(deserialized1.TEST_Equals(&result));
for (size_t i = 0; i < result.output_files.size(); i++) {
for (const auto& prop :
result.output_files[i].table_properties.user_collected_properties) {
ASSERT_EQ(deserialized1.output_files[i]
.table_properties.user_collected_properties[prop.first],
prop.second);
}
for (const auto& prop :
result.output_files[i].table_properties.readable_properties) {
ASSERT_EQ(deserialized1.output_files[i]
.table_properties.readable_properties[prop.first],
prop.second);
}
}
// Test mismatch
deserialized1.stats.num_input_files += 10;
std::string mismatch;
@ -1708,6 +1737,10 @@ TEST_F(CompactionJobTest, ResultSerialization) {
ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch));
ASSERT_EQ(mismatch, "output_files.unique_id");
deserialized_tmp.status.PermitUncheckedError();
ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum, file_checksum);
ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum_func_name,
file_checksum_func_name);
}
// Test unknown field

View File

@ -62,8 +62,9 @@ class CompactionOutputs {
}
// TODO: Remove it when remote compaction supports tiered compaction
void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
void AddBytesWritten(uint64_t bytes) { stats_.bytes_written += bytes; }
void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
void SetNumOutputFiles(uint64_t num) { stats_.num_output_files = num; }
// TODO: Move the BlobDB builder into CompactionOutputs
const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
@ -107,6 +108,12 @@ class CompactionOutputs {
Status Finish(const Status& intput_status,
const SeqnoToTimeMapping& seqno_to_time_mapping);
// Update output table properties from already populated TableProperties.
// Used for remote compaction
void UpdateTableProperties(const TableProperties& table_properties) {
current_output().table_properties =
std::make_shared<TableProperties>(table_properties);
}
// Update output table properties from table builder
void UpdateTableProperties() {
current_output().table_properties =

View File

@ -133,7 +133,8 @@ CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions,
CompactionPicker::~CompactionPicker() = default;
// Delete this compaction from the list of running compactions.
void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
void CompactionPicker::ReleaseCompactionFiles(Compaction* c,
const Status& status) {
UnregisterCompaction(c);
if (!status.ok()) {
c->ResetNextCompactionIndex();
@ -350,11 +351,11 @@ Compaction* CompactionPicker::CompactFiles(
break;
}
}
assert(output_level == 0 ||
!FilesRangeOverlapWithCompaction(
input_files, output_level,
Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
start_level, output_level)));
assert(output_level == 0 || !FilesRangeOverlapWithCompaction(
input_files, output_level,
Compaction::EvaluatePenultimateLevel(
vstorage, mutable_cf_options, ioptions_,
start_level, output_level)));
#endif /* !NDEBUG */
CompressionType compression_type;
@ -379,7 +380,8 @@ Compaction* CompactionPicker::CompactFiles(
GetCompressionOptions(mutable_cf_options, vstorage, output_level),
mutable_cf_options.default_write_temperature,
compact_options.max_subcompactions,
/* grandparents */ {}, true);
/* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr, true);
RegisterCompaction(c);
return c;
}
@ -657,8 +659,9 @@ Compaction* CompactionPicker::CompactRange(
// overlapping outputs in the same level.
if (FilesRangeOverlapWithCompaction(
inputs, output_level,
Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
start_level, output_level))) {
Compaction::EvaluatePenultimateLevel(vstorage, mutable_cf_options,
ioptions_, start_level,
output_level))) {
// This compaction output could potentially conflict with the output
// of a currently running compaction, we cannot run it.
*manual_conflict = true;
@ -676,7 +679,9 @@ Compaction* CompactionPicker::CompactRange(
GetCompressionOptions(mutable_cf_options, vstorage, output_level),
mutable_cf_options.default_write_temperature,
compact_range_options.max_subcompactions,
/* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1,
/* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ true, trim_ts, /* score */ -1,
/* deletion_compaction */ false, /* l0_files_might_overlap */ true,
CompactionReason::kUnknown,
compact_range_options.blob_garbage_collection_policy,
@ -842,7 +847,8 @@ Compaction* CompactionPicker::CompactRange(
// overlapping outputs in the same level.
if (FilesRangeOverlapWithCompaction(
compaction_inputs, output_level,
Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, input_level,
Compaction::EvaluatePenultimateLevel(vstorage, mutable_cf_options,
ioptions_, input_level,
output_level))) {
// This compaction output could potentially conflict with the output
// of a currently running compaction, we cannot run it.
@ -865,6 +871,7 @@ Compaction* CompactionPicker::CompactRange(
GetCompressionOptions(mutable_cf_options, vstorage, output_level),
mutable_cf_options.default_write_temperature,
compact_range_options.max_subcompactions, std::move(grandparents),
/* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
/* is manual */ true, trim_ts, /* score */ -1,
/* deletion_compaction */ false, /* l0_files_might_overlap */ true,
CompactionReason::kUnknown,
@ -1044,10 +1051,12 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
}
Status CompactionPicker::SanitizeAndConvertCompactionInputFiles(
std::unordered_set<uint64_t>* input_files,
const ColumnFamilyMetaData& cf_meta, const int output_level,
const VersionStorageInfo* vstorage,
std::unordered_set<uint64_t>* input_files, const int output_level,
Version* version,
std::vector<CompactionInputFiles>* converted_input_files) const {
ColumnFamilyMetaData cf_meta;
version->GetColumnFamilyMetaData(&cf_meta);
assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
cf_meta.levels[cf_meta.levels.size() - 1].level);
assert(converted_input_files);
@ -1118,7 +1127,8 @@ Status CompactionPicker::SanitizeAndConvertCompactionInputFiles(
}
s = GetCompactionInputsFromFileNumbers(converted_input_files, input_files,
vstorage, CompactionOptions());
version->storage_info(),
CompactionOptions());
if (!s.ok()) {
return s;
}
@ -1127,8 +1137,8 @@ Status CompactionPicker::SanitizeAndConvertCompactionInputFiles(
FilesRangeOverlapWithCompaction(
*converted_input_files, output_level,
Compaction::EvaluatePenultimateLevel(
vstorage, ioptions_, (*converted_input_files)[0].level,
output_level))) {
version->storage_info(), version->GetMutableCFOptions(),
ioptions_, (*converted_input_files)[0].level, output_level))) {
return Status::Aborted(
"A running compaction is writing to the same output level(s) in an "
"overlapping key range");
@ -1170,7 +1180,8 @@ void CompactionPicker::UnregisterCompaction(Compaction* c) {
void CompactionPicker::PickFilesMarkedForCompaction(
const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
int* output_level, CompactionInputFiles* start_level_inputs) {
int* output_level, CompactionInputFiles* start_level_inputs,
std::function<bool(const FileMetaData*)> skip_marked_file) {
if (vstorage->FilesMarkedForCompaction().empty()) {
return;
}
@ -1180,6 +1191,9 @@ void CompactionPicker::PickFilesMarkedForCompaction(
// If this assert() fails that means that some function marked some
// files as being_compacted, but didn't call ComputeCompactionScore()
assert(!level_file.second->being_compacted);
if (skip_marked_file(level_file.second)) {
return false;
}
*start_level = level_file.first;
*output_level =
(*start_level == 0) ? vstorage->base_level() : *start_level + 1;
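The new skip_marked_file hook turns the marked-file scan above into a filtered scan: the leveled picker passes a predicate that never skips (see the SetupInitialFiles change further down), while universal compaction supplies ShouldSkipMarkedFile. A minimal standalone sketch of the idea, using simplified stand-in types rather than the real RocksDB ones:

#include <functional>
#include <iostream>
#include <vector>

struct FileSketch {
  int number;
  bool is_standalone_range_tombstone;
};

// Returns the first marked file that the caller-supplied predicate does not
// ask us to skip, or nullptr if every candidate was filtered out.
const FileSketch* PickMarkedFile(
    const std::vector<FileSketch>& marked_files,
    const std::function<bool(const FileSketch*)>& skip_marked_file) {
  for (const auto& f : marked_files) {
    if (skip_marked_file(&f)) {
      continue;  // deferred by the caller, e.g. a standalone range tombstone
    }
    return &f;
  }
  return nullptr;
}

int main() {
  std::vector<FileSketch> marked = {{7, true}, {9, false}};
  // Leveled-style caller: never skip anything.
  auto never_skip = [](const FileSketch*) { return false; };
  // Universal-style caller: defer standalone range tombstone files for now.
  auto skip_standalone = [](const FileSketch* f) {
    return f->is_standalone_range_tombstone;
  };
  std::cout << PickMarkedFile(marked, never_skip)->number << "\n";       // 7
  std::cout << PickMarkedFile(marked, skip_standalone)->number << "\n";  // 9
}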

View File

@ -16,6 +16,7 @@
#include <vector>
#include "db/compaction/compaction.h"
#include "db/snapshot_checker.h"
#include "db/version_set.h"
#include "options/cf_options.h"
#include "rocksdb/env.h"
@ -55,17 +56,17 @@ class CompactionPicker {
// Returns nullptr if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
virtual Compaction* PickCompaction(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
VersionStorageInfo* vstorage,
LogBuffer* log_buffer) = 0;
// Currently, only universal compaction will query existing snapshots and
// pass them to aid compaction picking, and only when user-defined
// timestamps are not enabled. The other compaction styles do not pass or use
// `existing_snapshots` or `snapshot_checker`.
virtual Compaction* PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) = 0;
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns nullptr if there is nothing in that
// level that overlaps the specified range. Caller should delete
// the result.
//
// The returned Compaction might not include the whole requested range.
// In that case, compaction_end will be set to the next key that needs
// compacting. In case the compaction will compact the whole range,
@ -96,15 +97,14 @@ class CompactionPicker {
// non-ok status with specific reason.
//
Status SanitizeAndConvertCompactionInputFiles(
std::unordered_set<uint64_t>* input_files,
const ColumnFamilyMetaData& cf_meta, const int output_level,
const VersionStorageInfo* vstorage,
std::unordered_set<uint64_t>* input_files, const int output_level,
Version* version,
std::vector<CompactionInputFiles>* converted_input_files) const;
// Free up the files that participated in a compaction
//
// Requirement: DB mutex held
void ReleaseCompactionFiles(Compaction* c, Status status);
void ReleaseCompactionFiles(Compaction* c, const Status& status);
// Returns true if any one of the specified files are being compacted
bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
@ -203,10 +203,11 @@ class CompactionPicker {
const CompactionInputFiles& output_level_inputs,
std::vector<FileMetaData*>* grandparents);
void PickFilesMarkedForCompaction(const std::string& cf_name,
VersionStorageInfo* vstorage,
int* start_level, int* output_level,
CompactionInputFiles* start_level_inputs);
void PickFilesMarkedForCompaction(
const std::string& cf_name, VersionStorageInfo* vstorage,
int* start_level, int* output_level,
CompactionInputFiles* start_level_inputs,
std::function<bool(const FileMetaData*)> skip_marked_file);
bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
CompactionInputFiles* start_level_inputs,
@ -257,11 +258,13 @@ class NullCompactionPicker : public CompactionPicker {
virtual ~NullCompactionPicker() {}
// Always return "nullptr"
Compaction* PickCompaction(const std::string& /*cf_name*/,
const MutableCFOptions& /*mutable_cf_options*/,
const MutableDBOptions& /*mutable_db_options*/,
VersionStorageInfo* /*vstorage*/,
LogBuffer* /* log_buffer */) override {
Compaction* PickCompaction(
const std::string& /*cf_name*/,
const MutableCFOptions& /*mutable_cf_options*/,
const MutableDBOptions& /*mutable_db_options*/,
const std::vector<SequenceNumber>& /*existing_snapshots*/,
const SnapshotChecker* /*snapshot_checker*/,
VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */) override {
return nullptr;
}
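As the comment above notes, only the universal picker consumes existing_snapshots and snapshot_checker, and only when user-defined timestamps are disabled. A standalone sketch of how a builder might reduce those arguments to an optional earliest snapshot, mirroring the UniversalCompactionBuilder constructor change further down (stand-in types, not the real RocksDB ones):

#include <cstdint>
#include <iostream>
#include <limits>
#include <optional>
#include <vector>

using SequenceNumber = uint64_t;
constexpr SequenceNumber kMaxSequenceNumber =
    std::numeric_limits<uint64_t>::max();

// Snapshot information is only retained when the column family has no
// user-defined timestamps; an empty snapshot list is represented by
// kMaxSequenceNumber, i.e. no live snapshot constrains the compaction.
std::optional<SequenceNumber> DeriveEarliestSnapshot(
    const std::vector<SequenceNumber>& existing_snapshots,
    size_t user_defined_timestamp_size) {
  if (user_defined_timestamp_size != 0) {
    return std::nullopt;  // snapshots are not used with user-defined timestamps
  }
  return existing_snapshots.empty() ? kMaxSequenceNumber
                                    : existing_snapshots.front();
}

int main() {
  std::cout << DeriveEarliestSnapshot({42, 99}, 0).value() << "\n";  // 42
  std::cout << DeriveEarliestSnapshot({}, 0).value() << "\n";        // 2^64 - 1
  std::cout << DeriveEarliestSnapshot({42}, 8).has_value() << "\n";  // 0
}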

View File

@ -79,10 +79,14 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
FileMetaData* f = *ritr;
assert(f);
if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
uint64_t newest_key_time = f->TryGetNewestKeyTime();
uint64_t creation_time =
f->fd.table_reader->GetTableProperties()->creation_time;
if (creation_time == 0 ||
creation_time >= (current_time - mutable_cf_options.ttl)) {
uint64_t est_newest_key_time = newest_key_time == kUnknownNewestKeyTime
? creation_time
: newest_key_time;
if (est_newest_key_time == kUnknownNewestKeyTime ||
est_newest_key_time >= (current_time - mutable_cf_options.ttl)) {
break;
}
}
@ -102,15 +106,19 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
}
for (const auto& f : inputs[0].files) {
uint64_t creation_time = 0;
assert(f);
uint64_t newest_key_time = f->TryGetNewestKeyTime();
uint64_t creation_time = 0;
if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
}
uint64_t est_newest_key_time = newest_key_time == kUnknownNewestKeyTime
? creation_time
: newest_key_time;
ROCKS_LOG_BUFFER(log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64
" with creation time %" PRIu64 " for deletion",
cf_name.c_str(), f->fd.GetNumber(), creation_time);
" with estimated newest key time %" PRIu64 " for deletion",
cf_name.c_str(), f->fd.GetNumber(), est_newest_key_time);
}
Compaction* c = new Compaction(
@ -118,7 +126,9 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
std::move(inputs), 0, 0, 0, 0, kNoCompression,
mutable_cf_options.compression_opts,
mutable_cf_options.default_write_temperature,
/* max_subcompactions */ 0, {}, /* is manual */ false,
/* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false,
/* trim_ts */ "", vstorage->CompactionScore(0),
/* is deletion compaction */ true, /* l0_files_might_overlap */ true,
CompactionReason::kFIFOTtl);
@ -188,7 +198,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
0 /* output path ID */, mutable_cf_options.compression,
mutable_cf_options.compression_opts,
mutable_cf_options.default_write_temperature,
0 /* max_subcompactions */, {}, /* is manual */ false,
0 /* max_subcompactions */, {},
/* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr, /* is manual */ false,
/* trim_ts */ "", vstorage->CompactionScore(0),
/* is deletion compaction */ false,
/* l0_files_might_overlap */ true,
@ -284,7 +296,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
/* output_path_id */ 0, kNoCompression,
mutable_cf_options.compression_opts,
mutable_cf_options.default_write_temperature,
/* max_subcompactions */ 0, {}, /* is manual */ false,
/* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false,
/* trim_ts */ "", vstorage->CompactionScore(0),
/* is deletion compaction */ true,
/* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize);
@ -294,7 +308,7 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) {
LogBuffer* log_buffer) const {
const std::vector<FileTemperatureAge>& ages =
mutable_cf_options.compaction_options_fifo
.file_temperature_age_thresholds;
@ -344,73 +358,47 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
Temperature compaction_target_temp = Temperature::kLastTemperature;
if (current_time > min_age) {
uint64_t create_time_threshold = current_time - min_age;
uint64_t compaction_size = 0;
// We will ideally identify a file qualifying for temperature change by
// knowing the timestamp for the youngest entry in the file. However, right
// now we don't have the information. We infer it by looking at timestamp of
// the previous file's (which is just younger) oldest entry's timestamp.
Temperature cur_target_temp;
// avoid index underflow
assert(level_files.size() >= 1);
for (size_t index = level_files.size() - 1; index >= 1; --index) {
for (size_t index = level_files.size(); index >= 1; --index) {
// Try to add cur_file to compaction inputs.
FileMetaData* cur_file = level_files[index];
// prev_file is just younger than cur_file
FileMetaData* prev_file = level_files[index - 1];
FileMetaData* cur_file = level_files[index - 1];
FileMetaData* prev_file = index < 2 ? nullptr : level_files[index - 2];
if (cur_file->being_compacted) {
// Should not happen since we check for
// `level0_compactions_in_progress_` above. Here we simply just don't
// schedule anything.
return nullptr;
}
uint64_t oldest_ancestor_time = prev_file->TryGetOldestAncesterTime();
if (oldest_ancestor_time == kUnknownOldestAncesterTime) {
// Older files might not have enough information. It is possible to
// handle these files by looking at newer files, but maintaining the
// logic isn't worth it.
uint64_t est_newest_key_time = cur_file->TryGetNewestKeyTime(prev_file);
// Newer file could have newest_key_time populated
if (est_newest_key_time == kUnknownNewestKeyTime) {
continue;
}
if (est_newest_key_time > create_time_threshold) {
break;
}
if (oldest_ancestor_time > create_time_threshold) {
// cur_file is too fresh
break;
}
cur_target_temp = ages[0].temperature;
Temperature cur_target_temp = ages[0].temperature;
for (size_t i = 1; i < ages.size(); ++i) {
if (current_time >= ages[i].age &&
oldest_ancestor_time <= current_time - ages[i].age) {
est_newest_key_time <= current_time - ages[i].age) {
cur_target_temp = ages[i].temperature;
}
}
if (cur_file->temperature == cur_target_temp) {
if (inputs[0].empty()) {
continue;
} else {
break;
}
continue;
}
// cur_file needs to change temperature
if (compaction_target_temp == Temperature::kLastTemperature) {
assert(inputs[0].empty());
compaction_target_temp = cur_target_temp;
} else if (cur_target_temp != compaction_target_temp) {
assert(!inputs[0].empty());
break;
}
if (inputs[0].empty() || compaction_size + cur_file->fd.GetFileSize() <=
mutable_cf_options.max_compaction_bytes) {
inputs[0].files.push_back(cur_file);
compaction_size += cur_file->fd.GetFileSize();
ROCKS_LOG_BUFFER(
log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64
" with next file's oldest time %" PRIu64 " for temperature %s.",
cf_name.c_str(), cur_file->fd.GetNumber(), oldest_ancestor_time,
temperature_to_string[cur_target_temp].c_str());
}
if (compaction_size > mutable_cf_options.max_compaction_bytes) {
break;
}
assert(compaction_target_temp == Temperature::kLastTemperature);
compaction_target_temp = cur_target_temp;
inputs[0].files.push_back(cur_file);
ROCKS_LOG_BUFFER(
log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64
" with estimated newest key time %" PRIu64 " for temperature %s.",
cf_name.c_str(), cur_file->fd.GetNumber(), est_newest_key_time,
temperature_to_string[cur_target_temp].c_str());
break;
}
}
@ -418,15 +406,18 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
return nullptr;
}
assert(compaction_target_temp != Temperature::kLastTemperature);
// Only compact one file at a time.
assert(inputs.size() == 1);
assert(inputs[0].size() == 1);
Compaction* c = new Compaction(
vstorage, ioptions_, mutable_cf_options, mutable_db_options,
std::move(inputs), 0, 0 /* output file size limit */,
0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
mutable_cf_options.compression, mutable_cf_options.compression_opts,
compaction_target_temp,
/* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "",
vstorage->CompactionScore(0),
/* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false, /* trim_ts */ "", vstorage->CompactionScore(0),
/* is deletion compaction */ false, /* l0_files_might_overlap */ true,
CompactionReason::kChangeTemperature);
return c;
@ -434,7 +425,9 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
Compaction* FIFOCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& /* existing_snapshots */,
const SnapshotChecker* /* snapshot_checker */, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) {
Compaction* c = nullptr;
if (mutable_cf_options.ttl > 0) {
@ -469,8 +462,10 @@ Compaction* FIFOCompactionPicker::CompactRange(
assert(output_level == 0);
*compaction_end = nullptr;
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger);
Compaction* c = PickCompaction(cf_name, mutable_cf_options,
mutable_db_options, vstorage, &log_buffer);
Compaction* c =
PickCompaction(cf_name, mutable_cf_options, mutable_db_options,
/*existing_snapshots*/ {}, /*snapshot_checker*/ nullptr,
vstorage, &log_buffer);
log_buffer.FlushBufferToLog();
return c;
}
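Both the TTL path and the temperature-change path in this file now key off an estimated newest-key time that falls back to the table's creation time when the newest_key_time property is unavailable. A standalone sketch of the TTL decision with that fallback (the sentinel value and helper names are assumptions made for the sketch, not the real RocksDB constants):

#include <cstdint>
#include <iostream>

// Assumed sentinel for this sketch; the real constant lives in RocksDB.
constexpr uint64_t kUnknownNewestKeyTime = 0;

// Returns true if the file is old enough to be dropped by FIFO TTL
// compaction: prefer the newest_key_time table property and fall back to
// creation_time when it is unavailable (assumes current_time > ttl).
bool QualifiesForTtlDrop(uint64_t newest_key_time, uint64_t creation_time,
                         uint64_t current_time, uint64_t ttl) {
  const uint64_t est_newest_key_time =
      newest_key_time == kUnknownNewestKeyTime ? creation_time
                                               : newest_key_time;
  if (est_newest_key_time == kUnknownNewestKeyTime) {
    return false;  // no usable age information, keep the file
  }
  return est_newest_key_time < current_time - ttl;
}

int main() {
  // Newest key is 7200s old with a 3600s TTL: the file qualifies (prints 1).
  std::cout << QualifiesForTtlDrop(10000, 9000, 17200, 3600) << "\n";
  // Property missing; fall back to a fresh creation_time: kept (prints 0).
  std::cout << QualifiesForTtlDrop(0, 17000, 17200, 3600) << "\n";
}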

View File

@ -18,11 +18,12 @@ class FIFOCompactionPicker : public CompactionPicker {
const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {}
Compaction* PickCompaction(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
VersionStorageInfo* version,
LogBuffer* log_buffer) override;
Compaction* PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& /* existing_snapshots */,
const SnapshotChecker* /* snapshot_checker */,
VersionStorageInfo* version, LogBuffer* log_buffer) override;
Compaction* CompactRange(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
@ -53,9 +54,10 @@ class FIFOCompactionPicker : public CompactionPicker {
VersionStorageInfo* version,
LogBuffer* log_buffer);
// Will pick one file to compact at a time, starting from the oldest file.
Compaction* PickTemperatureChangeCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
LogBuffer* log_buffer);
LogBuffer* log_buffer) const;
};
} // namespace ROCKSDB_NAMESPACE

View File

@ -262,7 +262,10 @@ void LevelCompactionBuilder::SetupInitialFiles() {
parent_index_ = base_index_ = -1;
compaction_picker_->PickFilesMarkedForCompaction(
cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_);
cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_,
/*skip_marked_file*/ [](const FileMetaData* /* file */) {
return false;
});
if (!start_level_inputs_.empty()) {
compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
return;
@ -411,8 +414,9 @@ void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
&tmp_start_level_inputs) ||
compaction_picker_->FilesRangeOverlapWithCompaction(
{tmp_start_level_inputs}, output_level_,
Compaction::EvaluatePenultimateLevel(
vstorage_, ioptions_, start_level_, output_level_))) {
Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
ioptions_, start_level_,
output_level_))) {
// Constraint 1a
tmp_start_level_inputs.clear();
return;
@ -486,8 +490,9 @@ bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
// We need to disallow this from happening.
if (compaction_picker_->FilesRangeOverlapWithCompaction(
compaction_inputs_, output_level_,
Compaction::EvaluatePenultimateLevel(
vstorage_, ioptions_, start_level_, output_level_))) {
Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
ioptions_, start_level_,
output_level_))) {
// This compaction output could potentially conflict with the output
// of a currently running compaction, we cannot run it.
return false;
@ -554,7 +559,9 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
vstorage_->base_level()),
GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
mutable_cf_options_.default_write_temperature,
/* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
/* max_subcompactions */ 0, std::move(grandparents_),
/* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
is_manual_,
/* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
l0_files_might_overlap, compaction_reason_);
@ -839,8 +846,9 @@ bool LevelCompactionBuilder::PickFileToCompact() {
&start_level_inputs_) ||
compaction_picker_->FilesRangeOverlapWithCompaction(
{start_level_inputs_}, output_level_,
Compaction::EvaluatePenultimateLevel(
vstorage_, ioptions_, start_level_, output_level_))) {
Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
ioptions_, start_level_,
output_level_))) {
// A locked (pending compaction) input-level file was pulled in due to
// user-key overlap.
start_level_inputs_.clear();
@ -925,11 +933,15 @@ bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
}
uint64_t l0_size = 0;
for (const auto& file : l0_files) {
l0_size += file->fd.GetFileSize();
assert(file->compensated_file_size >= file->fd.GetFileSize());
// Compact down L0s with more deletions.
l0_size += file->compensated_file_size;
}
const uint64_t min_lbase_size =
l0_size * static_cast<uint64_t>(std::max(
10.0, mutable_cf_options_.max_bytes_for_level_multiplier));
// Avoid L0->Lbase compactions that are inefficient for write-amp.
const double kMultiplier =
std::max(10.0, mutable_cf_options_.max_bytes_for_level_multiplier) * 2;
const uint64_t min_lbase_size = MultiplyCheckOverflow(l0_size, kMultiplier);
assert(min_lbase_size >= l0_size);
const std::vector<FileMetaData*>& lbase_files =
vstorage_->LevelFiles(/*level=*/base_level);
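The reworked trigger above measures L0 with compensated file sizes, so deletion-heavy files count for more, and only prefers an intra-L0 compaction once Lbase holds at least 2 * max(10, max_bytes_for_level_multiplier) times that size. A standalone sketch of the arithmetic, with a saturating multiply standing in for MultiplyCheckOverflow:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

struct FileSketch {
  uint64_t file_size;
  uint64_t compensated_file_size;  // >= file_size, inflated by deletions
};

// Saturating multiply, standing in for MultiplyCheckOverflow in this sketch.
uint64_t MulSaturating(uint64_t v, double m) {
  const double r = static_cast<double>(v) * m;
  if (r >= static_cast<double>(std::numeric_limits<uint64_t>::max())) {
    return std::numeric_limits<uint64_t>::max();
  }
  return static_cast<uint64_t>(r);
}

// How large Lbase must be before a size-based intra-L0 compaction is
// preferred over a regular L0->Lbase compaction.
uint64_t MinLbaseSize(const std::vector<FileSketch>& l0_files,
                      double max_bytes_for_level_multiplier) {
  uint64_t l0_size = 0;
  for (const auto& f : l0_files) {
    l0_size += f.compensated_file_size;  // deletions make L0 look "bigger"
  }
  const double multiplier =
      std::max(10.0, max_bytes_for_level_multiplier) * 2;
  return MulSaturating(l0_size, multiplier);
}

int main() {
  // Two L0 files of 64 MB whose compensated size is 96 MB each, with the
  // default multiplier of 10: Lbase must hold at least 192 MB * 20 = 3840 MB.
  std::vector<FileSketch> l0 = {{64 << 20, 96 << 20}, {64 << 20, 96 << 20}};
  std::cout << (MinLbaseSize(l0, 10.0) >> 20) << " MB\n";  // prints 3840 MB
}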
@ -963,7 +975,9 @@ bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
Compaction* LevelCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& /*existing_snapshots */,
const SnapshotChecker* /*snapshot_checker*/, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) {
LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
mutable_cf_options, ioptions_,

View File

@ -20,11 +20,12 @@ class LevelCompactionPicker : public CompactionPicker {
LevelCompactionPicker(const ImmutableOptions& ioptions,
const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {}
Compaction* PickCompaction(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
VersionStorageInfo* vstorage,
LogBuffer* log_buffer) override;
Compaction* PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& /* existing_snapshots */,
const SnapshotChecker* /* snapshot_checker */,
VersionStorageInfo* vstorage, LogBuffer* log_buffer) override;
bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;
};

File diff suppressed because it is too large

View File

@ -35,7 +35,9 @@ class UniversalCompactionBuilder {
UniversalCompactionBuilder(
const ImmutableOptions& ioptions, const InternalKeyComparator* icmp,
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
UniversalCompactionPicker* picker, LogBuffer* log_buffer)
: ioptions_(ioptions),
icmp_(icmp),
@ -44,7 +46,19 @@ class UniversalCompactionBuilder {
mutable_db_options_(mutable_db_options),
vstorage_(vstorage),
picker_(picker),
log_buffer_(log_buffer) {}
log_buffer_(log_buffer) {
assert(icmp_);
const auto* ucmp = icmp_->user_comparator();
assert(ucmp);
// These parameters are only passed when user-defined timestamp is not
// enabled.
if (ucmp->timestamp_size() == 0) {
earliest_snapshot_ = existing_snapshots.empty()
? kMaxSequenceNumber
: existing_snapshots.at(0);
snapshot_checker_ = snapshot_checker;
}
}
// Form and return the compaction object. The caller owns return object.
Compaction* PickCompaction();
@ -52,12 +66,15 @@ class UniversalCompactionBuilder {
private:
struct SortedRun {
SortedRun(int _level, FileMetaData* _file, uint64_t _size,
uint64_t _compensated_file_size, bool _being_compacted)
uint64_t _compensated_file_size, bool _being_compacted,
bool _level_has_marked_standalone_rangedel)
: level(_level),
file(_file),
size(_size),
compensated_file_size(_compensated_file_size),
being_compacted(_being_compacted) {
being_compacted(_being_compacted),
level_has_marked_standalone_rangedel(
_level_has_marked_standalone_rangedel) {
assert(compensated_file_size > 0);
assert(level != 0 || file != nullptr);
}
@ -79,6 +96,10 @@ class UniversalCompactionBuilder {
uint64_t size;
uint64_t compensated_file_size;
bool being_compacted;
// True if this level has any file that is a standalone range deletion file
// marked for compaction. Best effort is made so that only deletion
// triggered compaction picks this type of file.
bool level_has_marked_standalone_rangedel;
};
// Pick Universal compaction to limit read amplification
@ -98,6 +119,11 @@ class UniversalCompactionBuilder {
Compaction* PickDeleteTriggeredCompaction();
// Returns true if this given file (that is marked for compaction) should be
// skipped from being picked for now. We do this to best use standalone range
// tombstone files.
bool ShouldSkipMarkedFile(const FileMetaData* file) const;
// Form a compaction from the sorted run indicated by start_index to the
// oldest sorted run.
// The caller is responsible for making sure that those files are not in
@ -116,7 +142,7 @@ class UniversalCompactionBuilder {
bool ShouldSkipLastSortedRunForSizeAmpCompaction() const {
assert(!sorted_runs_.empty());
return ioptions_.preclude_last_level_data_seconds > 0 &&
return mutable_cf_options_.preclude_last_level_data_seconds > 0 &&
ioptions_.num_levels > 2 &&
sorted_runs_.back().level == ioptions_.num_levels - 1 &&
sorted_runs_.size() > 1;
@ -234,8 +260,18 @@ class UniversalCompactionBuilder {
VersionStorageInfo* vstorage_;
UniversalCompactionPicker* picker_;
LogBuffer* log_buffer_;
// Optional earliest snapshot at time of compaction picking. This is only
// provided if the column family doesn't enable user-defined timestamps.
// And this information is only passed to `Compaction` picked by deletion
// triggered compaction for possible optimizations.
std::optional<SequenceNumber> earliest_snapshot_;
const SnapshotChecker* snapshot_checker_;
// Mapping from file id to its index in `sorted_runs_` for the files that are
// marked for compaction. This is only populated when snapshot info is
// available.
std::map<uint64_t, size_t> file_marked_for_compaction_to_sorted_run_index_;
static std::vector<UniversalCompactionBuilder::SortedRun> CalculateSortedRuns(
std::vector<UniversalCompactionBuilder::SortedRun> CalculateSortedRuns(
const VersionStorageInfo& vstorage, int last_level,
uint64_t* max_run_size);
@ -394,11 +430,13 @@ bool UniversalCompactionPicker::NeedsCompaction(
Compaction* UniversalCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) {
UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
mutable_cf_options, mutable_db_options,
vstorage, this, log_buffer);
UniversalCompactionBuilder builder(
ioptions_, icmp_, cf_name, mutable_cf_options, mutable_db_options,
existing_snapshots, snapshot_checker, vstorage, this, log_buffer);
return builder.PickCompaction();
}
@ -448,14 +486,20 @@ UniversalCompactionBuilder::CalculateSortedRuns(
*max_run_size = 0;
std::vector<UniversalCompactionBuilder::SortedRun> ret;
for (FileMetaData* f : vstorage.LevelFiles(0)) {
ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
f->being_compacted);
if (earliest_snapshot_.has_value() && f->marked_for_compaction) {
file_marked_for_compaction_to_sorted_run_index_.emplace(f->fd.GetNumber(),
ret.size());
}
ret.emplace_back(
0, f, f->fd.GetFileSize(), f->compensated_file_size, f->being_compacted,
f->marked_for_compaction && f->FileIsStandAloneRangeTombstone());
*max_run_size = std::max(*max_run_size, f->fd.GetFileSize());
}
for (int level = 1; level <= last_level; level++) {
uint64_t total_compensated_size = 0U;
uint64_t total_size = 0U;
bool being_compacted = false;
bool level_has_marked_standalone_rangedel = false;
for (FileMetaData* f : vstorage.LevelFiles(level)) {
total_compensated_size += f->compensated_file_size;
total_size += f->fd.GetFileSize();
@ -467,16 +511,57 @@ UniversalCompactionBuilder::CalculateSortedRuns(
if (f->being_compacted) {
being_compacted = f->being_compacted;
}
level_has_marked_standalone_rangedel =
level_has_marked_standalone_rangedel ||
(f->marked_for_compaction && f->FileIsStandAloneRangeTombstone());
if (earliest_snapshot_.has_value() && f->marked_for_compaction) {
file_marked_for_compaction_to_sorted_run_index_.emplace(
f->fd.GetNumber(), ret.size());
}
}
if (total_compensated_size > 0) {
ret.emplace_back(level, nullptr, total_size, total_compensated_size,
being_compacted);
being_compacted, level_has_marked_standalone_rangedel);
}
*max_run_size = std::max(*max_run_size, total_size);
}
return ret;
}
bool UniversalCompactionBuilder::ShouldSkipMarkedFile(
const FileMetaData* file) const {
assert(file->marked_for_compaction);
if (!earliest_snapshot_.has_value()) {
return false;
}
if (!file->FileIsStandAloneRangeTombstone()) {
return false;
}
// Skip until earliest snapshot advances at or above this standalone range
// tombstone file. `DB::ReleaseSnapshot` will re-examine and schedule
// compaction for it.
if (!DataIsDefinitelyInSnapshot(file->fd.largest_seqno,
earliest_snapshot_.value(),
snapshot_checker_)) {
return true;
}
auto iter = file_marked_for_compaction_to_sorted_run_index_.find(
file->fd.GetNumber());
assert(iter != file_marked_for_compaction_to_sorted_run_index_.end());
size_t idx = iter->second;
const SortedRun* succeeding_sorted_run =
idx < sorted_runs_.size() - 1 ? &sorted_runs_[idx + 1] : nullptr;
// Marked standalone range tombstone file is best used if it's in the start
// input level. Skip to let that compaction happen first.
if (succeeding_sorted_run &&
succeeding_sorted_run->level_has_marked_standalone_rangedel) {
return true;
}
return false;
}
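A standalone sketch of the first half of the decision above: a marked standalone range-tombstone file is deferred until the earliest live snapshot has advanced past its largest sequence number. The follow-up check against the succeeding sorted run is omitted, and the types are simplified stand-ins:

#include <cstdint>
#include <iostream>
#include <optional>

using SequenceNumber = uint64_t;

struct FileSketch {
  SequenceNumber largest_seqno;
  bool is_standalone_range_tombstone;
};

// A marked standalone range deletion file is deferred while the earliest
// live snapshot is still older than its largest sequence number, matching
// the "skip until the earliest snapshot advances" comment above.
bool ShouldSkip(const FileSketch& f,
                std::optional<SequenceNumber> earliest_snapshot) {
  if (!earliest_snapshot.has_value()) {
    return false;  // no snapshot info was passed to the picker
  }
  if (!f.is_standalone_range_tombstone) {
    return false;  // regular marked files are never deferred here
  }
  return f.largest_seqno > earliest_snapshot.value();
}

int main() {
  FileSketch f{/*largest_seqno=*/100, /*standalone=*/true};
  std::cout << ShouldSkip(f, SequenceNumber{50}) << "\n";   // 1: defer
  std::cout << ShouldSkip(f, SequenceNumber{200}) << "\n";  // 0: compact now
  std::cout << ShouldSkip(f, std::nullopt) << "\n";         // 0: no snapshots
}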
// Universal style of compaction. Pick files that are contiguous in
// time-range to compact.
Compaction* UniversalCompactionBuilder::PickCompaction() {
@ -580,7 +665,8 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
// Get the total number of sorted runs that are not being compacted
int num_sr_not_compacted = 0;
for (size_t i = 0; i < sorted_runs_.size(); i++) {
if (sorted_runs_[i].being_compacted == false) {
if (sorted_runs_[i].being_compacted == false &&
!sorted_runs_[i].level_has_marked_standalone_rangedel) {
num_sr_not_compacted++;
}
}
@ -743,16 +829,24 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
sr = &sorted_runs_[loop];
if (!sr->being_compacted) {
if (!sr->being_compacted && !sr->level_has_marked_standalone_rangedel) {
candidate_count = 1;
break;
}
char file_num_buf[kFormatFileNumberBufSize];
sr->Dump(file_num_buf, sizeof(file_num_buf));
ROCKS_LOG_BUFFER(log_buffer_,
"[%s] Universal: %s"
"[%d] being compacted, skipping",
cf_name_.c_str(), file_num_buf, loop);
if (sr->being_compacted) {
ROCKS_LOG_BUFFER(log_buffer_,
"[%s] Universal: %s"
"[%d] being compacted, skipping",
cf_name_.c_str(), file_num_buf, loop);
} else if (sr->level_has_marked_standalone_rangedel) {
ROCKS_LOG_BUFFER(log_buffer_,
"[%s] Universal: %s"
"[%d] has standalone range tombstone files marked for "
"compaction, skipping",
cf_name_.c_str(), file_num_buf, loop);
}
sr = nullptr;
}
@ -773,7 +867,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
candidate_count < max_files_to_compact && i < sorted_runs_.size();
i++) {
const SortedRun* succeeding_sr = &sorted_runs_[i];
if (succeeding_sr->being_compacted) {
if (succeeding_sr->being_compacted ||
succeeding_sr->level_has_marked_standalone_rangedel) {
break;
}
// Pick files if the total/last candidate file size (increased by the
@ -899,11 +994,11 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level);
}
if (output_level != 0 &&
picker_->FilesRangeOverlapWithCompaction(
inputs, output_level,
Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
start_level, output_level))) {
if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
inputs, output_level,
Compaction::EvaluatePenultimateLevel(
vstorage_, mutable_cf_options_, ioptions_,
start_level, output_level))) {
return nullptr;
}
CompactionReason compaction_reason;
@ -923,6 +1018,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
output_level, enable_compression),
mutable_cf_options_.default_write_temperature,
/* max_subcompactions */ 0, grandparents,
/* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false, /* trim_ts */ "", score_,
false /* deletion_compaction */,
/* l0_files_might_overlap */ true, compaction_reason);
@ -939,7 +1036,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
const size_t end_index = ShouldSkipLastSortedRunForSizeAmpCompaction()
? sorted_runs_.size() - 2
: sorted_runs_.size() - 1;
if (sorted_runs_[end_index].being_compacted) {
if (sorted_runs_[end_index].being_compacted ||
sorted_runs_[end_index].level_has_marked_standalone_rangedel) {
return nullptr;
}
const uint64_t base_sr_size = sorted_runs_[end_index].size;
@ -950,14 +1048,23 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
// Get longest span (i.e, [start_index, end_index]) of available sorted runs
while (start_index > 0) {
const SortedRun* sr = &sorted_runs_[start_index - 1];
if (sr->being_compacted) {
if (sr->being_compacted || sr->level_has_marked_standalone_rangedel) {
char file_num_buf[kFormatFileNumberBufSize];
sr->Dump(file_num_buf, sizeof(file_num_buf), true);
ROCKS_LOG_BUFFER(
log_buffer_,
"[%s] Universal: stopping at sorted run undergoing compaction: "
"%s[%" ROCKSDB_PRIszt "]",
cf_name_.c_str(), file_num_buf, start_index - 1);
if (sr->being_compacted) {
ROCKS_LOG_BUFFER(
log_buffer_,
"[%s] Universal: stopping at sorted run undergoing compaction: "
"%s[%" ROCKSDB_PRIszt "]",
cf_name_.c_str(), file_num_buf, start_index - 1);
} else if (sr->level_has_marked_standalone_rangedel) {
ROCKS_LOG_BUFFER(
log_buffer_,
"[%s] Universal: stopping at sorted run that has standalone range "
"tombstone files marked for compaction: "
"%s[%" ROCKSDB_PRIszt "]",
cf_name_.c_str(), file_num_buf, start_index - 1);
}
break;
}
candidate_size += sr->compensated_file_size;
@ -1236,11 +1343,11 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
}
// intra-L0 compaction outputs could have overlap
if (output_level != 0 &&
picker_->FilesRangeOverlapWithCompaction(
inputs, output_level,
Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
start_level, output_level))) {
if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
inputs, output_level,
Compaction::EvaluatePenultimateLevel(
vstorage_, mutable_cf_options_, ioptions_,
start_level, output_level))) {
return nullptr;
}
@ -1257,7 +1364,10 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
true /* enable_compression */),
mutable_cf_options_.default_write_temperature,
/* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
/* max_subcompactions */ 0, /* grandparents */ {},
/* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false,
/* trim_ts */ "", score_, false /* deletion_compaction */,
/* l0_files_might_overlap */ true,
CompactionReason::kUniversalSizeAmplification);
@ -1288,7 +1398,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
continue;
}
FileMetaData* f = vstorage_->LevelFiles(0)[loop];
if (f->marked_for_compaction) {
if (f->marked_for_compaction && !ShouldSkipMarkedFile(f)) {
start_level_inputs.files.push_back(f);
start_index =
static_cast<int>(loop); // Consider this as the first candidate.
@ -1302,7 +1412,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) {
SortedRun* sr = &sorted_runs_[loop];
if (sr->being_compacted) {
if (sr->being_compacted || sr->level_has_marked_standalone_rangedel) {
break;
}
@ -1321,7 +1431,10 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
// leveled. We pick one of the files marked for compaction and compact with
// overlapping files in the adjacent level.
picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
&output_level, &start_level_inputs);
&output_level, &start_level_inputs,
[this](const FileMetaData* file) {
return ShouldSkipMarkedFile(file);
});
if (start_level_inputs.empty()) {
return nullptr;
}
@ -1374,7 +1487,8 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
if (picker_->FilesRangeOverlapWithCompaction(
inputs, output_level,
Compaction::EvaluatePenultimateLevel(
vstorage_, ioptions_, start_level, output_level))) {
vstorage_, mutable_cf_options_, ioptions_, start_level,
output_level))) {
return nullptr;
}
@ -1401,7 +1515,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1),
GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
mutable_cf_options_.default_write_temperature,
/* max_subcompactions */ 0, grandparents, /* is manual */ false,
/* max_subcompactions */ 0, grandparents, earliest_snapshot_,
snapshot_checker_,
/* is manual */ false,
/* trim_ts */ "", score_, false /* deletion_compaction */,
/* l0_files_might_overlap */ true,
CompactionReason::kFilesMarkedForCompaction);
@ -1472,11 +1588,11 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
}
// intra-L0 compaction outputs could have overlap
if (output_level != 0 &&
picker_->FilesRangeOverlapWithCompaction(
inputs, output_level,
Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
start_level, output_level))) {
if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
inputs, output_level,
Compaction::EvaluatePenultimateLevel(
vstorage_, mutable_cf_options_, ioptions_,
start_level, output_level))) {
return nullptr;
}
@ -1494,7 +1610,10 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
true /* enable_compression */),
mutable_cf_options_.default_write_temperature,
/* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
/* max_subcompactions */ 0, /* grandparents */ {},
/* earliest_snapshot */ std::nullopt,
/* snapshot_checker */ nullptr,
/* is manual */ false,
/* trim_ts */ "", score_, false /* deletion_compaction */,
/* l0_files_might_overlap */ true, compaction_reason);
}
@ -1515,7 +1634,8 @@ Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
// included in the compaction.
size_t start_index = sorted_runs_.size();
while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted &&
!sorted_runs_[start_index - 1].level_has_marked_standalone_rangedel) {
start_index--;
}
if (start_index == sorted_runs_.size()) {

View File

@ -10,6 +10,7 @@
#pragma once
#include "db/compaction/compaction_picker.h"
#include "db/snapshot_checker.h"
namespace ROCKSDB_NAMESPACE {
class UniversalCompactionPicker : public CompactionPicker {
@ -17,11 +18,12 @@ class UniversalCompactionPicker : public CompactionPicker {
UniversalCompactionPicker(const ImmutableOptions& ioptions,
const InternalKeyComparator* icmp)
: CompactionPicker(ioptions, icmp) {}
Compaction* PickCompaction(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
VersionStorageInfo* vstorage,
LogBuffer* log_buffer) override;
Compaction* PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
const std::vector<SequenceNumber>& existing_snapshots,
const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) override;
int MaxOutputLevel() const override { return NumberLevels() - 1; }
bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;

View File

@ -39,12 +39,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
MakeTableFileName(file->fd.GetNumber()));
}
}
compaction_input.column_family.name =
compaction->column_family_data()->GetName();
compaction_input.column_family.options =
compaction->column_family_data()->GetLatestCFOptions();
compaction_input.db_options =
BuildDBOptions(db_options_, mutable_db_options_copy_);
compaction_input.cf_name = compaction->column_family_data()->GetName();
compaction_input.snapshots = existing_snapshots_;
compaction_input.has_begin = sub_compact->start.has_value();
compaction_input.begin =
@ -52,6 +48,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
compaction_input.has_end = sub_compact->end.has_value();
compaction_input.end =
compaction_input.has_end ? sub_compact->end->ToString() : "";
compaction_input.options_file_number =
sub_compact->compaction->input_version()
->version_set()
->options_file_number();
TEST_SYNC_POINT_CALLBACK(
"CompactionServiceJob::ProcessKeyValueCompactionWithCompactionService",
&compaction_input);
std::string compaction_input_binary;
Status s = compaction_input.Write(&compaction_input_binary);
@ -70,10 +74,13 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
ROCKS_LOG_INFO(
db_options_.info_log,
"[%s] [JOB %d] Starting remote compaction (output level: %d): %s",
compaction_input.column_family.name.c_str(), job_id_,
compaction->column_family_data()->GetName().c_str(), job_id_,
compaction_input.output_level, input_files_oss.str().c_str());
CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_,
GetCompactionId(sub_compact), thread_pri_);
CompactionServiceJobInfo info(
dbname_, db_id_, db_session_id_, GetCompactionId(sub_compact),
thread_pri_, compaction->compaction_reason(),
compaction->is_full_compaction(), compaction->is_manual_compaction(),
compaction->bottommost_level());
CompactionServiceScheduleResponse response =
db_options_.compaction_service->Schedule(info, compaction_input_binary);
switch (response.status) {
@ -84,13 +91,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
"CompactionService failed to schedule a remote compaction job.");
ROCKS_LOG_WARN(db_options_.info_log,
"[%s] [JOB %d] Remote compaction failed to start.",
compaction_input.column_family.name.c_str(), job_id_);
compaction->column_family_data()->GetName().c_str(),
job_id_);
return response.status;
case CompactionServiceJobStatus::kUseLocal:
ROCKS_LOG_INFO(
db_options_.info_log,
"[%s] [JOB %d] Remote compaction fallback to local by API (Schedule)",
compaction_input.column_family.name.c_str(), job_id_);
compaction->column_family_data()->GetName().c_str(), job_id_);
return response.status;
default:
assert(false); // unknown status
@ -99,7 +107,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Waiting for remote compaction...",
compaction_input.column_family.name.c_str(), job_id_);
compaction->column_family_data()->GetName().c_str(), job_id_);
std::string compaction_result_binary;
CompactionServiceJobStatus compaction_status =
db_options_.compaction_service->Wait(response.scheduled_job_id,
@ -109,7 +117,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
ROCKS_LOG_INFO(
db_options_.info_log,
"[%s] [JOB %d] Remote compaction fallback to local by API (Wait)",
compaction_input.column_family.name.c_str(), job_id_);
compaction->column_family_data()->GetName().c_str(), job_id_);
return compaction_status;
}
@ -134,15 +142,19 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
"result is returned).");
compaction_result.status.PermitUncheckedError();
}
ROCKS_LOG_WARN(db_options_.info_log,
"[%s] [JOB %d] Remote compaction failed.",
compaction_input.column_family.name.c_str(), job_id_);
ROCKS_LOG_WARN(
db_options_.info_log, "[%s] [JOB %d] Remote compaction failed.",
compaction->column_family_data()->GetName().c_str(), job_id_);
return compaction_status;
}
// CompactionServiceJobStatus::kSuccess was returned, but somehow we failed to
// read the result. Consider this as an installation failure
if (!s.ok()) {
sub_compact->status = s;
compaction_result.status.PermitUncheckedError();
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
return CompactionServiceJobStatus::kFailure;
}
sub_compact->status = compaction_result.status;
@ -154,18 +166,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
is_first_one = false;
}
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Receive remote compaction result, output path: "
"%s, files: %s",
compaction_input.column_family.name.c_str(), job_id_,
compaction_result.output_path.c_str(),
output_files_oss.str().c_str());
if (!s.ok()) {
sub_compact->status = s;
return CompactionServiceJobStatus::kFailure;
}
ROCKS_LOG_INFO(
db_options_.info_log,
"[%s] [JOB %d] Received remote compaction result, output path: "
"%s, files: %s",
compaction->column_family_data()->GetName().c_str(), job_id_,
compaction_result.output_path.c_str(), output_files_oss.str().c_str());
// Installation Starts
for (const auto& file : compaction_result.output_files) {
uint64_t file_num = versions_->NewFileNumber();
auto src_file = compaction_result.output_path + "/" + file.file_name;
@ -174,6 +182,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
if (!s.ok()) {
sub_compact->status = s;
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
return CompactionServiceJobStatus::kFailure;
}
@ -182,6 +192,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
if (!s.ok()) {
sub_compact->status = s;
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
return CompactionServiceJobStatus::kFailure;
}
meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
@ -191,6 +203,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
meta.oldest_ancester_time = file.oldest_ancester_time;
meta.file_creation_time = file.file_creation_time;
meta.epoch_number = file.epoch_number;
meta.file_checksum = file.file_checksum;
meta.file_checksum_func_name = file.file_checksum_func_name;
meta.marked_for_compaction = file.marked_for_compaction;
meta.unique_id = file.unique_id;
@ -198,14 +212,19 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
sub_compact->Current().AddOutput(std::move(meta),
cfd->internal_comparator(), false, true,
file.paranoid_hash);
sub_compact->Current().UpdateTableProperties(file.table_properties);
}
sub_compact->compaction_job_stats = compaction_result.stats;
sub_compact->Current().SetNumOutputRecords(
compaction_result.num_output_records);
sub_compact->Current().SetTotalBytes(compaction_result.total_bytes);
compaction_result.stats.num_output_records);
sub_compact->Current().SetNumOutputFiles(
compaction_result.stats.num_output_files);
sub_compact->Current().AddBytesWritten(compaction_result.bytes_written);
RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
compaction_result.bytes_written);
db_options_.compaction_service->OnInstallation(
response.scheduled_job_id, CompactionServiceJobStatus::kSuccess);
return CompactionServiceJobStatus::kSuccess;
}
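With this change every terminal path of the installation phase reports back through CompactionService::OnInstallation, not just the success path. A toy standalone sketch of that contract (the types are stand-ins; the real callback lives on the user-provided CompactionService):

#include <iostream>
#include <string>
#include <vector>

enum class JobStatus { kSuccess, kFailure };

// Stand-in for the user-provided CompactionService.
struct ServiceSketch {
  void OnInstallation(const std::string& scheduled_job_id, JobStatus status) {
    std::cout << "job " << scheduled_job_id << " installation "
              << (status == JobStatus::kSuccess ? "succeeded" : "failed")
              << "\n";
  }
};

// Stand-in for the rename-and-record loop over the remote output files.
bool InstallOutputs(const std::vector<std::string>& files) {
  return !files.empty();  // pretend installation fails with no files
}

JobStatus InstallRemoteResult(ServiceSketch& service, const std::string& job_id,
                              const std::vector<std::string>& output_files) {
  const JobStatus status = InstallOutputs(output_files) ? JobStatus::kSuccess
                                                        : JobStatus::kFailure;
  // Reported on success and failure alike, so the service always learns the
  // final fate of the job it scheduled.
  service.OnInstallation(job_id, status);
  return status;
}

int main() {
  ServiceSketch service;
  InstallRemoteResult(service, "job-1", {"000012.sst"});
  InstallRemoteResult(service, "job-2", {});
}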
@ -220,6 +239,18 @@ void CompactionServiceCompactionJob::RecordCompactionIOStats() {
CompactionJob::RecordCompactionIOStats();
}
void CompactionServiceCompactionJob::UpdateCompactionJobStats(
const InternalStats::CompactionStats& stats) const {
compaction_job_stats_->elapsed_micros = stats.micros;
// output information only in remote compaction
compaction_job_stats_->total_output_bytes = stats.bytes_written;
compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
compaction_job_stats_->num_output_records = stats.num_output_records;
compaction_job_stats_->num_output_files = stats.num_output_files;
compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
}
CompactionServiceCompactionJob::CompactionServiceCompactionJob(
int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
@ -255,11 +286,11 @@ Status CompactionServiceCompactionJob::Run() {
auto* c = compact_->compaction;
assert(c->column_family_data() != nullptr);
assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
compact_->compaction->level()) > 0);
const VersionStorageInfo* storage_info = c->input_version()->storage_info();
assert(storage_info);
assert(storage_info->NumLevelFiles(compact_->compaction->level()) > 0);
write_hint_ = storage_info->CalculateSSTWriteHint(c->output_level());
write_hint_ =
c->column_family_data()->CalculateSSTWriteHint(c->output_level());
bottommost_level_ = c->bottommost_level();
Slice begin = compaction_input_.begin;
@ -274,6 +305,9 @@ Status CompactionServiceCompactionJob::Run() {
log_buffer_->FlushBufferToLog();
LogCompaction();
compaction_result_->stats.Reset();
const uint64_t start_micros = db_options_.clock->NowMicros();
c->GetOrInitInputTableProperties();
@ -314,20 +348,32 @@ Status CompactionServiceCompactionJob::Run() {
if (status.ok()) {
status = io_s;
}
if (status.ok()) {
// TODO: Add verify_table()
}
// Finish up all book-keeping to unify the subcompaction results
compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
UpdateCompactionStats();
RecordCompactionIOStats();
LogFlush(db_options_.info_log);
compact_->status = status;
compact_->status.PermitUncheckedError();
// Build compaction result
// Build Compaction Job Stats
// 1. Aggregate CompactionOutputStats into Internal Compaction Stats
// (compaction_stats_) and aggregate Compaction Job Stats
// (compaction_job_stats_) from the sub compactions
compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
// 2. Update the Output information in the Compaction Job Stats with
// aggregated Internal Compaction Stats.
UpdateCompactionJobStats(compaction_stats_.stats);
// 3. Set fields that are not propagated as part of aggregations above
compaction_result_->stats.is_manual_compaction = c->is_manual_compaction();
compaction_result_->stats.is_full_compaction = c->is_full_compaction();
compaction_result_->stats.is_remote_compaction = true;
// 4. Update IO Stats that are not part of the aggregations above (bytes_read,
// bytes_written)
RecordCompactionIOStats();
// Build Output
compaction_result_->output_level = compact_->compaction->output_level();
compaction_result_->output_path = output_path_;
for (const auto& output_file : sub_compact->GetOutputs()) {
@ -336,16 +382,14 @@ Status CompactionServiceCompactionJob::Run() {
MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
meta.largest.Encode().ToString(), meta.oldest_ancester_time,
meta.file_creation_time, meta.epoch_number,
output_file.validator.GetHash(), meta.marked_for_compaction,
meta.unique_id);
meta.file_creation_time, meta.epoch_number, meta.file_checksum,
meta.file_checksum_func_name, output_file.validator.GetHash(),
meta.marked_for_compaction, meta.unique_id,
*output_file.table_properties);
}
InternalStats::CompactionStatsFull compaction_stats;
sub_compact->AggregateCompactionStats(compaction_stats);
compaction_result_->num_output_records =
compaction_stats.stats.num_output_records;
compaction_result_->total_bytes = compaction_stats.TotalBytesWritten();
TEST_SYNC_POINT_CALLBACK("CompactionServiceCompactionJob::Run:0",
&compaction_result_);
return status;
}
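Step 2 above is the new UpdateCompactionJobStats helper: the output-side counters of the returned CompactionJobStats are copied from the aggregated internal compaction stats instead of being recomputed from the outputs. A standalone sketch of that field mapping with simplified stand-in structs:

#include <cstdint>
#include <iostream>

// Stand-ins for InternalStats::CompactionStats and CompactionJobStats.
struct InternalStatsSketch {
  uint64_t micros = 0;
  uint64_t bytes_written = 0;
  uint64_t bytes_written_blob = 0;
  uint64_t num_output_records = 0;
  uint64_t num_output_files = 0;
  uint64_t num_output_files_blob = 0;
};

struct JobStatsSketch {
  uint64_t elapsed_micros = 0;
  uint64_t total_output_bytes = 0;
  uint64_t total_output_bytes_blob = 0;
  uint64_t num_output_records = 0;
  uint64_t num_output_files = 0;
  uint64_t num_output_files_blob = 0;
};

// Mirrors the idea of UpdateCompactionJobStats(): copy the aggregated output
// counters into the job stats that travel back in the compaction result.
void UpdateJobStats(const InternalStatsSketch& in, JobStatsSketch* out) {
  out->elapsed_micros = in.micros;
  out->total_output_bytes = in.bytes_written;
  out->total_output_bytes_blob = in.bytes_written_blob;
  out->num_output_records = in.num_output_records;
  out->num_output_files = in.num_output_files;
  out->num_output_files_blob = in.num_output_files_blob;
}

int main() {
  InternalStatsSketch internal{1200, 4096, 0, 100, 2, 0};
  JobStatsSketch job;
  UpdateJobStats(internal, &job);
  std::cout << job.num_output_files << " files, " << job.total_output_bytes
            << " bytes written\n";  // 2 files, 4096 bytes written
}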
@ -398,42 +442,9 @@ static std::unordered_map<std::string, OptionTypeInfo> cfd_type_info = {
};
static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
{"column_family",
OptionTypeInfo::Struct(
"column_family", &cfd_type_info,
offsetof(struct CompactionServiceInput, column_family),
OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
{"db_options",
{offsetof(struct CompactionServiceInput, db_options),
OptionType::kConfigurable, OptionVerificationType::kNormal,
OptionTypeFlags::kNone,
[](const ConfigOptions& opts, const std::string& /*name*/,
const std::string& value, void* addr) {
auto options = static_cast<DBOptions*>(addr);
return GetDBOptionsFromString(opts, DBOptions(), value, options);
},
[](const ConfigOptions& opts, const std::string& /*name*/,
const void* addr, std::string* value) {
const auto options = static_cast<const DBOptions*>(addr);
std::string result;
auto status = GetStringFromDBOptions(opts, *options, &result);
*value = "{" + result + "}";
return status;
},
[](const ConfigOptions& opts, const std::string& name, const void* addr1,
const void* addr2, std::string* mismatch) {
const auto this_one = static_cast<const DBOptions*>(addr1);
const auto that_one = static_cast<const DBOptions*>(addr2);
auto this_conf = DBOptionsAsConfigurable(*this_one);
auto that_conf = DBOptionsAsConfigurable(*that_one);
std::string mismatch_opt;
bool result =
this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
if (!result) {
*mismatch = name + "." + mismatch_opt;
}
return result;
}}},
{"cf_name",
{offsetof(struct CompactionServiceInput, cf_name),
OptionType::kEncodedString}},
{"snapshots", OptionTypeInfo::Vector<uint64_t>(
offsetof(struct CompactionServiceInput, snapshots),
OptionVerificationType::kNormal, OptionTypeFlags::kNone,
@ -461,6 +472,10 @@ static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
{"end",
{offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString,
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
{"options_file_number",
{offsetof(struct CompactionServiceInput, options_file_number),
OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
};
static std::unordered_map<std::string, OptionTypeInfo>
@ -497,6 +512,14 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct CompactionServiceOutputFile, epoch_number),
OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"file_checksum",
{offsetof(struct CompactionServiceOutputFile, file_checksum),
OptionType::kEncodedString, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"file_checksum_func_name",
{offsetof(struct CompactionServiceOutputFile, file_checksum_func_name),
OptionType::kEncodedString, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"paranoid_hash",
{offsetof(struct CompactionServiceOutputFile, paranoid_hash),
OptionType::kUInt64T, OptionVerificationType::kNormal,
@ -510,7 +533,30 @@ static std::unordered_map<std::string, OptionTypeInfo>
offsetof(struct CompactionServiceOutputFile, unique_id),
OptionVerificationType::kNormal, OptionTypeFlags::kNone,
{0, OptionType::kUInt64T})},
};
{"table_properties",
{offsetof(struct CompactionServiceOutputFile, table_properties),
OptionType::kStruct, OptionVerificationType::kNormal,
OptionTypeFlags::kNone,
[](const ConfigOptions& opts, const std::string& /*name*/,
const std::string& value, void* addr) {
auto table_properties = static_cast<TableProperties*>(addr);
return TableProperties::Parse(opts, value, table_properties);
},
[](const ConfigOptions& opts, const std::string& /*name*/,
const void* addr, std::string* value) {
const auto table_properties =
static_cast<const TableProperties*>(addr);
std::string result;
auto status = table_properties->Serialize(opts, &result);
*value = "{" + result + "}";
return status;
},
[](const ConfigOptions& opts, const std::string& /*name*/,
const void* addr1, const void* addr2, std::string* mismatch) {
const auto this_one = static_cast<const TableProperties*>(addr1);
const auto that_one = static_cast<const TableProperties*>(addr2);
return this_one->AreEqual(opts, that_one, mismatch);
}}}};
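The table_properties entry above wires a custom struct into the text-based wire format through three hooks: a parser, a serializer, and an equality check used to validate a round trip. A toy standalone sketch of the same pattern (not the real OptionTypeInfo machinery):

#include <cstdint>
#include <iostream>
#include <string>

struct ToyProps {
  uint64_t num_entries = 0;
};

// Parser hook: text -> struct.
bool ParseToyProps(const std::string& value, ToyProps* out) {
  out->num_entries = std::stoull(value);
  return true;
}

// Serializer hook: struct -> text.
bool SerializeToyProps(const ToyProps& p, std::string* out) {
  *out = std::to_string(p.num_entries);
  return true;
}

// Equality hook used to validate the round trip.
bool ToyPropsAreEqual(const ToyProps& a, const ToyProps& b) {
  return a.num_entries == b.num_entries;
}

int main() {
  ToyProps original{42}, round_tripped;
  std::string wire;
  SerializeToyProps(original, &wire);
  ParseToyProps(wire, &round_tripped);
  std::cout << "wire=" << wire
            << " equal=" << ToyPropsAreEqual(original, round_tripped) << "\n";
}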
static std::unordered_map<std::string, OptionTypeInfo>
compaction_job_stats_type_info = {
@ -557,6 +603,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct CompactionJobStats, is_manual_compaction),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
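// True when the compaction ran on a remote CompactionService worker; surfaced
// through CompactionJobStats in listener callbacks and compaction results.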
{"is_remote_compaction",
{offsetof(struct CompactionJobStats, is_remote_compaction),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"total_input_bytes",
{offsetof(struct CompactionJobStats, total_input_bytes),
OptionType::kUInt64T, OptionVerificationType::kNormal,
@ -725,14 +775,6 @@ static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
{offsetof(struct CompactionServiceResult, output_path),
OptionType::kEncodedString, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"num_output_records",
{offsetof(struct CompactionServiceResult, num_output_records),
OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"total_bytes",
{offsetof(struct CompactionServiceResult, total_bytes),
OptionType::kUInt64T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"bytes_read",
{offsetof(struct CompactionServiceResult, bytes_read),
OptionType::kUInt64T, OptionVerificationType::kNormal,

View File

@ -3,9 +3,9 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "rocksdb/utilities/options_util.h"
#include "table/unique_id_impl.h"
namespace ROCKSDB_NAMESPACE {
@ -21,8 +21,10 @@ class MyTestCompactionService : public CompactionService {
: db_path_(std::move(db_path)),
options_(options),
statistics_(statistics),
start_info_("na", "na", "na", 0, Env::TOTAL),
wait_info_("na", "na", "na", 0, Env::TOTAL),
start_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown,
false, false, false),
wait_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown,
false, false, false),
listeners_(listeners),
table_properties_collector_factories_(
std::move(table_properties_collector_factories)) {}
@ -97,8 +99,12 @@ class MyTestCompactionService : public CompactionService {
Status s =
DB::OpenAndCompact(options, db_path_, db_path_ + "/" + scheduled_job_id,
compaction_input, result, options_override);
if (is_override_wait_result_) {
*result = override_wait_result_;
{
InstrumentedMutexLock l(&mutex_);
if (is_override_wait_result_) {
*result = override_wait_result_;
}
result_ = *result;
}
compaction_num_.fetch_add(1);
if (s.ok()) {
@ -108,6 +114,11 @@ class MyTestCompactionService : public CompactionService {
}
}
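// Remember the final status reported at installation so tests can assert on
// it via GetFinalCompactionServiceJobStatus().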
void OnInstallation(const std::string& /*scheduled_job_id*/,
CompactionServiceJobStatus status) override {
final_updated_status_ = status;
}
int GetCompactionNum() { return compaction_num_.load(); }
CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
@ -136,6 +147,14 @@ class MyTestCompactionService : public CompactionService {
void SetCanceled(bool canceled) { canceled_ = canceled; }
void GetResult(CompactionServiceResult* deserialized) {
CompactionServiceResult::Read(result_, deserialized).PermitUncheckedError();
}
CompactionServiceJobStatus GetFinalCompactionServiceJobStatus() {
return final_updated_status_.load();
}
private:
InstrumentedMutex mutex_;
std::atomic_int compaction_num_{0};
@ -153,11 +172,14 @@ class MyTestCompactionService : public CompactionService {
CompactionServiceJobStatus override_wait_status_ =
CompactionServiceJobStatus::kFailure;
bool is_override_wait_result_ = false;
std::string result_;
std::string override_wait_result_;
std::vector<std::shared_ptr<EventListener>> listeners_;
std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
table_properties_collector_factories_;
std::atomic_bool canceled_{false};
std::atomic<CompactionServiceJobStatus> final_updated_status_{
CompactionServiceJobStatus::kUseLocal};
};
class CompactionServiceTest : public DBTestBase {
@ -255,6 +277,8 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
auto my_cs = GetCompactionService();
ASSERT_GE(my_cs->GetCompactionNum(), 1);
ASSERT_EQ(CompactionServiceJobStatus::kSuccess,
my_cs->GetFinalCompactionServiceJobStatus());
// make sure the compaction statistics is only recorded on the remote side
ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
@ -318,6 +342,34 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "cf_1", "cf_2", "cf_3"},
options);
ASSERT_GT(verify_passed, 0);
CompactionServiceResult result;
my_cs->GetResult(&result);
if (s.IsAborted()) {
ASSERT_NOK(result.status);
} else {
ASSERT_OK(result.status);
}
ASSERT_GE(result.stats.elapsed_micros, 1);
ASSERT_GE(result.stats.cpu_micros, 1);
ASSERT_EQ(20, result.stats.num_output_records);
ASSERT_EQ(result.output_files.size(), result.stats.num_output_files);
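// Sum the sizes of the remote compaction's output files and check them
// against the reported stats.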
uint64_t total_size = 0;
for (auto output_file : result.output_files) {
std::string file_name = result.output_path + "/" + output_file.file_name;
uint64_t file_size = 0;
ASSERT_OK(options.env->GetFileSize(file_name, &file_size));
ASSERT_GT(file_size, 0);
total_size += file_size;
}
ASSERT_EQ(total_size, result.stats.total_output_bytes);
ASSERT_TRUE(result.stats.is_remote_compaction);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_FALSE(result.stats.is_full_compaction);
Close();
}
@ -356,6 +408,507 @@ TEST_F(CompactionServiceTest, ManualCompaction) {
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
VerifyTestData();
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
TEST_F(CompactionServiceTest, PreservedOptionsLocalCompaction) {
Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = 2;
options.disable_auto_compactions = true;
DestroyAndReopen(options);
Random rnd(301);
for (auto i = 0; i < 2; ++i) {
for (auto j = 0; j < 10; ++j) {
ASSERT_OK(
Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
}
ASSERT_OK(Flush());
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
auto compaction = static_cast<Compaction*>(arg);
std::string options_file_name = OptionsFileName(
dbname_,
compaction->input_version()->version_set()->options_file_number());
// Change option twice to make sure the very first OPTIONS file gets
// purged
ASSERT_OK(dbfull()->SetOptions(
{{"level0_file_num_compaction_trigger", "4"}}));
ASSERT_EQ(4, dbfull()->GetOptions().level0_file_num_compaction_trigger);
ASSERT_OK(dbfull()->SetOptions(
{{"level0_file_num_compaction_trigger", "6"}}));
ASSERT_EQ(6, dbfull()->GetOptions().level0_file_num_compaction_trigger);
dbfull()->TEST_DeleteObsoleteFiles();
// For non-remote compactions, the OPTIONS file can be deleted even while
// the compaction is still using the options captured at its start
Status s = env_->FileExists(options_file_name);
ASSERT_NOK(s);
ASSERT_TRUE(s.IsNotFound());
// Should be old value
ASSERT_EQ(2, compaction->mutable_cf_options()
->level0_file_num_compaction_trigger);
ASSERT_TRUE(dbfull()->min_options_file_numbers_.empty());
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ASSERT_TRUE(s.ok());
}
TEST_F(CompactionServiceTest, PreservedOptionsRemoteCompaction) {
// Unlike non-remote compactions, remote compactions preserve the OPTIONS file
Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = 2;
options.disable_auto_compactions = true;
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
Random rnd(301);
for (auto i = 0; i < 2; ++i) {
for (auto j = 0; j < 10; ++j) {
ASSERT_OK(
Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
}
ASSERT_OK(Flush());
}
bool is_primary_called = false;
// This will be called twice: once from the primary and once from the remote.
// Change the option only when called from the remote side; otherwise the
// compaction would pick up the new option
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", [&](void* /*arg*/) {
if (!is_primary_called) {
is_primary_called = true;
return;
}
// Change the option right before the compaction run
ASSERT_OK(dbfull()->SetOptions(
{{"level0_file_num_compaction_trigger", "4"}}));
ASSERT_EQ(4, dbfull()->GetOptions().level0_file_num_compaction_trigger);
dbfull()->TEST_DeleteObsoleteFiles();
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionServiceJob::ProcessKeyValueCompactionWithCompactionService",
[&](void* arg) {
auto input = static_cast<CompactionServiceInput*>(arg);
std::string options_file_name =
OptionsFileName(dbname_, input->options_file_number);
ASSERT_OK(env_->FileExists(options_file_name));
ASSERT_FALSE(dbfull()->min_options_file_numbers_.empty());
ASSERT_EQ(dbfull()->min_options_file_numbers_.front(),
input->options_file_number);
DBOptions db_options;
ConfigOptions config_options;
std::vector<ColumnFamilyDescriptor> all_column_families;
config_options.env = env_;
ASSERT_OK(LoadOptionsFromFile(config_options, options_file_name,
&db_options, &all_column_families));
bool has_cf = false;
for (auto& cf : all_column_families) {
if (cf.name == input->cf_name) {
// Should be old value
ASSERT_EQ(2, cf.options.level0_file_num_compaction_trigger);
has_cf = true;
}
}
ASSERT_TRUE(has_cf);
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
auto compaction = static_cast<Compaction*>(arg);
ASSERT_EQ(2, compaction->mutable_cf_options()
->level0_file_num_compaction_trigger);
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ASSERT_TRUE(s.ok());
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
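// Listener that checks the CompactionJobStats reported to
// OnCompactionBegin()/OnCompactionCompleted() against expected values.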
class EventVerifier : public EventListener {
public:
explicit EventVerifier(uint64_t expected_num_input_records,
size_t expected_num_input_files,
uint64_t expected_num_output_records,
size_t expected_num_output_files,
const std::string& expected_smallest_output_key_prefix,
const std::string& expected_largest_output_key_prefix,
bool expected_is_remote_compaction_on_begin,
bool expected_is_remote_compaction_on_complete)
: expected_num_input_records_(expected_num_input_records),
expected_num_input_files_(expected_num_input_files),
expected_num_output_records_(expected_num_output_records),
expected_num_output_files_(expected_num_output_files),
expected_smallest_output_key_prefix_(
expected_smallest_output_key_prefix),
expected_largest_output_key_prefix_(expected_largest_output_key_prefix),
expected_is_remote_compaction_on_begin_(
expected_is_remote_compaction_on_begin),
expected_is_remote_compaction_on_complete_(
expected_is_remote_compaction_on_complete) {}
void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
ASSERT_EQ(expected_num_input_files_, ci.input_files.size());
ASSERT_EQ(expected_num_input_files_, ci.input_file_infos.size());
ASSERT_EQ(expected_is_remote_compaction_on_begin_,
ci.stats.is_remote_compaction);
ASSERT_TRUE(ci.stats.is_manual_compaction);
ASSERT_FALSE(ci.stats.is_full_compaction);
}
void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
ASSERT_GT(ci.stats.elapsed_micros, 0);
ASSERT_GT(ci.stats.cpu_micros, 0);
ASSERT_EQ(expected_num_input_records_, ci.stats.num_input_records);
ASSERT_EQ(expected_num_input_files_, ci.stats.num_input_files);
ASSERT_EQ(expected_num_output_records_, ci.stats.num_output_records);
ASSERT_EQ(expected_num_output_files_, ci.stats.num_output_files);
ASSERT_EQ(expected_smallest_output_key_prefix_,
ci.stats.smallest_output_key_prefix);
ASSERT_EQ(expected_largest_output_key_prefix_,
ci.stats.largest_output_key_prefix);
ASSERT_GT(ci.stats.total_input_bytes, 0);
ASSERT_GT(ci.stats.total_output_bytes, 0);
ASSERT_EQ(ci.stats.num_input_records,
ci.stats.num_output_records + ci.stats.num_records_replaced);
ASSERT_EQ(expected_is_remote_compaction_on_complete_,
ci.stats.is_remote_compaction);
ASSERT_TRUE(ci.stats.is_manual_compaction);
ASSERT_FALSE(ci.stats.is_full_compaction);
}
private:
uint64_t expected_num_input_records_;
size_t expected_num_input_files_;
uint64_t expected_num_output_records_;
size_t expected_num_output_files_;
std::string expected_smallest_output_key_prefix_;
std::string expected_largest_output_key_prefix_;
bool expected_is_remote_compaction_on_begin_;
bool expected_is_remote_compaction_on_complete_;
};
TEST_F(CompactionServiceTest, VerifyStats) {
Options options = CurrentOptions();
options.disable_auto_compactions = true;
auto event_verifier = std::make_shared<EventVerifier>(
30 /* expected_num_input_records */, 3 /* expected_num_input_files */,
20 /* expected_num_output_records */, 1 /* expected_num_output_files */,
"key00000" /* expected_smallest_output_key_prefix */,
"key00001" /* expected_largest_output_key_prefix */,
true /* expected_is_remote_compaction_on_begin */,
true /* expected_is_remote_compaction_on_complete */);
options.listeners.push_back(event_verifier);
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
std::string start_str = Key(0);
std::string end_str = Key(1);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
VerifyTestData();
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
TEST_F(CompactionServiceTest, VerifyStatsLocalFallback) {
Options options = CurrentOptions();
options.disable_auto_compactions = true;
auto event_verifier = std::make_shared<EventVerifier>(
30 /* expected_num_input_records */, 3 /* expected_num_input_files */,
20 /* expected_num_output_records */, 1 /* expected_num_output_files */,
"key00000" /* expected_smallest_output_key_prefix */,
"key00001" /* expected_largest_output_key_prefix */,
true /* expected_is_remote_compaction_on_begin */,
false /* expected_is_remote_compaction_on_complete */);
options.listeners.push_back(event_verifier);
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal);
std::string start_str = Key(0);
std::string end_str = Key(1);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
// Remote Compaction did not happen
ASSERT_EQ(my_cs->GetCompactionNum(), comp_num);
VerifyTestData();
}
TEST_F(CompactionServiceTest, CorruptedOutput) {
Options options = CurrentOptions();
options.disable_auto_compactions = true;
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
std::string start_str = Key(15);
std::string end_str = Key(45);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionServiceCompactionJob::Run:0", [&](void* arg) {
CompactionServiceResult* compaction_result =
*(static_cast<CompactionServiceResult**>(arg));
ASSERT_TRUE(compaction_result != nullptr &&
!compaction_result->output_files.empty());
// Corrupt files here
for (const auto& output_file : compaction_result->output_files) {
std::string file_name =
compaction_result->output_path + "/" + output_file.file_name;
uint64_t file_size = 0;
Status s = options.env->GetFileSize(file_name, &file_size);
ASSERT_OK(s);
ASSERT_GT(file_size, 0);
ASSERT_OK(test::CorruptFile(env_, file_name, 0,
static_cast<int>(file_size),
true /* verifyChecksum */));
}
});
SyncPoint::GetInstance()->EnableProcessing();
// CompactRange() should fail
Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
ASSERT_NOK(s);
ASSERT_TRUE(s.IsCorruption());
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
// On the worker side, the compaction is considered a success
// Verification is done on the primary side
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
TEST_F(CompactionServiceTest, CorruptedOutputParanoidFileCheck) {
for (bool paranoid_file_check_enabled : {false, true}) {
SCOPED_TRACE("paranoid_file_check_enabled=" +
std::to_string(paranoid_file_check_enabled));
Options options = CurrentOptions();
Destroy(options);
options.disable_auto_compactions = true;
options.paranoid_file_checks = paranoid_file_check_enabled;
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
std::string start_str = Key(15);
std::string end_str = Key(45);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionServiceCompactionJob::Run:0", [&](void* arg) {
CompactionServiceResult* compaction_result =
*(static_cast<CompactionServiceResult**>(arg));
ASSERT_TRUE(compaction_result != nullptr &&
!compaction_result->output_files.empty());
// Corrupt files here
for (const auto& output_file : compaction_result->output_files) {
std::string file_name =
compaction_result->output_path + "/" + output_file.file_name;
// Corrupt a very small range of bytes. The corruption is so small that it
// isn't caught by the default lightweight check
ASSERT_OK(test::CorruptFile(env_, file_name, 0, 1,
false /* verifyChecksum */));
}
});
SyncPoint::GetInstance()->EnableProcessing();
Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
if (paranoid_file_check_enabled) {
ASSERT_NOK(s);
ASSERT_EQ(Status::Corruption("Paranoid checksums do not match"), s);
} else {
// CompactRange() goes through if paranoid file check is not enabled
ASSERT_OK(s);
}
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
// On the worker side, the compaction is considered a success
// Verification is done on the primary side
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
}
TEST_F(CompactionServiceTest, TruncatedOutput) {
Options options = CurrentOptions();
options.disable_auto_compactions = true;
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
std::string start_str = Key(15);
std::string end_str = Key(45);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionServiceCompactionJob::Run:0", [&](void* arg) {
CompactionServiceResult* compaction_result =
*(static_cast<CompactionServiceResult**>(arg));
ASSERT_TRUE(compaction_result != nullptr &&
!compaction_result->output_files.empty());
// Truncate files here
for (const auto& output_file : compaction_result->output_files) {
std::string file_name =
compaction_result->output_path + "/" + output_file.file_name;
uint64_t file_size = 0;
Status s = options.env->GetFileSize(file_name, &file_size);
ASSERT_OK(s);
ASSERT_GT(file_size, 0);
ASSERT_OK(test::TruncateFile(env_, file_name, file_size / 2));
}
});
SyncPoint::GetInstance()->EnableProcessing();
// CompactRange() should fail
Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
ASSERT_NOK(s);
ASSERT_TRUE(s.IsCorruption());
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
// On the worker side, the compaction is considered a success
// Verification is done on the primary side
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
TEST_F(CompactionServiceTest, CustomFileChecksum) {
Options options = CurrentOptions();
options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
ReopenWithCompactionService(&options);
GenerateTestData();
auto my_cs = GetCompactionService();
std::string start_str = Key(15);
std::string end_str = Key(45);
Slice start(start_str);
Slice end(end_str);
uint64_t comp_num = my_cs->GetCompactionNum();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionServiceCompactionJob::Run:0", [&](void* arg) {
CompactionServiceResult* compaction_result =
*(static_cast<CompactionServiceResult**>(arg));
ASSERT_TRUE(compaction_result != nullptr &&
!compaction_result->output_files.empty());
// Validate the checksums of the output files here
for (const auto& output_file : compaction_result->output_files) {
std::string file_name =
compaction_result->output_path + "/" + output_file.file_name;
FileChecksumGenContext gen_context;
gen_context.file_name = file_name;
std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
gen_context);
std::unique_ptr<SequentialFile> file_reader;
uint64_t file_size = 0;
Status s = options.env->GetFileSize(file_name, &file_size);
ASSERT_OK(s);
ASSERT_GT(file_size, 0);
s = options.env->NewSequentialFile(file_name, &file_reader,
EnvOptions());
ASSERT_OK(s);
Slice result;
std::unique_ptr<char[]> scratch(new char[file_size]);
s = file_reader->Read(file_size, &result, scratch.get());
ASSERT_OK(s);
file_checksum_gen->Update(scratch.get(), result.size());
file_checksum_gen->Finalize();
// Verify actual checksum and the func name
ASSERT_EQ(file_checksum_gen->Name(),
output_file.file_checksum_func_name);
ASSERT_EQ(file_checksum_gen->GetChecksum(),
output_file.file_checksum);
}
});
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
CompactionServiceResult result;
my_cs->GetResult(&result);
ASSERT_OK(result.status);
ASSERT_TRUE(result.stats.is_manual_compaction);
ASSERT_TRUE(result.stats.is_remote_compaction);
}
TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) {
@ -437,6 +990,8 @@ TEST_F(CompactionServiceTest, InvalidResult) {
Slice end(end_str);
Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
ASSERT_FALSE(s.ok());
ASSERT_EQ(CompactionServiceJobStatus::kFailure,
my_cs->GetFinalCompactionServiceJobStatus());
}
TEST_F(CompactionServiceTest, SubCompaction) {
@ -586,11 +1141,20 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
{file.db_path + "/" + file.name}, 2));
info = my_cs->GetCompactionInfoForStart();
ASSERT_EQ(Env::USER, info.priority);
ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
ASSERT_EQ(true, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
info = my_cs->GetCompactionInfoForWait();
ASSERT_EQ(Env::USER, info.priority);
ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
ASSERT_EQ(true, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
// Test priority BOTTOM
env_->SetBackgroundThreads(1, Env::BOTTOM);
// This will set bottommost_level = true but is_full_compaction = false
options.num_levels = 2;
ReopenWithCompactionService(&options);
my_cs =
@ -613,9 +1177,71 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
info = my_cs->GetCompactionInfoForStart();
ASSERT_EQ(CompactionReason::kLevelL0FilesNum, info.compaction_reason);
ASSERT_EQ(false, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
ASSERT_EQ(Env::BOTTOM, info.priority);
info = my_cs->GetCompactionInfoForWait();
ASSERT_EQ(Env::BOTTOM, info.priority);
ASSERT_EQ(CompactionReason::kLevelL0FilesNum, info.compaction_reason);
ASSERT_EQ(false, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
// Test Non-Bottommost Level
options.num_levels = 4;
ReopenWithCompactionService(&options);
my_cs =
static_cast_with_check<MyTestCompactionService>(GetCompactionService());
for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
for (int j = 0; j < 10; j++) {
int key_id = i * 10 + j;
ASSERT_OK(Put(Key(key_id), "value_new_new" + std::to_string(key_id)));
}
ASSERT_OK(Flush());
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
info = my_cs->GetCompactionInfoForStart();
ASSERT_EQ(false, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(false, info.bottommost_level);
info = my_cs->GetCompactionInfoForWait();
ASSERT_EQ(false, info.is_manual_compaction);
ASSERT_EQ(false, info.is_full_compaction);
ASSERT_EQ(false, info.bottommost_level);
// Test Full Compaction + Bottommost Level
options.num_levels = 6;
ReopenWithCompactionService(&options);
my_cs =
static_cast_with_check<MyTestCompactionService>(GetCompactionService());
for (int i = 0; i < 20; i++) {
for (int j = 0; j < 10; j++) {
int key_id = i * 10 + j;
ASSERT_OK(Put(Key(key_id), "value_new_new" + std::to_string(key_id)));
}
ASSERT_OK(Flush());
}
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
info = my_cs->GetCompactionInfoForStart();
ASSERT_EQ(true, info.is_manual_compaction);
ASSERT_EQ(true, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
info = my_cs->GetCompactionInfoForWait();
ASSERT_EQ(true, info.is_manual_compaction);
ASSERT_EQ(true, info.is_full_compaction);
ASSERT_EQ(true, info.bottommost_level);
ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
}
TEST_F(CompactionServiceTest, FallbackLocalAuto) {

View File

@ -39,7 +39,7 @@ void CompactionState::AggregateCompactionStats(
InternalStats::CompactionStatsFull& compaction_stats,
CompactionJobStats& compaction_job_stats) {
for (const auto& sc : sub_compact_states) {
sc.AggregateCompactionStats(compaction_stats);
sc.AggregateCompactionOutputStats(compaction_stats);
compaction_job_stats.Add(sc.compaction_job_stats);
}
}

View File

@ -13,7 +13,7 @@
#include "rocksdb/sst_partitioner.h"
namespace ROCKSDB_NAMESPACE {
void SubcompactionState::AggregateCompactionStats(
void SubcompactionState::AggregateCompactionOutputStats(
InternalStats::CompactionStatsFull& compaction_stats) const {
compaction_stats.stats.Add(compaction_outputs_.stats_);
if (HasPenultimateLevelOutputs()) {
@ -34,9 +34,16 @@ void SubcompactionState::Cleanup(Cache* cache) {
if (!status.ok()) {
for (const auto& out : GetOutputs()) {
// If this file was inserted into the table cache then remove
// them here because this compaction was not committed.
TableCache::Evict(cache, out.meta.fd.GetNumber());
// If this file was inserted into the table cache then remove it here
// because this compaction was not committed. This is not strictly
// required because of a backstop TableCache::Evict() in
// PurgeObsoleteFiles() but is our opportunity to apply
// uncache_aggressiveness. TODO: instead, put these files into the
// VersionSet::obsolete_files_ pipeline so that they don't have to
// be picked up by scanning the DB directory.
TableCache::ReleaseObsolete(
cache, out.meta.fd.GetNumber(), nullptr /*handle*/,
compaction->mutable_cf_options()->uncache_aggressiveness);
}
}
// TODO: sub_compact.io_status is not checked like status. Not sure if that's

View File

@ -179,7 +179,7 @@ class SubcompactionState {
void Cleanup(Cache* cache);
void AggregateCompactionStats(
void AggregateCompactionOutputStats(
InternalStats::CompactionStatsFull& compaction_stats) const;
CompactionOutputs& Current() const {

View File

@ -2512,6 +2512,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromMemtables) {
start_time + kSecondsPerRecording * (i + 1));
}
}
ASSERT_EQ(kNumKeys, i);
ASSERT_OK(iter->status());
}
@ -2531,12 +2532,13 @@ TEST_P(IteratorWriteTimeTest, ReadFromMemtables) {
}
}
ASSERT_OK(iter->status());
ASSERT_EQ(-1, i);
}
// Reopen the DB and disable the seqno-to-time recording; data with a
// user-specified write time can still get a write time before it's flushed.
options.preserve_internal_time_seconds = 0;
DestroyAndReopen(options);
Reopen(options);
ASSERT_OK(TimedPut(Key(kKeyWithWriteTime), rnd.RandomString(100),
kUserSpecifiedWriteTime));
{
@ -2613,6 +2615,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) {
}
}
ASSERT_OK(iter->status());
ASSERT_EQ(kNumKeys, i);
}
// Backward iteration
@ -2632,12 +2635,13 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) {
}
}
ASSERT_OK(iter->status());
ASSERT_EQ(-1, i);
}
// Reopen the DB and disable the seqno-to-time recording. Data retrieved from
// SST files still has its write time available.
options.preserve_internal_time_seconds = 0;
DestroyAndReopen(options);
Reopen(options);
dbfull()->TEST_WaitForPeriodicTaskRun(
[&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); });
@ -2663,6 +2667,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) {
start_time + kSecondsPerRecording * (i + 1));
}
}
ASSERT_EQ(kNumKeys, i);
ASSERT_OK(iter->status());
}
@ -2686,6 +2691,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) {
VerifyKeyAndWriteTime(iter.get(), Key(i), 0);
}
ASSERT_OK(iter->status());
ASSERT_EQ(kNumKeys, i);
}
Close();
}

View File

@ -87,7 +87,7 @@ Status VerifySstFileChecksumInternal(const Options& options,
options.block_protection_bytes_per_key, false /* skip_filters */,
!kImmortal, false /* force_direct_prefetch */, -1 /* level */);
reader_options.largest_seqno = largest_seqno;
s = ioptions.table_factory->NewTableReader(
s = options.table_factory->NewTableReader(
read_options, reader_options, std::move(file_reader), file_size,
&table_reader, false /* prefetch_index_and_filter_in_cache */);
if (!s.ok()) {

View File

@ -688,76 +688,100 @@ TEST_F(DBBasicTest, IdentityAcrossRestarts) {
constexpr size_t kMinIdSize = 10;
do {
for (bool with_manifest : {false, true}) {
std::string idfilename = IdentityFileName(dbname_);
std::string id1, tmp;
ASSERT_OK(db_->GetDbIdentity(id1));
ASSERT_GE(id1.size(), kMinIdSize);
for (bool write_file : {false, true}) {
std::string idfilename = IdentityFileName(dbname_);
std::string id1, tmp;
ASSERT_OK(db_->GetDbIdentity(id1));
ASSERT_GE(id1.size(), kMinIdSize);
Options options = CurrentOptions();
options.write_dbid_to_manifest = with_manifest;
Reopen(options);
std::string id2;
ASSERT_OK(db_->GetDbIdentity(id2));
// id2 should match id1 because identity was not regenerated
ASSERT_EQ(id1, id2);
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id2);
Options options = CurrentOptions();
options.write_dbid_to_manifest = with_manifest;
options.write_identity_file = true; // initially
Reopen(options);
std::string id2;
ASSERT_OK(db_->GetDbIdentity(id2));
// id2 should match id1 because identity was not regenerated
ASSERT_EQ(id1, id2);
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id2);
// Recover from deleted/missing IDENTITY
ASSERT_OK(env_->DeleteFile(idfilename));
Reopen(options);
std::string id3;
ASSERT_OK(db_->GetDbIdentity(id3));
if (with_manifest) {
// id3 should match id1 because identity was restored from manifest
ASSERT_EQ(id1, id3);
} else {
// id3 should NOT match id1 because identity was regenerated
ASSERT_NE(id1, id3);
ASSERT_GE(id3.size(), kMinIdSize);
}
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id3);
if (write_file) {
// Recover from deleted/missing IDENTITY
ASSERT_OK(env_->DeleteFile(idfilename));
} else {
// Transition to no IDENTITY file
options.write_identity_file = false;
if (!with_manifest) {
// Incompatible options, should fail
ASSERT_NOK(TryReopen(options));
// Back to a usable config and continue
options.write_identity_file = true;
Reopen(options);
continue;
}
}
Reopen(options);
std::string id3;
ASSERT_OK(db_->GetDbIdentity(id3));
if (with_manifest) {
// id3 should match id1 because identity was restored from manifest
ASSERT_EQ(id1, id3);
} else {
// id3 should NOT match id1 because identity was regenerated
ASSERT_NE(id1, id3);
ASSERT_GE(id3.size(), kMinIdSize);
}
if (write_file) {
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id3);
// Recover from truncated IDENTITY
{
std::unique_ptr<WritableFile> w;
ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
ASSERT_OK(w->Close());
}
Reopen(options);
std::string id4;
ASSERT_OK(db_->GetDbIdentity(id4));
if (with_manifest) {
// id4 should match id1 because identity was restored from manifest
ASSERT_EQ(id1, id4);
} else {
// id4 should NOT match id1 because identity was regenerated
ASSERT_NE(id1, id4);
ASSERT_GE(id4.size(), kMinIdSize);
}
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id4);
// Recover from truncated IDENTITY
std::unique_ptr<WritableFile> w;
ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
ASSERT_OK(w->Close());
} else {
ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound());
}
Reopen(options);
std::string id4;
ASSERT_OK(db_->GetDbIdentity(id4));
if (with_manifest) {
// id4 should match id1 because identity was restored from manifest
ASSERT_EQ(id1, id4);
} else {
// id4 should NOT match id1 because identity was regenerated
ASSERT_NE(id1, id4);
ASSERT_GE(id4.size(), kMinIdSize);
}
std::string silly_id = "asdf123456789";
if (write_file) {
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id4);
// Recover from overwritten IDENTITY
std::string silly_id = "asdf123456789";
{
std::unique_ptr<WritableFile> w;
ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
ASSERT_OK(w->Append(silly_id));
ASSERT_OK(w->Close());
// Recover from overwritten IDENTITY
std::unique_ptr<WritableFile> w;
ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
ASSERT_OK(w->Append(silly_id));
ASSERT_OK(w->Close());
} else {
ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound());
}
Reopen(options);
std::string id5;
ASSERT_OK(db_->GetDbIdentity(id5));
if (with_manifest) {
// id5 should match id1 because identity was restored from manifest
ASSERT_EQ(id1, id5);
} else {
ASSERT_EQ(id5, silly_id);
}
if (write_file) {
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id5);
} else {
ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound());
}
}
Reopen(options);
std::string id5;
ASSERT_OK(db_->GetDbIdentity(id5));
if (with_manifest) {
// id5 should match id1 because identity was restored from manifest
ASSERT_EQ(id1, id5);
} else {
ASSERT_EQ(id5, silly_id);
}
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
ASSERT_EQ(tmp, id5);
}
} while (ChangeCompactOptions());
}
@ -3407,6 +3431,46 @@ class TableFileListener : public EventListener {
InstrumentedMutex mutex_;
std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
};
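// Records table files created by flush, keyed by column family name.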
class FlushTableFileListener : public EventListener {
public:
void OnTableFileCreated(const TableFileCreationInfo& info) override {
InstrumentedMutexLock lock(&mutex_);
if (info.reason != TableFileCreationReason::kFlush) {
return;
}
cf_to_flushed_files_[info.cf_name].push_back(info.file_path);
}
std::vector<std::string>& GetFlushedFiles(const std::string& cf_name) {
InstrumentedMutexLock lock(&mutex_);
return cf_to_flushed_files_[cf_name];
}
private:
InstrumentedMutex mutex_;
std::unordered_map<std::string, std::vector<std::string>>
cf_to_flushed_files_;
};
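// Records blob files created by flush, keyed by column family name.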
class FlushBlobFileListener : public EventListener {
public:
void OnBlobFileCreated(const BlobFileCreationInfo& info) override {
InstrumentedMutexLock lock(&mutex_);
if (info.reason != BlobFileCreationReason::kFlush) {
return;
}
cf_to_flushed_blobs_files_[info.cf_name].push_back(info.file_path);
}
std::vector<std::string>& GetFlushedBlobFiles(const std::string& cf_name) {
InstrumentedMutexLock lock(&mutex_);
return cf_to_flushed_blobs_files_[cf_name];
}
private:
InstrumentedMutex mutex_;
std::unordered_map<std::string, std::vector<std::string>>
cf_to_flushed_blobs_files_;
};
} // anonymous namespace
TEST_F(DBBasicTest, LastSstFileNotInManifest) {
@ -3512,6 +3576,121 @@ TEST_F(DBBasicTest, RecoverWithMissingFiles) {
}
}
// Param 0: whether to enable blob DB.
// Param 1: when blob DB is enabled, whether to also delete the missing L0
// file's associated blob file.
class BestEffortsRecoverIncompleteVersionTest
: public DBTestBase,
public testing::WithParamInterface<std::tuple<bool, bool>> {
public:
BestEffortsRecoverIncompleteVersionTest()
: DBTestBase("best_efforts_recover_incomplete_version_test",
/*env_do_fsync=*/false) {}
};
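// Deletes the most recently flushed L0 file (and optionally its blob file),
// then verifies that best-efforts recovery opens the DB with the remaining
// files and that the data is still readable.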
TEST_P(BestEffortsRecoverIncompleteVersionTest, Basic) {
Options options = CurrentOptions();
options.enable_blob_files = std::get<0>(GetParam());
bool delete_blob_file_too = std::get<1>(GetParam());
DestroyAndReopen(options);
FlushTableFileListener* flush_table_listener = new FlushTableFileListener();
FlushBlobFileListener* flush_blob_listener = new FlushBlobFileListener();
// Disable auto compaction to simplify SST file name tracking.
options.disable_auto_compactions = true;
options.listeners.emplace_back(flush_table_listener);
options.listeners.emplace_back(flush_blob_listener);
CreateAndReopenWithCF({"pikachu", "eevee"}, options);
std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
"eevee"};
int num_cfs = static_cast<int>(handles_.size());
ASSERT_EQ(3, num_cfs);
std::string start = "a";
Slice start_slice = start;
std::string end = "d";
Slice end_slice = end;
for (int cf = 0; cf != num_cfs; ++cf) {
ASSERT_OK(Put(cf, "a", "a_value"));
ASSERT_OK(Flush(cf));
// Compact file to L1 to avoid trivial file move in the next compaction
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
&start_slice, &end_slice));
ASSERT_OK(Put(cf, "a", "a_value_new"));
ASSERT_OK(Flush(cf));
ASSERT_OK(Put(cf, "b", "b_value"));
ASSERT_OK(Flush(cf));
ASSERT_OK(Put(cf, "f", "f_value"));
ASSERT_OK(Flush(cf));
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
&start_slice, &end_slice));
}
dbfull()->TEST_DeleteObsoleteFiles();
// Delete the most recent L0 file, flushed right before the last compaction.
for (int i = 0; i < num_cfs; ++i) {
std::vector<std::string>& files =
flush_table_listener->GetFlushedFiles(all_cf_names[i]);
ASSERT_EQ(4, files.size());
ASSERT_OK(env_->DeleteFile(files[files.size() - 1]));
if (options.enable_blob_files) {
std::vector<std::string>& blob_files =
flush_blob_listener->GetFlushedBlobFiles(all_cf_names[i]);
ASSERT_EQ(4, blob_files.size());
if (delete_blob_file_too) {
ASSERT_OK(env_->DeleteFile(blob_files[files.size() - 1]));
}
}
}
options.best_efforts_recovery = true;
ReopenWithColumnFamilies(all_cf_names, options);
for (int i = 0; i < num_cfs; ++i) {
auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
ColumnFamilyData* cfd = cfh->cfd();
VersionStorageInfo* vstorage = cfd->current()->storage_info();
// The L0 file flushed right before the last compaction is missing.
ASSERT_EQ(0, vstorage->LevelFiles(0).size());
// Only the output of the last compaction is available.
ASSERT_EQ(1, vstorage->LevelFiles(1).size());
}
// Verify data
ReadOptions read_opts;
read_opts.total_order_seek = true;
for (int i = 0; i < num_cfs; ++i) {
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[i]));
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("a", iter->key());
ASSERT_EQ("a_value_new", iter->value());
iter->Next();
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("b", iter->key());
ASSERT_EQ("b_value", iter->value());
iter->Next();
ASSERT_FALSE(iter->Valid());
ASSERT_OK(iter->status());
}
// Write more data.
for (int cf = 0; cf < num_cfs; ++cf) {
ASSERT_OK(Put(cf, "g", "g_value"));
ASSERT_OK(Flush(cf));
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
nullptr));
std::string value;
ASSERT_OK(db_->Get(ReadOptions(), handles_[cf], "g", &value));
ASSERT_EQ("g_value", value);
}
}
INSTANTIATE_TEST_CASE_P(BestEffortsRecoverIncompleteVersionTest,
BestEffortsRecoverIncompleteVersionTest,
testing::Values(std::make_tuple(false, false),
std::make_tuple(true, false),
std::make_tuple(true, true)));
TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) {
Options options = CurrentOptions();
options.env = env_;

View File

@ -563,7 +563,7 @@ TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) {
}
}
TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) {
TEST_F(DBBlockCacheTest, DynamicOptions) {
Options options = CurrentOptions();
options.create_if_missing = true;
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
@ -578,39 +578,74 @@ TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) {
DestroyAndReopen(options);
std::string value(kValueSize, 'a');
auto st = options.statistics;
for (size_t i = 1; i <= 5; i++) {
ASSERT_OK(Put(std::to_string(i), value));
ASSERT_OK(Flush());
ASSERT_EQ(1,
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
size_t i = 1;
ASSERT_OK(Put(std::to_string(i), value));
ASSERT_OK(Flush());
ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(value, Get(std::to_string(i)));
ASSERT_EQ(0,
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(
0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ(1,
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
}
ASSERT_EQ(value, Get(std::to_string(i)));
ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
++i;
ASSERT_OK(dbfull()->SetOptions(
{{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}}));
for (size_t i = 6; i <= kNumBlocks; i++) {
ASSERT_OK(Put(std::to_string(i), value));
ASSERT_OK(Flush());
ASSERT_EQ(0,
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_OK(Put(std::to_string(i), value));
ASSERT_OK(Flush());
ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(value, Get(std::to_string(i)));
ASSERT_EQ(1,
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(
1, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ(0,
options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
}
ASSERT_EQ(value, Get(std::to_string(i)));
ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
++i;
ASSERT_OK(dbfull()->SetOptions({{"block_based_table_factory",
"{prepopulate_block_cache=kFlushOnly;}"}}));
ASSERT_OK(Put(std::to_string(i), value));
ASSERT_OK(Flush());
ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(value, Get(std::to_string(i)));
ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
++i;
// NOT YET SUPPORTED
// FIXME: find a way to make this fail again (until well supported)
// ASSERT_NOK(dbfull()->SetOptions(
// {{"block_based_table_factory", "{block_cache=null;}"}}));
// ASSERT_OK(Put(std::to_string(i), value));
// ASSERT_OK(Flush());
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
// ASSERT_EQ(value, Get(std::to_string(i)));
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
// ++i;
// NOT YET SUPPORTED
// FIXME: find a way to make this fail again (until well supported)
// ASSERT_NOK(dbfull()->SetOptions(
// {{"block_based_table_factory", "{block_cache=1M;}"}}));
// ASSERT_OK(Put(std::to_string(i), value));
// ASSERT_OK(Flush());
// ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
// ASSERT_EQ(value, Get(std::to_string(i)));
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
// ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
// ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
}
#endif

File diff suppressed because it is too large

View File

@ -6146,7 +6146,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) {
std::vector<std::string> pending_compaction_cfs;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SchedulePendingCompaction::cfd", [&](void* arg) {
"EnqueuePendingCompaction::cfd", [&](void* arg) {
const std::string& cf_name =
static_cast<ColumnFamilyData*>(arg)->GetName();
pending_compaction_cfs.emplace_back(cf_name);
@ -9357,12 +9357,13 @@ TEST_F(DBCompactionTest, FIFOChangeTemperature) {
ASSERT_OK(Flush());
ASSERT_OK(Put(Key(0), "value1"));
env_->MockSleepForSeconds(800);
ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush());
// First two L0 files both become eligible for temperature change compaction
// They should be compacted one-by-one.
ASSERT_OK(Put(Key(0), "value1"));
env_->MockSleepForSeconds(800);
env_->MockSleepForSeconds(1200);
ASSERT_OK(Put(Key(2), "value2"));
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForCompact());
@ -10622,6 +10623,97 @@ TEST_F(DBCompactionTest, ReleaseCompactionDuringManifestWrite) {
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(DBCompactionTest, RecordNewestKeyTimeForTtlCompaction) {
Options options;
SetTimeElapseOnlySleepOnReopen(&options);
options.env = CurrentOptions().env;
options.compaction_style = kCompactionStyleFIFO;
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
options.write_buffer_size = 10 << 10; // 10KB
options.arena_block_size = 4096;
options.compression = kNoCompression;
options.create_if_missing = true;
options.compaction_options_fifo.allow_compaction = false;
options.num_levels = 1;
env_->SetMockSleep();
options.env = env_;
options.ttl = 1 * 60 * 60; // 1 hour
ASSERT_OK(TryReopen(options));
// Generate and flush 4 files, each about 10KB
// Compaction is manually disabled at this point so we can check
// each file's newest_key_time
Random rnd(301);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
}
ASSERT_OK(Flush());
env_->MockSleepForSeconds(5);
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(NumTableFilesAtLevel(0), 4);
// Check that we are populating newest_key_time on flush
std::vector<FileMetaData*> file_metadatas = GetLevelFileMetadatas(0);
ASSERT_EQ(file_metadatas.size(), 4);
uint64_t first_newest_key_time =
file_metadatas[0]->fd.table_reader->GetTableProperties()->newest_key_time;
ASSERT_NE(first_newest_key_time, kUnknownNewestKeyTime);
// Check that the newest_key_times are in expected ordering
uint64_t prev_newest_key_time = first_newest_key_time;
for (size_t idx = 1; idx < file_metadatas.size(); idx++) {
uint64_t newest_key_time = file_metadatas[idx]
->fd.table_reader->GetTableProperties()
->newest_key_time;
ASSERT_LT(newest_key_time, prev_newest_key_time);
prev_newest_key_time = newest_key_time;
ASSERT_EQ(newest_key_time, file_metadatas[idx]
->fd.table_reader->GetTableProperties()
->creation_time);
}
// The delta between the first and last newest_key_times is 15s
uint64_t last_newest_key_time = prev_newest_key_time;
ASSERT_EQ(15, first_newest_key_time - last_newest_key_time);
// After compaction, the newest_key_time of the output file should be the max
// of the input files
options.compaction_options_fifo.allow_compaction = true;
ASSERT_OK(TryReopen(options));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(NumTableFilesAtLevel(0), 1);
file_metadatas = GetLevelFileMetadatas(0);
ASSERT_EQ(file_metadatas.size(), 1);
ASSERT_EQ(
file_metadatas[0]->fd.table_reader->GetTableProperties()->newest_key_time,
first_newest_key_time);
// Contrast newest_key_time with creation_time, which records the oldest
// ancestor time (15s older than newest_key_time)
ASSERT_EQ(
file_metadatas[0]->fd.table_reader->GetTableProperties()->creation_time,
last_newest_key_time);
ASSERT_EQ(file_metadatas[0]->oldest_ancester_time, last_newest_key_time);
// Make sure TTL of 5s causes compaction
env_->MockSleepForSeconds(6);
// The oldest input file is older than 15s
// However the newest of the compaction input files is younger than 15s, so
// we don't compact
ASSERT_OK(dbfull()->SetOptions({{"ttl", "15"}}));
ASSERT_EQ(dbfull()->GetOptions().ttl, 15);
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(NumTableFilesAtLevel(0), 1);
// Now even the youngest input file is too old
ASSERT_OK(dbfull()->SetOptions({{"ttl", "5"}}));
ASSERT_EQ(dbfull()->GetOptions().ttl, 5);
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

View File

@ -289,10 +289,12 @@ TEST_F(DBFollowerTest, RetryCatchup) {
{"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
{"DBImpl::BackgroundCompaction:Start",
"DBImplFollower::TryCatchupWithLeader:Begin2"},
{"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1",
{"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin1",
"DBImpl::BackgroundCompaction:BeforeCompaction"},
{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"},
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin2"},
{"DBImplFollower::TryCatchupWithLeader:End", "Follower::WaitForCatchup"},
});
SyncPoint::GetInstance()->EnableProcessing();
@ -335,10 +337,12 @@ TEST_F(DBFollowerTest, RetryCatchupManifestRollover) {
SyncPoint::GetInstance()->LoadDependency({
{"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
{"Leader::Flushed", "DBImplFollower::TryCatchupWithLeader:Begin2"},
{"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1",
{"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin1",
"Leader::Done"},
{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"},
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
"Begin2"},
{"DBImplFollower::TryCatchupWithLeader:End",
"Follower::WaitForCatchup:1"},
});

View File

@ -17,6 +17,7 @@
#include <cstdio>
#include <map>
#include <memory>
#include <optional>
#include <set>
#include <sstream>
#include <stdexcept>
@ -472,7 +473,7 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
if (s.ok()) {
for (auto cfd : *versions_->GetColumnFamilySet()) {
SchedulePendingCompaction(cfd);
EnqueuePendingCompaction(cfd);
}
MaybeScheduleFlushOrCompaction();
}
@ -529,6 +530,11 @@ Status DBImpl::MaybeReleaseTimestampedSnapshotsAndCheck() {
return Status::OK();
}
void DBImpl::UntrackDataFiles() {
TrackOrUntrackFiles(/*existing_data_files=*/{},
/*track=*/false);
}
Status DBImpl::CloseHelper() {
// Guarantee that there is no background error recovery in progress before
// continuing with the shutdown
@ -653,8 +659,9 @@ Status DBImpl::CloseHelper() {
// We need to release them before the block cache is destroyed. The block
// cache may be destroyed inside versions_.reset(), when column family data
// list is destroyed, so leaving handles in table cache after
// versions_.reset() may cause issues.
// Here we clean all unreferenced handles in table cache.
// versions_.reset() may cause issues. Here we clean all unreferenced handles
// in table cache, and (for certain builds/conditions) assert that no obsolete
// files are hanging around unreferenced (leak) in the table/blob file cache.
// Now we assume all user queries have finished, so only version set itself
// can possibly hold the blocks from block cache. After releasing unreferenced
// handles here, only handles held by version set left and inside
@ -662,12 +669,22 @@ Status DBImpl::CloseHelper() {
// time a handle is released, we erase it from the cache too. By doing that,
// we can guarantee that after versions_.reset(), table cache is empty
// so the cache can be safely destroyed.
#ifndef NDEBUG
TEST_VerifyNoObsoleteFilesCached(/*db_mutex_already_held=*/true);
#endif // !NDEBUG
table_cache_->EraseUnRefEntries();
for (auto& txn_entry : recovered_transactions_) {
delete txn_entry.second;
}
// Return an unowned SstFileManager to a consistent state
if (immutable_db_options_.sst_file_manager && !own_sfm_) {
mutex_.Unlock();
UntrackDataFiles();
mutex_.Lock();
}
// versions need to be destroyed before table_cache since it can hold
// references to table_cache.
{
@ -835,10 +852,11 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
InstrumentedMutexLock l(&mutex_);
for (auto cfd : *versions_->GetColumnFamilySet()) {
auto& mopts = *cfd->GetLatestMutableCFOptions();
// The preserve time is the max of the two options.
uint64_t preserve_seconds =
std::max(cfd->ioptions()->preserve_internal_time_seconds,
cfd->ioptions()->preclude_last_level_data_seconds);
std::max(mopts.preserve_internal_time_seconds,
mopts.preclude_last_level_data_seconds);
if (!cfd->IsDropped() && preserve_seconds > 0) {
min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds);
max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds);
@ -1140,6 +1158,13 @@ void DBImpl::DumpStats() {
continue;
}
auto* table_factory =
cfd->GetCurrentMutableCFOptions()->table_factory.get();
assert(table_factory != nullptr);
// FIXME: need a shared_ptr if/when block_cache is going to be mutable
Cache* cache =
table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
// Release DB mutex for gathering cache entry stats. Pass over all
// column families for this first so that other stats are dumped
// near-atomically.
@ -1148,10 +1173,6 @@ void DBImpl::DumpStats() {
// Probe block cache for problems (if not already via another CF)
if (immutable_db_options_.info_log) {
auto* table_factory = cfd->ioptions()->table_factory.get();
assert(table_factory != nullptr);
Cache* cache =
table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
if (cache && probed_caches.insert(cache).second) {
cache->ReportProblems(immutable_db_options_.info_log);
}
@ -1525,7 +1546,7 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
io_s.ToString().c_str());
// In case there is a fs error, we should set it globally to prevent
// future writes
IOStatusCheck(io_s);
WALIOStatusCheck(io_s);
// whether sync or not, we should abort the rest of function upon error
return static_cast<Status>(io_s);
}
@ -1682,7 +1703,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
io_s.ToString().c_str());
// In case there is a fs error, we should set it globally to prevent
// future writes
IOStatusCheck(io_s);
WALIOStatusCheck(io_s);
}
if (io_s.ok() && need_wal_dir_sync) {
io_s = directories_.GetWalDir()->FsyncWithDirOptions(
@ -2053,15 +2074,19 @@ InternalIterator* DBImpl::NewInternalIterator(
bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
InternalIterator* internal_iter;
assert(arena != nullptr);
auto prefix_extractor =
super_version->mutable_cf_options.prefix_extractor.get();
// Need to create internal iterator from the arena.
MergeIteratorBuilder merge_iter_builder(
&cfd->internal_comparator(), arena,
!read_options.total_order_seek &&
super_version->mutable_cf_options.prefix_extractor != nullptr,
// FIXME? It's not clear what interpretation of prefix seek is needed
// here, and no unit test cares about the value provided here.
!read_options.total_order_seek && prefix_extractor != nullptr,
read_options.iterate_upper_bound);
// Collect iterator for mutable memtable
auto mem_iter = super_version->mem->NewIterator(
read_options, super_version->GetSeqnoToTimeMapping(), arena);
read_options, super_version->GetSeqnoToTimeMapping(), arena,
super_version->mutable_cf_options.prefix_extractor.get());
Status s;
if (!read_options.ignore_range_deletions) {
std::unique_ptr<TruncatedRangeDelIterator> mem_tombstone_iter;
@ -2085,6 +2110,7 @@ InternalIterator* DBImpl::NewInternalIterator(
if (s.ok()) {
super_version->imm->AddIterators(
read_options, super_version->GetSeqnoToTimeMapping(),
super_version->mutable_cf_options.prefix_extractor.get(),
&merge_iter_builder, !read_options.ignore_range_deletions);
}
TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
@ -2475,7 +2501,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
RecordTick(stats_, MEMTABLE_HIT);
}
}
if (!done && !s.ok() && !s.IsMergeInProgress()) {
if (!s.ok() && !s.IsMergeInProgress() && !s.IsNotFound()) {
assert(done);
ReturnAndCleanupSuperVersion(cfd, sv);
return s;
}
@ -3141,10 +3168,11 @@ Status DBImpl::MultiGetImpl(
StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
assert(sorted_keys);
assert(start_key + num_keys <= sorted_keys->size());
// Clear the timestamps for returning results so that we can distinguish
// between a tombstone and a key that has never been written
for (auto* kctx : *sorted_keys) {
assert(kctx);
for (size_t i = start_key; i < start_key + num_keys; ++i) {
KeyContext* kctx = (*sorted_keys)[i];
if (kctx->timestamp) {
kctx->timestamp->clear();
}
@ -3207,6 +3235,8 @@ Status DBImpl::MultiGetImpl(
s = Status::Aborted();
break;
}
// This could be a long-running operation
ROCKSDB_THREAD_YIELD_HOOK();
}
// Post processing (decrement reference counts and record statistics)
@ -3690,6 +3720,9 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
edit.SetColumnFamily(cfd->GetID());
Status s;
// Avoid re-acquiring the lock for RegisterRecordSeqnoTimeWorker when it is
// not applicable
bool used_preserve_preclude = false;
{
InstrumentedMutexLock l(&mutex_);
if (cfd->IsDropped()) {
@ -3705,9 +3738,11 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
write_thread_.ExitUnbatched(&w);
}
if (s.ok()) {
auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
mutable_cf_options->max_write_buffer_number;
auto& moptions = *cfd->GetLatestMutableCFOptions();
max_total_in_memory_state_ -=
moptions.write_buffer_size * moptions.max_write_buffer_number;
used_preserve_preclude = moptions.preserve_internal_time_seconds > 0 ||
moptions.preclude_last_level_data_seconds > 0;
}
if (!cf_support_snapshot) {
@ -3725,8 +3760,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
bg_cv_.SignalAll();
}
if (cfd->ioptions()->preserve_internal_time_seconds > 0 ||
cfd->ioptions()->preclude_last_level_data_seconds > 0) {
if (used_preserve_preclude) {
s = RegisterRecordSeqnoTimeWorker(read_options, write_options,
/* is_new_db */ false);
}
@ -3828,6 +3862,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
}
}
if (read_options.tailing) {
read_options.total_order_seek |=
immutable_db_options_.prefix_seek_opt_in_only;
auto iter = new ForwardIterator(this, read_options, cfd, sv,
/* allow_unprepared_value */ true);
result = NewDBIterator(
@ -3961,14 +3998,25 @@ std::unique_ptr<IterType> DBImpl::NewMultiCfIterator(
"Different comparators are being used across CFs"));
}
}
std::vector<Iterator*> child_iterators;
Status s = NewIterators(_read_options, column_families, &child_iterators);
if (!s.ok()) {
return error_iterator_func(s);
}
return std::make_unique<ImplType>(column_families[0]->GetComparator(),
column_families,
std::move(child_iterators));
assert(column_families.size() == child_iterators.size());
std::vector<std::pair<ColumnFamilyHandle*, std::unique_ptr<Iterator>>>
cfh_iter_pairs;
cfh_iter_pairs.reserve(column_families.size());
for (size_t i = 0; i < column_families.size(); ++i) {
cfh_iter_pairs.emplace_back(column_families[i], child_iterators[i]);
}
return std::make_unique<ImplType>(_read_options,
column_families[0]->GetComparator(),
std::move(cfh_iter_pairs));
}
Status DBImpl::NewIterators(
@ -4029,6 +4077,9 @@ Status DBImpl::NewIterators(
assert(cf_sv_pairs.size() == column_families.size());
if (read_options.tailing) {
read_options.total_order_seek |=
immutable_db_options_.prefix_seek_opt_in_only;
for (const auto& cf_sv_pair : cf_sv_pairs) {
auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
cf_sv_pair.super_version,
@ -4269,8 +4320,8 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
}
// Avoid going through every column family by checking a global threshold
// first.
CfdList cf_scheduled;
if (oldest_snapshot > bottommost_files_mark_threshold_) {
CfdList cf_scheduled;
for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (!cfd->ioptions()->allow_ingest_behind) {
cfd->current()->storage_info()->UpdateOldestSnapshot(
@ -4279,7 +4330,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
->storage_info()
->BottommostFilesMarkedForCompaction()
.empty()) {
SchedulePendingCompaction(cfd);
EnqueuePendingCompaction(cfd);
MaybeScheduleFlushOrCompaction();
cf_scheduled.push_back(cfd);
}
@ -4302,6 +4353,24 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
}
bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
}
// Avoid going through every column family by checking a global threshold
// first.
if (oldest_snapshot >= standalone_range_deletion_files_mark_threshold_) {
for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (cfd->IsDropped() || CfdListContains(cf_scheduled, cfd)) {
continue;
}
if (oldest_snapshot >=
cfd->current()
->storage_info()
->standalone_range_tombstone_files_mark_threshold()) {
EnqueuePendingCompaction(cfd);
MaybeScheduleFlushOrCompaction();
cf_scheduled.push_back(cfd);
}
}
}
}
delete casted_s;
}
@ -4677,9 +4746,9 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
// Convert user_key into a corresponding internal key.
InternalKey k1(start.value(), kMaxSequenceNumber, kValueTypeForSeek);
InternalKey k2(limit.value(), kMaxSequenceNumber, kValueTypeForSeek);
MemTable::MemTableStats memStats =
ReadOnlyMemTable::MemTableStats memStats =
sv->mem->ApproximateStats(k1.Encode(), k2.Encode());
MemTable::MemTableStats immStats =
ReadOnlyMemTable::MemTableStats immStats =
sv->imm->ApproximateStats(k1.Encode(), k2.Encode());
*count = memStats.count + immStats.count;
*size = memStats.size + immStats.size;
@ -4753,6 +4822,24 @@ void DBImpl::ReleaseFileNumberFromPendingOutputs(
}
}
std::list<uint64_t>::iterator DBImpl::CaptureOptionsFileNumber() {
// We need to remember the iterator to the inserted element, because after
// the compaction is done, we need to remove that element from
// min_options_file_numbers_.
min_options_file_numbers_.push_back(versions_->options_file_number());
auto min_options_file_numbers_inserted_elem = min_options_file_numbers_.end();
--min_options_file_numbers_inserted_elem;
return min_options_file_numbers_inserted_elem;
}
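// The capture/release pair above leans on a std::list property: iterators
// remain valid across unrelated insertions and erasures. A minimal standalone
// sketch of the same idiom (hypothetical PinnedNumbers helper, not the
// RocksDB implementation) could look like this:
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <list>

class PinnedNumbers {
 public:
  using Handle = std::list<uint64_t>::iterator;

  // Record `n` and return an iterator so exactly this element can be erased
  // later, regardless of other insertions in the meantime.
  Handle Capture(uint64_t n) {
    pinned_.push_back(n);
    return std::prev(pinned_.end());
  }

  void Release(Handle h) { pinned_.erase(h); }

  // Smallest still-pinned value, or UINT64_MAX when nothing is pinned.
  uint64_t Min() const {
    return pinned_.empty()
               ? UINT64_MAX
               : *std::min_element(pinned_.begin(), pinned_.end());
  }

 private:
  std::list<uint64_t> pinned_;
};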
void DBImpl::ReleaseOptionsFileNumber(
std::unique_ptr<std::list<uint64_t>::iterator>& v) {
if (v.get() != nullptr) {
min_options_file_numbers_.erase(*v.get());
v.reset();
}
}
Status DBImpl::GetUpdatesSince(
SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
const TransactionLogIterator::ReadOptions& read_options) {
@ -5116,11 +5203,12 @@ Status DBImpl::GetDbIdentity(std::string& identity) const {
return Status::OK();
}
Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const {
Status DBImpl::GetDbIdentityFromIdentityFile(const IOOptions& opts,
std::string* identity) const {
std::string idfilename = IdentityFileName(dbname_);
const FileOptions soptions;
Status s = ReadFileToString(fs_.get(), idfilename, identity);
Status s = ReadFileToString(fs_.get(), idfilename, opts, identity);
if (!s.ok()) {
return s;
}
@ -5240,6 +5328,14 @@ Status DestroyDB(const std::string& dbname, const Options& options,
Env* env = soptions.env;
std::vector<std::string> filenames;
bool wal_in_db_path = soptions.IsWalDirSameAsDBPath();
auto sfm = static_cast_with_check<SstFileManagerImpl>(
options.sst_file_manager.get());
// Allocate a separate trash bucket to be used by all the files to be
// deleted, so we can later wait for this bucket to be empty before returning.
std::optional<int32_t> bucket;
if (sfm) {
bucket = sfm->NewTrashBucket();
}
// Reset the logger because it holds a handle to the
// log file and prevents cleanup and directory removal
@ -5251,6 +5347,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
/*IODebugContext*=*/nullptr)
.PermitUncheckedError();
std::set<std::string> paths_to_delete;
FileLock* lock;
const std::string lockname = LockFileName(dbname);
Status result = env->LockFile(lockname, &lock);
@ -5267,10 +5364,9 @@ Status DestroyDB(const std::string& dbname, const Options& options,
del = DestroyDB(path_to_delete, options);
} else if (type == kTableFile || type == kWalFile ||
type == kBlobFile) {
del = DeleteDBFile(
&soptions, path_to_delete, dbname,
/*force_bg=*/false,
/*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false);
del = DeleteUnaccountedDBFile(&soptions, path_to_delete, dbname,
/*force_bg=*/false,
/*force_fg=*/false, bucket);
} else {
del = env->DeleteFile(path_to_delete);
}
@ -5279,6 +5375,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
}
}
}
paths_to_delete.insert(dbname);
std::set<std::string> paths;
for (const DbPath& db_path : options.db_paths) {
@ -5300,18 +5397,19 @@ Status DestroyDB(const std::string& dbname, const Options& options,
(type == kTableFile ||
type == kBlobFile)) { // Lock file will be deleted at end
std::string file_path = path + "/" + fname;
Status del = DeleteDBFile(&soptions, file_path, dbname,
/*force_bg=*/false, /*force_fg=*/false);
Status del = DeleteUnaccountedDBFile(&soptions, file_path, dbname,
/*force_bg=*/false,
/*force_fg=*/false, bucket);
if (!del.ok() && result.ok()) {
result = del;
}
}
}
// TODO: Should we return an error if we cannot delete the directory?
env->DeleteDir(path).PermitUncheckedError();
}
}
paths_to_delete.merge(paths);
std::vector<std::string> walDirFiles;
std::string archivedir = ArchivalDirectory(dbname);
bool wal_dir_exists = false;
@ -5335,46 +5433,49 @@ Status DestroyDB(const std::string& dbname, const Options& options,
// Delete archival files.
for (const auto& file : archiveFiles) {
if (ParseFileName(file, &number, &type) && type == kWalFile) {
Status del =
DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
/*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
Status del = DeleteUnaccountedDBFile(
&soptions, archivedir + "/" + file, archivedir,
/*force_bg=*/false, /*force_fg=*/!wal_in_db_path, bucket);
if (!del.ok() && result.ok()) {
result = del;
}
}
}
// Ignore error in case dir contains other files
env->DeleteDir(archivedir).PermitUncheckedError();
paths_to_delete.insert(archivedir);
}
// Delete log files in the WAL dir
if (wal_dir_exists) {
for (const auto& file : walDirFiles) {
if (ParseFileName(file, &number, &type) && type == kWalFile) {
Status del =
DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
soptions.wal_dir, /*force_bg=*/false,
/*force_fg=*/!wal_in_db_path);
Status del = DeleteUnaccountedDBFile(
&soptions, LogFileName(soptions.wal_dir, number),
soptions.wal_dir, /*force_bg=*/false,
/*force_fg=*/!wal_in_db_path, bucket);
if (!del.ok() && result.ok()) {
result = del;
}
}
}
// Ignore error in case dir contains other files
env->DeleteDir(soptions.wal_dir).PermitUncheckedError();
paths_to_delete.insert(soptions.wal_dir);
}
// Ignore error since state is already gone
env->UnlockFile(lock).PermitUncheckedError();
env->DeleteFile(lockname).PermitUncheckedError();
// Make sure trash files are all cleared before returning.
if (sfm && bucket.has_value()) {
sfm->WaitForEmptyTrashBucket(bucket.value());
}
// sst_file_manager holds a ref to the logger. Make sure the logger is
// gone before trying to remove the directory.
soptions.sst_file_manager.reset();
// Ignore error in case dir contains other files
env->DeleteDir(dbname).PermitUncheckedError();
;
for (const auto& path_to_delete : paths_to_delete) {
env->DeleteDir(path_to_delete).PermitUncheckedError();
}
}
return result;
}
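// DestroyDB above hands file deletions to the SstFileManager's trash
// mechanism and then blocks until the dedicated bucket drains. A minimal
// standalone sketch of that wait-for-empty-bucket synchronization (toy
// TrashBucket class, not SstFileManagerImpl) might be:
#include <condition_variable>
#include <mutex>
#include <string>

class TrashBucket {
 public:
  // Caller queues a file; a background worker later calls MarkDeleted().
  void Add(const std::string& /*path*/) {
    std::lock_guard<std::mutex> lk(mu_);
    ++outstanding_;
  }
  void MarkDeleted() {
    std::lock_guard<std::mutex> lk(mu_);
    if (--outstanding_ == 0) {
      cv_.notify_all();
    }
  }
  // Block until every queued file has been processed.
  void WaitForEmpty() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return outstanding_ == 0; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int outstanding_ = 0;
};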
@ -5772,7 +5873,6 @@ Status DBImpl::IngestExternalFile(
Status DBImpl::IngestExternalFiles(
const std::vector<IngestExternalFileArg>& args) {
// TODO: plumb Env::IOActivity, Env::IOPriority
const ReadOptions read_options;
const WriteOptions write_options;
if (args.empty()) {
@ -5798,6 +5898,10 @@ Status DBImpl::IngestExternalFiles(
snprintf(err_msg, 128, "external_files[%zu] is empty", i);
return Status::InvalidArgument(err_msg);
}
if (i && args[i].options.fill_cache != args[i - 1].options.fill_cache) {
return Status::InvalidArgument(
"fill_cache should be the same across ingestion options.");
}
}
for (const auto& arg : args) {
const IngestExternalFileOptions& ingest_opts = arg.options;
@ -5820,11 +5924,10 @@ Status DBImpl::IngestExternalFiles(
"write_global_seqno is deprecated and does not work with "
"allow_db_generated_files.");
}
if (ingest_opts.move_files) {
return Status::NotSupported(
"Options move_files and allow_db_generated_files are not "
"compatible.");
}
}
if (ingest_opts.move_files && ingest_opts.link_files) {
return Status::InvalidArgument(
"`move_files` and `link_files` can not both be true.");
}
}
@ -5858,9 +5961,9 @@ Status DBImpl::IngestExternalFiles(
uint64_t start_file_number = next_file_number;
for (size_t i = 1; i != num_cfs; ++i) {
start_file_number += args[i - 1].external_files.size();
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
SuperVersion* super_version =
ingestion_jobs[i].GetColumnFamilyData()->GetReferencedSuperVersion(
this);
Status es = ingestion_jobs[i].Prepare(
args[i].external_files, args[i].files_checksums,
args[i].files_checksum_func_names, args[i].file_temperature,
@ -5874,9 +5977,9 @@ Status DBImpl::IngestExternalFiles(
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
{
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd();
SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
SuperVersion* super_version =
ingestion_jobs[0].GetColumnFamilyData()->GetReferencedSuperVersion(
this);
Status es = ingestion_jobs[0].Prepare(
args[0].external_files, args[0].files_checksums,
args[0].files_checksum_func_names, args[0].file_temperature,
@ -5927,8 +6030,7 @@ Status DBImpl::IngestExternalFiles(
bool at_least_one_cf_need_flush = false;
std::vector<bool> need_flush(num_cfs, false);
for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
if (cfd->IsDropped()) {
// TODO (yanqin) investigate whether we should abort ingestion or
// proceed with other non-dropped column families.
@ -5960,12 +6062,10 @@ Status DBImpl::IngestExternalFiles(
for (size_t i = 0; i != num_cfs; ++i) {
if (need_flush[i]) {
mutex_.Unlock();
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)
->cfd();
status = FlushMemTable(cfd, flush_opts,
FlushReason::kExternalFileIngestion,
true /* entered_write_thread */);
status =
FlushMemTable(ingestion_jobs[i].GetColumnFamilyData(),
flush_opts, FlushReason::kExternalFileIngestion,
true /* entered_write_thread */);
mutex_.Lock();
if (!status.ok()) {
break;
@ -5973,6 +6073,13 @@ Status DBImpl::IngestExternalFiles(
}
}
}
if (status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) {
if (immutable_db_options_.atomic_flush || need_flush[i]) {
ingestion_jobs[i].SetFlushedBeforeRun();
}
}
}
}
// Run ingestion jobs.
if (status.ok()) {
@ -5986,16 +6093,15 @@ Status DBImpl::IngestExternalFiles(
}
}
if (status.ok()) {
ReadOptions read_options;
read_options.fill_cache = args[0].options.fill_cache;
autovector<ColumnFamilyData*> cfds_to_commit;
autovector<const MutableCFOptions*> mutable_cf_options_list;
autovector<autovector<VersionEdit*>> edit_lists;
uint32_t num_entries = 0;
for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
if (cfd->IsDropped()) {
continue;
}
auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
assert(!cfd->IsDropped());
cfds_to_commit.push_back(cfd);
mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
autovector<VersionEdit*> edit_list;
@ -6045,20 +6151,16 @@ Status DBImpl::IngestExternalFiles(
if (status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
if (!cfd->IsDropped()) {
InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
*cfd->GetLatestMutableCFOptions());
auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
assert(!cfd->IsDropped());
InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
*cfd->GetLatestMutableCFOptions());
#ifndef NDEBUG
if (0 == i && num_cfs > 1) {
TEST_SYNC_POINT(
"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
TEST_SYNC_POINT(
"DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
}
#endif // !NDEBUG
if (0 == i && num_cfs > 1) {
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
TEST_SYNC_POINT("DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
}
#endif // !NDEBUG
}
} else if (versions_->io_status().IsIOError()) {
// Error while writing to MANIFEST.
@ -6100,8 +6202,7 @@ Status DBImpl::IngestExternalFiles(
}
if (status.ok()) {
for (size_t i = 0; i != num_cfs; ++i) {
auto* cfd =
static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
auto* cfd = ingestion_jobs[i].GetColumnFamilyData();
if (!cfd->IsDropped()) {
NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
}
@ -6732,6 +6833,62 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
}
}
void DBImpl::TrackOrUntrackFiles(
const std::vector<std::string>& existing_data_files, bool track) {
auto sfm = static_cast_with_check<SstFileManagerImpl>(
immutable_db_options_.sst_file_manager.get());
assert(sfm);
std::vector<ColumnFamilyMetaData> metadata;
GetAllColumnFamilyMetaData(&metadata);
auto action = [&](const std::string& file_path,
std::optional<uint64_t> size) {
if (track) {
if (size) {
sfm->OnAddFile(file_path, *size).PermitUncheckedError();
} else {
sfm->OnAddFile(file_path).PermitUncheckedError();
}
} else {
sfm->OnUntrackFile(file_path).PermitUncheckedError();
}
};
std::unordered_set<std::string> referenced_files;
for (const auto& md : metadata) {
for (const auto& lmd : md.levels) {
for (const auto& fmd : lmd.files) {
// We're assuming that each sst file name exists in at most one of
// the paths.
std::string file_path =
fmd.directory + kFilePathSeparator + fmd.relative_filename;
action(file_path, fmd.size);
referenced_files.insert(file_path);
}
}
for (const auto& bmd : md.blob_files) {
std::string name = bmd.blob_file_name;
// The BlobMetaData.blob_file_name may start with "/".
if (!name.empty() && name[0] == kFilePathSeparator) {
name = name.substr(1);
}
// We're assuming that each blob file name exists in at most one of
// the paths.
std::string file_path = bmd.blob_file_path + kFilePathSeparator + name;
action(file_path, bmd.blob_file_size);
referenced_files.insert(file_path);
}
}
for (const auto& file_path : existing_data_files) {
if (referenced_files.find(file_path) != referenced_files.end()) {
continue;
}
// There shouldn't be any duplicated files. In case there are, SstFileManager
// will take care of deduplicating them.
action(file_path, /*size=*/std::nullopt);
}
}
void DBImpl::InstallSeqnoToTimeMappingInSV(
std::vector<SuperVersionContext>* sv_contexts) {
mutex_.AssertHeld();

View File

@ -482,7 +482,8 @@ class DBImpl : public DB {
Status GetDbIdentity(std::string& identity) const override;
virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
virtual Status GetDbIdentityFromIdentityFile(const IOOptions& opts,
std::string* identity) const;
Status GetDbSessionId(std::string& session_id) const override;
@ -853,6 +854,8 @@ class DBImpl : public DB {
uint64_t GetObsoleteSstFilesSize();
uint64_t MinOptionsFileNumberToKeep();
// Returns the list of live files in 'live' and the list
// of all files in the filesystem in 'candidate_files'.
// If force == false and the last call was less than
@ -1151,6 +1154,8 @@ class DBImpl : public DB {
// Get the background error status
Status TEST_GetBGError();
bool TEST_IsRecoveryInProgress();
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
uint64_t TEST_MaxNextLevelOverlappingBytes(
@ -1195,9 +1200,7 @@ class DBImpl : public DB {
uint64_t TEST_total_log_size() const { return total_log_size_; }
// Returns column family name to ImmutableCFOptions map.
Status TEST_GetAllImmutableCFOptions(
std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
void TEST_GetAllBlockCaches(std::unordered_set<const Cache*>* cache_set);
// Return the latest MutableCFOptions of a column family
Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
@ -1226,6 +1229,8 @@ class DBImpl : public DB {
return logs_.back().number;
}
void TEST_DeleteObsoleteFiles();
const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
return files_grabbed_for_purge_;
}
@ -1235,9 +1240,14 @@ class DBImpl : public DB {
static Status TEST_ValidateOptions(const DBOptions& db_options) {
return ValidateOptions(db_options);
}
#endif // NDEBUG
// In certain configurations, verify that the table/blob file cache only
// contains entries for live files, to check for effective leaks of open
// files. This can only be called when purging of obsolete files has
// "settled," such as during parts of DB Close().
void TEST_VerifyNoObsoleteFilesCached(bool db_mutex_already_held) const;
// persist stats to column family "_persistent_stats"
void PersistStats();
@ -1580,11 +1590,12 @@ class DBImpl : public DB {
virtual bool OwnTablesAndLogs() const { return true; }
// Setup DB identity file, and write DB ID to manifest if necessary.
// Read/create DB identity file (as appropriate), and write DB ID to
// version_edit if provided.
Status SetupDBId(const WriteOptions& write_options, bool read_only,
RecoveryContext* recovery_ctx);
// Assign db_id_ and write DB ID to manifest if necessary.
void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx);
bool is_new_db, bool is_retry, VersionEdit* version_edit);
// Assign db_id_ and write DB ID to version_edit if provided.
void SetDBId(std::string&& id, bool read_only, VersionEdit* version_edit);
// Collect a deduplicated collection of paths used by this DB, including
// dbname_, DBOptions.db_paths, ColumnFamilyOptions.cf_paths.
@ -1614,9 +1625,15 @@ class DBImpl : public DB {
// vast majority of all files), since it already has the file size
// on record, we don't need to query the file system. Otherwise, we query the
// file system for the size of an unreferenced file.
// REQUIRES: mutex unlocked
void TrackExistingDataFiles(
const std::vector<std::string>& existing_data_files);
// Untrack data files in sst manager. This is only called during DB::Close on
// an unowned SstFileManager, to return it to a consistent state.
// REQUIRES: mutex unlocked
void UntrackDataFiles();
// SetDbSessionId() should be called in the DBImpl() constructor
// to ensure that db_session_id_ gets updated every time the DB is opened
void SetDbSessionId();
@ -1683,6 +1700,8 @@ class DBImpl : public DB {
friend class XFTransactionWriteHandler;
friend class DBBlobIndexTest;
friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
friend class CompactionServiceTest_PreservedOptionsLocalCompaction_Test;
friend class CompactionServiceTest_PreservedOptionsRemoteCompaction_Test;
#endif
struct CompactionState;
@ -1691,7 +1710,7 @@ class DBImpl : public DB {
struct WriteContext {
SuperVersionContext superversion_context;
autovector<MemTable*> memtables_to_free_;
autovector<ReadOnlyMemTable*> memtables_to_free_;
explicit WriteContext(bool create_superversion = false)
: superversion_context(create_superversion) {}
@ -1954,6 +1973,13 @@ class DBImpl : public DB {
void ReleaseFileNumberFromPendingOutputs(
std::unique_ptr<std::list<uint64_t>::iterator>& v);
// Similar to pending_outputs_, preserve the OPTIONS file. Used for remote
// compaction.
std::list<uint64_t>::iterator CaptureOptionsFileNumber();
void ReleaseOptionsFileNumber(
std::unique_ptr<std::list<uint64_t>::iterator>& v);
// Sets bg error if there is an error writing to WAL.
IOStatus SyncClosedWals(const WriteOptions& write_options,
JobContext* job_context, VersionEdit* synced_wals,
bool error_recovery_in_prog);
@ -2026,6 +2052,8 @@ class DBImpl : public DB {
Status TrimMemtableHistory(WriteContext* context);
// Switches the current live memtable to immutable/read-only memtable.
// A new WAL is created if the current WAL is not empty.
Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
// Select and output column families qualified for atomic flush in
@ -2064,17 +2092,18 @@ class DBImpl : public DB {
// memtable pending flush.
// resuming_from_bg_err indicates whether the caller is attempting to resume
// from background error.
Status WaitForFlushMemTable(ColumnFamilyData* cfd,
const uint64_t* flush_memtable_id = nullptr,
bool resuming_from_bg_err = false) {
Status WaitForFlushMemTable(
ColumnFamilyData* cfd, const uint64_t* flush_memtable_id = nullptr,
bool resuming_from_bg_err = false,
std::optional<FlushReason> flush_reason = std::nullopt) {
return WaitForFlushMemTables({cfd}, {flush_memtable_id},
resuming_from_bg_err);
resuming_from_bg_err, flush_reason);
}
// Wait for memtables to be flushed for multiple column families.
Status WaitForFlushMemTables(
const autovector<ColumnFamilyData*>& cfds,
const autovector<const uint64_t*>& flush_memtable_ids,
bool resuming_from_bg_err);
bool resuming_from_bg_err, std::optional<FlushReason> flush_reason);
inline void WaitForPendingWrites() {
mutex_.AssertHeld();
@ -2172,7 +2201,7 @@ class DBImpl : public DB {
// Used by WriteImpl to update bg_error_ when an IO error happens, e.g., when
// writing or syncing the WAL fails, if paranoid check is enabled.
void IOStatusCheck(const IOStatus& status);
void WALIOStatusCheck(const IOStatus& status);
// Used by WriteImpl to update bg_error_ in case of memtable insert error.
void MemTableInsertStatusCheck(const Status& memtable_insert_status);
@ -2185,7 +2214,9 @@ class DBImpl : public DB {
JobContext* job_context, LogBuffer* log_buffer,
CompactionJobInfo* compaction_job_info);
ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
// REQUIRES: mutex unlocked
void TrackOrUntrackFiles(const std::vector<std::string>& existing_data_files,
bool track);
void MaybeScheduleFlushOrCompaction();
@ -2214,10 +2245,27 @@ class DBImpl : public DB {
void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
FlushReason flush_reason, FlushRequest* req);
// Returns true if `req` is successfully enqueued.
bool SchedulePendingFlush(const FlushRequest& req);
// The functions below are for executing flush and compaction in the
// background. A deque is the communication channel between the threads that
// ask for work to be done and the available threads in the thread pool that
// pick it up and execute it. We use the following terminology to describe the
// state of the work and its transitions:
// 1) It becomes pending once it's successfully enqueued into the
//    corresponding deque; work in this state is also called unscheduled.
//    Counter `unscheduled_*_` counts work in this state.
// 2) When `MaybeScheduleFlushOrCompaction` schedules a thread to run `BGWork*`
//    for the work, it becomes scheduled.
//    Counter `bg_*_scheduled_` counts work in this state.
// 3) Once the thread starts to execute `BGWork*`, the work is popped from the
//    deque and is now in the running state.
//    Counter `num_running_*_` counts work in this state.
// 4) Eventually, the work is finished. We don't need to specifically track
//    finished work.
// (A standalone sketch of these state transitions follows the declarations
// below.)
void SchedulePendingCompaction(ColumnFamilyData* cfd);
// Returns true if `req` is successfully enqueued.
bool EnqueuePendingFlush(const FlushRequest& req);
void EnqueuePendingCompaction(ColumnFamilyData* cfd);
void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
FileType type, uint64_t number, int job_id);
static void BGWorkCompaction(void* arg);
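// As a rough illustration of the pending/scheduled/running bookkeeping
// described above (illustrative counters only, not DBImpl's actual members),
// the state transitions could be modeled like this:
#include <deque>
#include <functional>
#include <utility>

struct WorkQueue {
  std::deque<std::function<void()>> queue;  // holds pending work
  int unscheduled = 0;   // 1) enqueued, no thread assigned yet
  int bg_scheduled = 0;  // 2) a background thread has been asked to run it
  int num_running = 0;   // 3) a thread popped it and is executing it

  void Enqueue(std::function<void()> work) {
    queue.push_back(std::move(work));
    ++unscheduled;  // becomes pending (unscheduled)
  }
  void ScheduleOne() {
    if (unscheduled > 0) {
      --unscheduled;
      ++bg_scheduled;  // becomes scheduled
    }
  }
  void RunOne() {
    if (bg_scheduled > 0 && !queue.empty()) {
      std::function<void()> work = std::move(queue.front());
      queue.pop_front();
      --bg_scheduled;
      ++num_running;  // becomes running
      work();
      --num_running;  // 4) finished; not tracked further
    }
  }
};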
@ -2722,6 +2770,11 @@ class DBImpl : public DB {
// State is protected with db mutex.
std::list<uint64_t> pending_outputs_;
// Similar to pending_outputs_, FindObsoleteFiles()/PurgeObsoleteFiles() never
// deletes any OPTIONS file whose number is greater than any of the file
// numbers in min_options_file_numbers_.
std::list<uint64_t> min_options_file_numbers_;
// flush_queue_ and compaction_queue_ hold column families that we need to
// flush and compact, respectively.
// A column family is inserted into flush_queue_ when it satisfies condition
@ -2844,6 +2897,11 @@ class DBImpl : public DB {
// garbages, among all column families.
SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
// The min threshold to trigger compactions for standalone range deletion
// files that are marked for compaction.
SequenceNumber standalone_range_deletion_files_mark_threshold_ =
kMaxSequenceNumber;
LogsWithPrepTracker logs_with_prep_tracker_;
// Callback for compaction to check if a key is visible to a snapshot.
@ -2944,6 +3002,15 @@ DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options);
// Return a VersionEdit for the DB's recovery when the `memtables` of the
// specified column family are obsolete. Specifically, the min log number to
// keep, and the WAL files that can be deleted.
VersionEdit GetDBRecoveryEditForObsoletingMemTables(
VersionSet* vset, const ColumnFamilyData& cfd,
const autovector<VersionEdit*>& edit_list,
const autovector<ReadOnlyMemTable*>& memtables,
LogsWithPrepTracker* prep_tracker);
// Return the earliest log file to keep after the memtable flush is
// finalized.
// `cfd_to_flush` is the column family whose memtable (specified in
@ -2953,13 +3020,13 @@ CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions,
uint64_t PrecomputeMinLogNumberToKeep2PC(
VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
const autovector<VersionEdit*>& edit_list,
const autovector<MemTable*>& memtables_to_flush,
const autovector<ReadOnlyMemTable*>& memtables_to_flush,
LogsWithPrepTracker* prep_tracker);
// For atomic flush.
uint64_t PrecomputeMinLogNumberToKeep2PC(
VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
const autovector<autovector<VersionEdit*>>& edit_lists,
const autovector<const autovector<MemTable*>*>& memtables_to_flush,
const autovector<const autovector<ReadOnlyMemTable*>*>& memtables_to_flush,
LogsWithPrepTracker* prep_tracker);
// In non-2PC mode, WALs with log number < the returned number can be
@ -2976,11 +3043,11 @@ uint64_t PrecomputeMinLogNumberToKeepNon2PC(
// will not depend on any WAL file. nullptr means no memtable is being flushed.
// The function is only applicable to 2pc mode.
uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
VersionSet* vset, const autovector<ReadOnlyMemTable*>& memtables_to_flush);
// For atomic flush.
uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset,
const autovector<const autovector<MemTable*>*>& memtables_to_flush);
const autovector<const autovector<ReadOnlyMemTable*>*>& memtables_to_flush);
// Fix user-supplied options to be reasonable
template <class T, class V>

View File

@ -753,7 +753,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
if (s.ok()) {
autovector<ColumnFamilyData*> tmp_cfds;
autovector<const autovector<MemTable*>*> mems_list;
autovector<const autovector<ReadOnlyMemTable*>*> mems_list;
autovector<const MutableCFOptions*> mutable_cf_options_list;
autovector<FileMetaData*> tmp_file_meta;
autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
@ -1457,11 +1457,6 @@ Status DBImpl::CompactFilesImpl(
input_set.insert(TableFileNameToNumber(file_name));
}
ColumnFamilyMetaData cf_meta;
// TODO(yhchiang): can directly use version here if none of the
// following functions call is pluggable to external developers.
version->GetColumnFamilyMetaData(&cf_meta);
if (output_path_id < 0) {
if (cfd->ioptions()->cf_paths.size() == 1U) {
output_path_id = 0;
@ -1482,7 +1477,7 @@ Status DBImpl::CompactFilesImpl(
std::vector<CompactionInputFiles> input_files;
Status s = cfd->compaction_picker()->SanitizeAndConvertCompactionInputFiles(
&input_set, cf_meta, output_level, version->storage_info(), &input_files);
&input_set, output_level, version, &input_files);
TEST_SYNC_POINT(
"DBImpl::CompactFilesImpl::PostSanitizeAndConvertCompactionInputFiles");
if (!s.ok()) {
@ -1561,6 +1556,12 @@ Status DBImpl::CompactFilesImpl(
compaction_job.Prepare();
std::unique_ptr<std::list<uint64_t>::iterator> min_options_file_number_elem;
if (immutable_db_options().compaction_service != nullptr) {
min_options_file_number_elem.reset(
new std::list<uint64_t>::iterator(CaptureOptionsFileNumber()));
}
mutex_.Unlock();
TEST_SYNC_POINT("CompactFilesImpl:0");
TEST_SYNC_POINT("CompactFilesImpl:1");
@ -1570,6 +1571,10 @@ Status DBImpl::CompactFilesImpl(
TEST_SYNC_POINT("CompactFilesImpl:3");
mutex_.Lock();
if (immutable_db_options().compaction_service != nullptr) {
ReleaseOptionsFileNumber(min_options_file_number_elem);
}
bool compaction_released = false;
Status status =
compaction_job.Install(*c->mutable_cf_options(), &compaction_released);
@ -1852,8 +1857,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
mutable_cf_options.compression_opts,
mutable_cf_options.default_write_temperature,
0 /* max_subcompactions, not applicable */,
{} /* grandparents, not applicable */, false /* is manual */,
"" /* trim_ts */, -1 /* score, not applicable */,
{} /* grandparents, not applicable */,
std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */,
false /* is deletion compaction, not applicable */,
false /* l0_files_might_overlap, not applicable */,
CompactionReason::kRefitLevel));
@ -1880,7 +1886,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
Status status = versions_->LogAndApply(cfd, mutable_cf_options,
read_options, write_options, &edit,
&mutex_, directories_.GetDbDir());
c->MarkFilesBeingCompacted(false);
cfd->compaction_picker()->UnregisterCompaction(c.get());
c.reset();
@ -2377,7 +2383,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
ColumnFamilyData* loop_cfd =
req.cfd_to_max_mem_id_to_persist.begin()->first;
bool already_queued_for_flush = loop_cfd->queued_for_flush();
bool flush_req_enqueued = SchedulePendingFlush(req);
bool flush_req_enqueued = EnqueuePendingFlush(req);
if (already_queued_for_flush || flush_req_enqueued) {
loop_cfd->SetFlushSkipReschedule();
}
@ -2407,7 +2413,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
}
s = WaitForFlushMemTables(
cfds, flush_memtable_ids,
flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */);
flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */,
flush_reason);
InstrumentedMutexLock lock_guard(&mutex_);
for (auto* tmp_cfd : cfds) {
tmp_cfd->UnrefAndTryDelete();
@ -2528,7 +2535,7 @@ Status DBImpl::AtomicFlushMemTables(
}
}
GenerateFlushRequest(cfds, flush_reason, &flush_req);
SchedulePendingFlush(flush_req);
EnqueuePendingFlush(flush_req);
MaybeScheduleFlushOrCompaction();
}
@ -2549,7 +2556,8 @@ Status DBImpl::AtomicFlushMemTables(
}
s = WaitForFlushMemTables(
cfds, flush_memtable_ids,
flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */);
flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */,
flush_reason);
InstrumentedMutexLock lock_guard(&mutex_);
for (auto* cfd : cfds) {
cfd->UnrefAndTryDelete();
@ -2583,7 +2591,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
if (immutable_db_options_.atomic_flush) {
FlushRequest flush_req;
GenerateFlushRequest(cfds, flush_reason, &flush_req);
SchedulePendingFlush(flush_req);
EnqueuePendingFlush(flush_req);
for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) {
flush_memtable_ids.push_back(iter.second);
}
@ -2597,7 +2605,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
flush_reason,
{{cfd,
std::numeric_limits<uint64_t>::max() /* max_mem_id_to_persist */}}};
if (SchedulePendingFlush(flush_req)) {
if (EnqueuePendingFlush(flush_req)) {
cfd->SetFlushSkipReschedule();
};
}
@ -2612,7 +2620,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
flush_memtable_id_ptrs.push_back(&flush_memtable_id);
}
s = WaitForFlushMemTables(cfds, flush_memtable_id_ptrs,
true /* resuming_from_bg_err */);
true /* resuming_from_bg_err */, flush_reason);
mutex_.Lock();
}
@ -2712,7 +2720,7 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
Status DBImpl::WaitForFlushMemTables(
const autovector<ColumnFamilyData*>& cfds,
const autovector<const uint64_t*>& flush_memtable_ids,
bool resuming_from_bg_err) {
bool resuming_from_bg_err, std::optional<FlushReason> flush_reason) {
int num = static_cast<int>(cfds.size());
// Wait until the compaction completes
InstrumentedMutexLock l(&mutex_);
@ -2750,7 +2758,15 @@ Status DBImpl::WaitForFlushMemTables(
(flush_memtable_ids[i] != nullptr &&
cfds[i]->imm()->GetEarliestMemTableID() >
*flush_memtable_ids[i])) {
++num_finished;
// Make file ingestion's flush wait until the SuperVersion is also updated,
// since after the flush, ingestion does range overlap checking and file level
// assignment against the current SuperVersion.
if (!flush_reason.has_value() ||
flush_reason.value() != FlushReason::kExternalFileIngestion ||
cfds[i]->GetSuperVersion()->imm->GetID() ==
cfds[i]->imm()->current()->GetID()) {
++num_finished;
}
}
}
if (1 == num_dropped && 1 == num) {
@ -2950,6 +2966,7 @@ void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
cfd->Ref();
compaction_queue_.push_back(cfd);
cfd->set_queued_for_compaction(true);
++unscheduled_compactions_;
}
ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
@ -3005,7 +3022,7 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue(
return cfd;
}
bool DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
bool DBImpl::EnqueuePendingFlush(const FlushRequest& flush_req) {
mutex_.AssertHeld();
bool enqueued = false;
if (reject_new_background_jobs_) {
@ -3041,16 +3058,15 @@ bool DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) {
return enqueued;
}
void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
void DBImpl::EnqueuePendingCompaction(ColumnFamilyData* cfd) {
mutex_.AssertHeld();
if (reject_new_background_jobs_) {
return;
}
if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
TEST_SYNC_POINT_CALLBACK("SchedulePendingCompaction::cfd",
TEST_SYNC_POINT_CALLBACK("EnqueuePendingCompaction::cfd",
static_cast<void*>(cfd));
AddToCompactionQueue(cfd);
++unscheduled_compactions_;
}
}
@ -3218,7 +3234,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
#ifndef NDEBUG
flush_req.reschedule_count += 1;
#endif /* !NDEBUG */
SchedulePendingFlush(flush_req);
EnqueuePendingFlush(flush_req);
*reason = flush_reason;
*flush_rescheduled_to_retain_udt = true;
return Status::TryAgain();
@ -3541,6 +3557,14 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
is_manual && manual_compaction->disallow_trivial_move;
CompactionJobStats compaction_job_stats;
// Set is_remote_compaction to true in the CompactionBegin event if
// compaction_service is set, except for trivial moves. At this point we do
// not know whether the remote compaction will actually be scheduled
// successfully or fall back to local. The CompactionCompleted event reports
// where the compaction actually happened.
compaction_job_stats.is_remote_compaction =
immutable_db_options().compaction_service != nullptr;
Status status;
if (!error_handler_.IsBGWorkStopped()) {
if (shutting_down_.load(std::memory_order_acquire)) {
@ -3661,8 +3685,20 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
// compaction is not necessary. Need to make sure mutex is held
// until we make a copy in the following code
TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
SnapshotChecker* snapshot_checker = nullptr;
std::vector<SequenceNumber> snapshot_seqs;
// This info is not useful for other scenarios, so skip querying existing
// snapshots in those cases.
if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
cfd->user_comparator()->timestamp_size() == 0) {
SequenceNumber earliest_write_conflict_snapshot;
GetSnapshotContext(job_context, &snapshot_seqs,
&earliest_write_conflict_snapshot,
&snapshot_checker);
assert(is_snapshot_supported_ || snapshots_.empty());
}
c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_,
log_buffer));
snapshot_seqs, snapshot_checker, log_buffer));
TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
if (c != nullptr) {
@ -3678,7 +3714,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
->ComputeCompactionScore(*(c->immutable_options()),
*(c->mutable_cf_options()));
AddToCompactionQueue(cfd);
++unscheduled_compactions_;
c.reset();
// Don't need to sleep here, because BackgroundCallCompaction
@ -3707,7 +3742,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
if (cfd->NeedsCompaction()) {
// Yes, we need more compactions!
AddToCompactionQueue(cfd);
++unscheduled_compactions_;
MaybeScheduleFlushOrCompaction();
}
}
@ -3768,6 +3802,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
compaction_job_stats.num_input_files = c->num_input_files(0);
// Trivial moves do not get compacted remotely
compaction_job_stats.is_remote_compaction = false;
NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
compaction_job_stats, job_context->job_id);
@ -3903,6 +3939,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
&bg_bottom_compaction_scheduled_);
compaction_job.Prepare();
std::unique_ptr<std::list<uint64_t>::iterator> min_options_file_number_elem;
if (immutable_db_options().compaction_service != nullptr) {
min_options_file_number_elem.reset(
new std::list<uint64_t>::iterator(CaptureOptionsFileNumber()));
}
NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
compaction_job_stats, job_context->job_id);
mutex_.Unlock();
@ -3912,6 +3954,11 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
compaction_job.Run().PermitUncheckedError();
TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
mutex_.Lock();
if (immutable_db_options().compaction_service != nullptr) {
ReleaseOptionsFileNumber(min_options_file_number_elem);
}
status =
compaction_job.Install(*c->mutable_cf_options(), &compaction_released);
io_s = compaction_job.io_status();
@ -3939,7 +3986,10 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
// Sanity checking that compaction files are freed.
for (size_t i = 0; i < c->num_input_levels(); i++) {
for (size_t j = 0; j < c->inputs(i)->size(); j++) {
assert(!c->input(i, j)->being_compacted);
// When status is not OK, the compaction's result installation failed and
// no new Version was installed. The files could have been released and
// picked up again by other compaction attempts.
assert(!c->input(i, j)->being_compacted || !status.ok());
}
}
std::unordered_set<Compaction*>* cip = c->column_family_data()
@ -3997,7 +4047,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
*(c->mutable_cf_options()));
if (!cfd->queued_for_compaction()) {
AddToCompactionQueue(cfd);
++unscheduled_compactions_;
}
}
}
@ -4259,17 +4308,23 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
// newer snapshot created and released frequently, the compaction will be
// triggered soon anyway.
bottommost_files_mark_threshold_ = kMaxSequenceNumber;
standalone_range_deletion_files_mark_threshold_ = kMaxSequenceNumber;
for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
if (!my_cfd->ioptions()->allow_ingest_behind) {
bottommost_files_mark_threshold_ = std::min(
bottommost_files_mark_threshold_,
my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
}
standalone_range_deletion_files_mark_threshold_ =
std::min(standalone_range_deletion_files_mark_threshold_,
cfd->current()
->storage_info()
->standalone_range_tombstone_files_mark_threshold());
}
// Whenever we install new SuperVersion, we might need to issue new flushes or
// compactions.
SchedulePendingCompaction(cfd);
EnqueuePendingCompaction(cfd);
MaybeScheduleFlushOrCompaction();
// Update max_total_in_memory_state_

View File

@ -9,6 +9,7 @@
#ifndef NDEBUG
#include "db/blob/blob_file_cache.h"
#include "db/column_family.h"
#include "db/db_impl/db_impl.h"
#include "db/error_handler.h"
@ -199,6 +200,11 @@ Status DBImpl::TEST_GetBGError() {
return error_handler_.GetBGError();
}
bool DBImpl::TEST_IsRecoveryInProgress() {
InstrumentedMutexLock l(&mutex_);
return error_handler_.IsRecoveryInProgress();
}
void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
@ -227,23 +233,16 @@ uint64_t DBImpl::TEST_LogfileNumber() {
return logfile_number_;
}
Status DBImpl::TEST_GetAllImmutableCFOptions(
std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
std::vector<std::string> cf_names;
std::vector<const ImmutableCFOptions*> iopts;
{
InstrumentedMutexLock l(&mutex_);
for (auto cfd : *versions_->GetColumnFamilySet()) {
cf_names.push_back(cfd->GetName());
iopts.push_back(cfd->ioptions());
void DBImpl::TEST_GetAllBlockCaches(
std::unordered_set<const Cache*>* cache_set) {
InstrumentedMutexLock l(&mutex_);
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (const auto bbto =
cfd->GetCurrentMutableCFOptions()
->table_factory->GetOptions<BlockBasedTableOptions>()) {
cache_set->insert(bbto->block_cache.get());
}
}
iopts_map->clear();
for (size_t i = 0; i < cf_names.size(); ++i) {
iopts_map->insert({cf_names[i], iopts[i]});
}
return Status::OK();
}
uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
@ -259,7 +258,7 @@ size_t DBImpl::TEST_LogsWithPrepSize() {
}
uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
autovector<MemTable*> empty_list;
autovector<ReadOnlyMemTable*> empty_list;
return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
}
@ -314,9 +313,75 @@ const autovector<uint64_t>& DBImpl::TEST_GetFilesToQuarantine() const {
return error_handler_.GetFilesToQuarantine();
}
void DBImpl::TEST_DeleteObsoleteFiles() {
InstrumentedMutexLock l(&mutex_);
DeleteObsoleteFiles();
}
size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
InstrumentedMutexLock l(&const_cast<DBImpl*>(this)->stats_history_mutex_);
return EstimateInMemoryStatsHistorySize();
}
void DBImpl::TEST_VerifyNoObsoleteFilesCached(
bool db_mutex_already_held) const {
// This check is somewhat expensive and obscure to make a part of every
// unit test in every build variety. Thus, we only enable it for ASAN builds.
if (!kMustFreeHeapAllocations) {
return;
}
std::optional<InstrumentedMutexLock> l;
if (db_mutex_already_held) {
mutex_.AssertHeld();
} else {
l.emplace(&mutex_);
}
if (!opened_successfully_) {
// We don't need to pro-actively clean up open files during DB::Open()
// if we know we are about to fail and clean up in Close().
return;
}
if (disable_delete_obsolete_files_ > 0) {
// For better or worse, DB::Close() is allowed with deletions disabled.
// Since we generally associate clean-up of open files with deleting them,
// we allow "obsolete" open files when deletions are disabled.
return;
}
// Live and "quarantined" files are allowed to be open in table cache
std::set<uint64_t> live_and_quar_files;
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
// Iterate over live versions
Version* current = cfd->current();
Version* ver = current;
do {
// Sneakily add both SST and blob files to the same list
std::vector<uint64_t> live_files_vec;
ver->AddLiveFiles(&live_files_vec, &live_files_vec);
live_and_quar_files.insert(live_files_vec.begin(), live_files_vec.end());
ver = ver->Next();
} while (ver != current);
}
{
const auto& quar_files = error_handler_.GetFilesToQuarantine();
live_and_quar_files.insert(quar_files.begin(), quar_files.end());
}
auto fn = [&live_and_quar_files](const Slice& key, Cache::ObjectPtr, size_t,
const Cache::CacheItemHelper*) {
// See TableCache and BlobFileCache
assert(key.size() == sizeof(uint64_t));
uint64_t file_number;
GetUnaligned(reinterpret_cast<const uint64_t*>(key.data()), &file_number);
// Assert file is in live/quarantined set
assert(live_and_quar_files.find(file_number) != live_and_quar_files.end());
};
table_cache_->ApplyToAllEntries(fn, {});
}
} // namespace ROCKSDB_NAMESPACE
#endif // NDEBUG

View File

@ -47,7 +47,7 @@ Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
// compaction score
vstorage->ComputeCompactionScore(*cfd->ioptions(),
*cfd->GetLatestMutableCFOptions());
SchedulePendingCompaction(cfd);
EnqueuePendingCompaction(cfd);
MaybeScheduleFlushOrCompaction();
}
return Status::OK();

View File

@ -43,6 +43,14 @@ uint64_t DBImpl::GetObsoleteSstFilesSize() {
return versions_->GetObsoleteSstFilesSize();
}
uint64_t DBImpl::MinOptionsFileNumberToKeep() {
mutex_.AssertHeld();
if (!min_options_file_numbers_.empty()) {
return *min_options_file_numbers_.begin();
}
return std::numeric_limits<uint64_t>::max();
}
Status DBImpl::DisableFileDeletions() {
Status s;
int my_disable_delete_obsolete_files;
@ -147,6 +155,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
// here but later find newer generated unfinalized files while scanning.
job_context->min_pending_output = MinObsoleteSstNumberToKeep();
job_context->files_to_quarantine = error_handler_.GetFilesToQuarantine();
job_context->min_options_file_number = MinOptionsFileNumberToKeep();
// Get obsolete files. This function will also update the list of
// pending files in VersionSet().
@ -440,14 +449,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
// File is being deleted (actually obsolete)
auto number = file.metadata->fd.GetNumber();
candidate_files.emplace_back(MakeTableFileName(number), file.path);
if (handle == nullptr) {
// For files not "pinned" in table cache
handle = TableCache::Lookup(table_cache_.get(), number);
}
if (handle) {
TableCache::ReleaseObsolete(table_cache_.get(), handle,
file.uncache_aggressiveness);
}
TableCache::ReleaseObsolete(table_cache_.get(), number, handle,
file.uncache_aggressiveness);
}
file.DeleteMetadata();
}
@ -498,7 +501,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
dbname_);
// File numbers of most recent two OPTIONS file in candidate_files (found in
// previos FindObsoleteFiles(full_scan=true))
// previous FindObsoleteFiles(full_scan=true))
// At this point, there must not be any duplicate file numbers in
// candidate_files.
uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
@ -519,6 +522,11 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
}
}
// For remote compactions, we need to keep any OPTIONS file that may get
// referenced by the remote worker
optsfile_num2 = std::min(optsfile_num2, state.min_options_file_number);
// Close WALs before trying to delete them.
for (const auto w : state.logs_to_free) {
// TODO: maybe check the return value of Close.
@ -558,9 +566,17 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
case kTableFile:
// If the second condition is not there, this makes
// DontDeletePendingOutputs fail
// FIXME: but should NOT keep if it came from sst_delete_files?
keep = (sst_live_set.find(number) != sst_live_set.end()) ||
number >= state.min_pending_output;
if (!keep) {
// NOTE: sometimes redundant (if it came from sst_delete_files)
// We don't know which column family is applicable here so we don't
// know what uncache_aggressiveness would be used with
// ReleaseObsolete(). Anyway, obsolete files ideally go into
// sst_delete_files for better/quicker handling, and this is just a
// backstop.
TableCache::Evict(table_cache_.get(), number);
files_to_del.insert(number);
}
break;
@ -722,13 +738,46 @@ void DBImpl::DeleteObsoleteFiles() {
mutex_.Lock();
}
VersionEdit GetDBRecoveryEditForObsoletingMemTables(
VersionSet* vset, const ColumnFamilyData& cfd,
const autovector<VersionEdit*>& edit_list,
const autovector<ReadOnlyMemTable*>& memtables,
LogsWithPrepTracker* prep_tracker) {
VersionEdit wal_deletion_edit;
uint64_t min_wal_number_to_keep = 0;
assert(edit_list.size() > 0);
if (vset->db_options()->allow_2pc) {
// Note that if mempurge is successful, the edit_list will not be
// applicable (it contains the new min_log number to keep and the level 0
// file path of the SST file created during a normal flush; both pieces
// of information are irrelevant after a successful mempurge operation).
min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
vset, cfd, edit_list, memtables, prep_tracker);
// We piggyback the information about the earliest log file to keep on the
// manifest entry for the last file flushed.
} else {
min_wal_number_to_keep =
PrecomputeMinLogNumberToKeepNon2PC(vset, cfd, edit_list);
}
wal_deletion_edit.SetMinLogNumberToKeep(min_wal_number_to_keep);
if (vset->db_options()->track_and_verify_wals_in_manifest) {
if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
wal_deletion_edit.DeleteWalsBefore(min_wal_number_to_keep);
}
}
return wal_deletion_edit;
}
uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
VersionSet* vset, const autovector<ReadOnlyMemTable*>& memtables_to_flush) {
uint64_t min_log = 0;
// we must look through the memtables for two phase transactions
// that have been committed but not yet flushed
std::unordered_set<MemTable*> memtables_to_flush_set(
std::unordered_set<ReadOnlyMemTable*> memtables_to_flush_set(
memtables_to_flush.begin(), memtables_to_flush.end());
for (auto loop_cfd : *vset->GetColumnFamilySet()) {
if (loop_cfd->IsDropped()) {
@ -753,12 +802,12 @@ uint64_t FindMinPrepLogReferencedByMemTable(
}
uint64_t FindMinPrepLogReferencedByMemTable(
VersionSet* vset,
const autovector<const autovector<MemTable*>*>& memtables_to_flush) {
VersionSet* vset, const autovector<const autovector<ReadOnlyMemTable*>*>&
memtables_to_flush) {
uint64_t min_log = 0;
std::unordered_set<MemTable*> memtables_to_flush_set;
for (const autovector<MemTable*>* memtables : memtables_to_flush) {
std::unordered_set<ReadOnlyMemTable*> memtables_to_flush_set;
for (const autovector<ReadOnlyMemTable*>* memtables : memtables_to_flush) {
memtables_to_flush_set.insert(memtables->begin(), memtables->end());
}
for (auto loop_cfd : *vset->GetColumnFamilySet()) {
@ -850,7 +899,7 @@ uint64_t PrecomputeMinLogNumberToKeepNon2PC(
uint64_t PrecomputeMinLogNumberToKeep2PC(
VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
const autovector<VersionEdit*>& edit_list,
const autovector<MemTable*>& memtables_to_flush,
const autovector<ReadOnlyMemTable*>& memtables_to_flush,
LogsWithPrepTracker* prep_tracker) {
assert(vset != nullptr);
assert(prep_tracker != nullptr);
@ -891,7 +940,7 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
uint64_t PrecomputeMinLogNumberToKeep2PC(
VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
const autovector<autovector<VersionEdit*>>& edit_lists,
const autovector<const autovector<MemTable*>*>& memtables_to_flush,
const autovector<const autovector<ReadOnlyMemTable*>*>& memtables_to_flush,
LogsWithPrepTracker* prep_tracker) {
assert(vset != nullptr);
assert(prep_tracker != nullptr);
@ -921,57 +970,65 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
}
void DBImpl::SetDBId(std::string&& id, bool read_only,
RecoveryContext* recovery_ctx) {
VersionEdit* version_edit) {
assert(db_id_.empty());
assert(!id.empty());
db_id_ = std::move(id);
if (!read_only && immutable_db_options_.write_dbid_to_manifest) {
assert(recovery_ctx != nullptr);
if (!read_only && version_edit) {
assert(version_edit != nullptr);
assert(versions_->GetColumnFamilySet() != nullptr);
VersionEdit edit;
edit.SetDBId(db_id_);
version_edit->SetDBId(db_id_);
versions_->db_id_ = db_id_;
recovery_ctx->UpdateVersionEdits(
versions_->GetColumnFamilySet()->GetDefault(), edit);
}
}
Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only,
RecoveryContext* recovery_ctx) {
bool is_new_db, bool is_retry,
VersionEdit* version_edit) {
Status s;
// Check for the IDENTITY file and create it if not there or
// broken or not matching manifest
std::string db_id_in_file;
s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
if (s.ok()) {
s = GetDbIdentityFromIdentityFile(&db_id_in_file);
if (s.ok() && !db_id_in_file.empty()) {
if (db_id_.empty()) {
// Loaded from file and wasn't already known from manifest
SetDBId(std::move(db_id_in_file), read_only, recovery_ctx);
return s;
} else if (db_id_ == db_id_in_file) {
// Loaded from file and matches manifest
return s;
if (!is_new_db) {
// Check for the IDENTITY file and create it if not there or
// broken or not matching manifest
std::string db_id_in_file;
s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
if (s.ok()) {
IOOptions opts;
if (is_retry) {
opts.verify_and_reconstruct_read = true;
}
s = GetDbIdentityFromIdentityFile(opts, &db_id_in_file);
if (s.ok() && !db_id_in_file.empty()) {
if (db_id_.empty()) {
// Loaded from file and wasn't already known from manifest
SetDBId(std::move(db_id_in_file), read_only, version_edit);
return s;
} else if (db_id_ == db_id_in_file) {
// Loaded from file and matches manifest
return s;
}
}
}
}
if (s.IsNotFound()) {
s = Status::OK();
}
if (!s.ok()) {
assert(s.IsIOError());
return s;
if (s.IsNotFound()) {
s = Status::OK();
}
if (!s.ok()) {
assert(s.IsIOError());
return s;
}
}
// Otherwise IDENTITY file is missing or no good.
// Generate new id if needed
if (db_id_.empty()) {
SetDBId(env_->GenerateUniqueId(), read_only, recovery_ctx);
SetDBId(env_->GenerateUniqueId(), read_only, version_edit);
}
// Persist it to IDENTITY file if allowed
if (!read_only) {
s = SetIdentityFile(write_options, env_, dbname_, db_id_);
if (!read_only && immutable_db_options_.write_identity_file) {
s = SetIdentityFile(write_options, env_, dbname_,
immutable_db_options_.metadata_write_temperature,
db_id_);
}
// NOTE: an obsolete IDENTITY file with write_identity_file=false is handled
// elsewhere, so that it's only deleted after successful recovery
return s;
}
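Reviewer note, not part of this change: a minimal usage sketch of how the two DB ID options interact from the caller's side, assuming the public DBOptions fields of the same names that appear in this diff; the path is a placeholder.

#include "rocksdb/db.h"
using namespace ROCKSDB_NAMESPACE;

Status DbIdInManifestOnlyExample() {
  Options options;
  options.create_if_missing = true;
  options.write_dbid_to_manifest = true;  // record the DB ID via a VersionEdit
  options.write_identity_file = false;    // obsolete IDENTITY file is deleted after recovery
  // Note: the validation added below rejects setting both options to false.
  DB* db = nullptr;
  Status s = DB::Open(options, "/tmp/dbid_example", &db);
  if (s.ok()) {
    std::string id;
    s = db->GetDbIdentity(id);  // same ID regardless of where it is persisted
  }
  delete db;
  return s;
}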

View File

@ -289,27 +289,25 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) {
"start_time and end_time cannot be the same");
}
}
if (!db_options.write_dbid_to_manifest && !db_options.write_identity_file) {
return Status::InvalidArgument(
"write_dbid_to_manifest and write_identity_file cannot both be false");
}
return Status::OK();
}
Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
VersionEdit new_db;
VersionEdit new_db_edit;
const WriteOptions write_options(Env::IOActivity::kDBOpen);
Status s = SetIdentityFile(write_options, env_, dbname_);
Status s = SetupDBId(write_options, /*read_only=*/false, /*is_new_db=*/true,
/*is_retry=*/false, &new_db_edit);
if (!s.ok()) {
return s;
}
if (immutable_db_options_.write_dbid_to_manifest) {
std::string temp_db_id;
s = GetDbIdentityFromIdentityFile(&temp_db_id);
if (!s.ok()) {
return s;
}
new_db.SetDBId(temp_db_id);
}
new_db.SetLogNumber(0);
new_db.SetNextFile(2);
new_db.SetLastSequence(0);
new_db_edit.SetLogNumber(0);
new_db_edit.SetNextFile(2);
new_db_edit.SetLastSequence(0);
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
const std::string manifest = DescriptorFileName(dbname_, 1);
@ -319,6 +317,12 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
}
std::unique_ptr<FSWritableFile> file;
FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
// DB option takes precedence when not kUnknown
if (immutable_db_options_.metadata_write_temperature !=
Temperature::kUnknown) {
file_options.temperature =
immutable_db_options_.metadata_write_temperature;
}
s = NewWritableFile(fs_.get(), manifest, &file, file_options);
if (!s.ok()) {
return s;
@ -335,7 +339,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
tmp_set.Contains(FileType::kDescriptorFile)));
log::Writer log(std::move(file_writer), 0, false);
std::string record;
new_db.EncodeTo(&record);
new_db_edit.EncodeTo(&record);
s = log.AddRecord(write_options, record);
if (s.ok()) {
s = SyncManifest(&immutable_db_options_, write_options, log.file());
@ -344,6 +348,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
if (s.ok()) {
// Make "CURRENT" file that points to the new manifest file.
s = SetCurrentFile(write_options, fs_.get(), dbname_, 1,
immutable_db_options_.metadata_write_temperature,
directories_.GetDbDir());
if (new_filenames) {
new_filenames->emplace_back(
@ -520,7 +525,7 @@ Status DBImpl::Recover(
}
assert(s.ok());
}
assert(db_id_.empty());
assert(is_new_db || db_id_.empty());
Status s;
bool missing_table_file = false;
if (!immutable_db_options_.best_efforts_recovery) {
@ -530,6 +535,12 @@ Status DBImpl::Recover(
/*no_error_if_files_missing=*/false, is_retry,
&desc_status);
desc_status.PermitUncheckedError();
if (is_retry) {
RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_COUNT);
if (desc_status.ok()) {
RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
}
}
if (can_retry) {
// If we're opening for the first time and the failure is likely due to
// a corrupt MANIFEST file (could result in either the log::Reader
@ -564,6 +575,7 @@ Status DBImpl::Recover(
}
if (s.ok() && !read_only) {
for (auto cfd : *versions_->GetColumnFamilySet()) {
auto& moptions = *cfd->GetLatestMutableCFOptions();
// Try to trivially move files down the LSM tree to start from bottommost
// level when level_compaction_dynamic_level_bytes is enabled. This should
// only be useful when user is migrating to turning on this option.
@ -581,14 +593,14 @@ Status DBImpl::Recover(
if (cfd->ioptions()->compaction_style ==
CompactionStyle::kCompactionStyleLevel &&
cfd->ioptions()->level_compaction_dynamic_level_bytes &&
!cfd->GetLatestMutableCFOptions()->disable_auto_compactions) {
!moptions.disable_auto_compactions) {
int to_level = cfd->ioptions()->num_levels - 1;
// last level is reserved
// allow_ingest_behind does not support Level Compaction,
// and per_key_placement can have infinite compaction loop for Level
// Compaction. Adjust to_level here just to be safe.
if (cfd->ioptions()->allow_ingest_behind ||
cfd->ioptions()->preclude_last_level_data_seconds > 0) {
moptions.preclude_last_level_data_seconds > 0) {
to_level -= 1;
}
// Whether this column family has a level trivially moved
@ -660,7 +672,17 @@ Status DBImpl::Recover(
}
}
}
s = SetupDBId(write_options, read_only, recovery_ctx);
if (is_new_db) {
// Already set up DB ID in NewDB
} else if (immutable_db_options_.write_dbid_to_manifest && recovery_ctx) {
VersionEdit edit;
s = SetupDBId(write_options, read_only, is_new_db, is_retry, &edit);
recovery_ctx->UpdateVersionEdits(
versions_->GetColumnFamilySet()->GetDefault(), edit);
} else {
s = SetupDBId(write_options, read_only, is_new_db, is_retry, nullptr);
}
assert(!s.ok() || !db_id_.empty());
ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str());
if (s.ok() && !read_only) {
s = MaybeUpdateNextFileNumber(recovery_ctx);
@ -1253,7 +1275,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
reader.GetRecordedTimestampSize();
status = HandleWriteBatchTimestampSizeDifference(
&batch, running_ts_sz, record_ts_sz,
TimestampSizeConsistencyMode::kReconcileInconsistency, &new_batch);
TimestampSizeConsistencyMode::kReconcileInconsistency, seq_per_batch_,
batch_per_txn_, &new_batch);
if (!status.ok()) {
return status;
}
@ -1646,9 +1669,19 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
Arena arena;
Status s;
TableProperties table_properties;
const auto* ucmp = cfd->internal_comparator().user_comparator();
assert(ucmp);
const size_t ts_sz = ucmp->timestamp_size();
const bool logical_strip_timestamp =
ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps;
{
ScopedArenaPtr<InternalIterator> iter(
mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
logical_strip_timestamp
? mem->NewTimestampStrippingIterator(
ro, /*seqno_to_time_mapping=*/nullptr, &arena,
/*prefix_extractor=*/nullptr, ts_sz)
: mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
/*prefix_extractor=*/nullptr));
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] [WriteLevel0TableForRecovery]"
" Level-0 table #%" PRIu64 ": started",
@ -1667,7 +1700,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
meta.oldest_ancester_time = current_time;
meta.epoch_number = cfd->NewEpochNumber();
{
auto write_hint = cfd->CalculateSSTWriteHint(0);
auto write_hint =
cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0);
mutex_.Unlock();
SequenceNumber earliest_write_conflict_snapshot;
@ -1682,11 +1716,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters;
auto range_del_iter =
// This is called during recovery, where a live memtable is flushed
// directly. In this case, no fragmented tombstone list is cached in
// this memtable yet.
mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
false /* immutable_memtable */);
logical_strip_timestamp
? mem->NewTimestampStrippingRangeTombstoneIterator(
ro, kMaxSequenceNumber, ts_sz)
// This is called during recovery, where a live memtable is
// flushed directly. In this case, no fragmented tombstone list is
// cached in this memtable yet.
: mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
false /* immutable_memtable */);
if (range_del_iter != nullptr) {
range_del_iters.emplace_back(range_del_iter);
}
@ -1700,10 +1737,11 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
cfd->internal_comparator(), cfd->internal_tbl_prop_coll_factories(),
GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(),
0 /* level */, false /* is_bottommost */,
TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
0 /* file_creation_time */, db_id_, db_session_id_,
0 /* target_file_size */, meta.fd.GetNumber(), kMaxSequenceNumber);
0 /* level */, current_time /* newest_key_time */,
false /* is_bottommost */, TableFileCreationReason::kRecovery,
0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_,
db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber(),
kMaxSequenceNumber);
Version* version = cfd->current();
version->Ref();
uint64_t num_input_entries = 0;
@ -1733,7 +1771,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
s = io_s;
}
uint64_t total_num_entries = mem->num_entries();
uint64_t total_num_entries = mem->NumEntries();
if (s.ok() && total_num_entries != num_input_entries) {
std::string msg = "Expected " + std::to_string(total_num_entries) +
" entries in memtable, but read " +
@ -1772,9 +1810,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
// For UDT in memtable only feature, move up the cutoff timestamp whenever
// a flush happens.
const Comparator* ucmp = cfd->user_comparator();
size_t ts_sz = ucmp->timestamp_size();
if (ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps) {
if (logical_strip_timestamp) {
Slice mem_newest_udt = mem->GetNewestUDT();
std::string full_history_ts_low = cfd->GetFullHistoryTsLow();
if (full_history_ts_low.empty() ||
@ -1930,6 +1966,10 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
BuildDBOptions(immutable_db_options_, mutable_db_options_);
FileOptions opt_file_options =
fs_->OptimizeForLogWrite(file_options_, db_options);
// DB option takes precedence when not kUnknown
if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) {
opt_file_options.temperature = immutable_db_options_.wal_write_temperature;
}
std::string wal_dir = immutable_db_options_.GetWalDir();
std::string log_fname = LogFileName(wal_dir, log_file_num);
@ -1969,46 +2009,7 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
void DBImpl::TrackExistingDataFiles(
const std::vector<std::string>& existing_data_files) {
auto sfm = static_cast<SstFileManagerImpl*>(
immutable_db_options_.sst_file_manager.get());
assert(sfm);
std::vector<ColumnFamilyMetaData> metadata;
GetAllColumnFamilyMetaData(&metadata);
std::unordered_set<std::string> referenced_files;
for (const auto& md : metadata) {
for (const auto& lmd : md.levels) {
for (const auto& fmd : lmd.files) {
// We're assuming that each sst file name exists in at most one of
// the paths.
std::string file_path =
fmd.directory + kFilePathSeparator + fmd.relative_filename;
sfm->OnAddFile(file_path, fmd.size).PermitUncheckedError();
referenced_files.insert(file_path);
}
}
for (const auto& bmd : md.blob_files) {
std::string name = bmd.blob_file_name;
// The BlobMetaData.blob_file_name may start with "/".
if (!name.empty() && name[0] == kFilePathSeparator) {
name = name.substr(1);
}
// We're assuming that each blob file name exists in at most one of
// the paths.
std::string file_path = bmd.blob_file_path + kFilePathSeparator + name;
sfm->OnAddFile(file_path, bmd.blob_file_size).PermitUncheckedError();
referenced_files.insert(file_path);
}
}
for (const auto& file_path : existing_data_files) {
if (referenced_files.find(file_path) != referenced_files.end()) {
continue;
}
// There shouldn't be any duplicated files. In case there is, SstFileManager
// will take care of deduping it.
sfm->OnAddFile(file_path).PermitUncheckedError();
}
TrackOrUntrackFiles(existing_data_files, /*track=*/true);
}
Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
@ -2152,6 +2153,13 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
s = impl->LogAndApplyForRecovery(recovery_ctx);
}
if (s.ok() && !impl->immutable_db_options_.write_identity_file) {
// On successful recovery, delete an obsolete IDENTITY file to avoid DB ID
// inconsistency
impl->env_->DeleteFile(IdentityFileName(impl->dbname_))
.PermitUncheckedError();
}
if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
impl->mutex_.AssertHeld();
s = impl->InitPersistStatsColumnFamily();

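For context, and not part of this commit: the metadata_write_temperature and wal_write_temperature handling above only takes effect when the option is not Temperature::kUnknown. A hedged sketch, assuming the corresponding public DBOptions fields and a FileSystem that honors temperature hints:

#include "rocksdb/options.h"
using namespace ROCKSDB_NAMESPACE;

Options TieredMetadataOptionsExample() {
  Options options;
  options.metadata_write_temperature = Temperature::kWarm;  // MANIFEST/CURRENT/IDENTITY writes
  options.wal_write_temperature = Temperature::kWarm;       // newly created WAL files
  // Temperature::kUnknown (the default) leaves the FileSystem's own placement untouched.
  return options;
}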
View File

@ -265,7 +265,8 @@ Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
std::string manifest_path;
uint64_t manifest_file_number;
s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path,
s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), /*is_retry=*/false,
&manifest_path,
&manifest_file_number);
} else {
// Historic behavior that doesn't necessarily make sense

View File

@ -12,7 +12,8 @@
#include "logging/auto_roll_logger.h"
#include "logging/logging.h"
#include "monitoring/perf_context_imp.h"
#include "rocksdb/configurable.h"
#include "rocksdb/convenience.h"
#include "rocksdb/utilities/options_util.h"
#include "util/cast_util.h"
#include "util/write_batch_util.h"
@ -232,7 +233,8 @@ Status DBImplSecondary::RecoverLogFiles(
reader->GetRecordedTimestampSize();
status = HandleWriteBatchTimestampSizeDifference(
&batch, running_ts_sz, record_ts_sz,
TimestampSizeConsistencyMode::kVerifyConsistency);
TimestampSizeConsistencyMode::kVerifyConsistency, seq_per_batch_,
batch_per_txn_);
if (!status.ok()) {
break;
}
@ -246,9 +248,7 @@ Status DBImplSecondary::RecoverLogFiles(
if (cfd == nullptr) {
continue;
}
if (cfds_changed->count(cfd) == 0) {
cfds_changed->insert(cfd);
}
cfds_changed->insert(cfd);
const std::vector<FileMetaData*>& l0_files =
cfd->current()->storage_info()->LevelFiles(0);
SequenceNumber seq =
@ -938,69 +938,101 @@ Status DB::OpenAndCompact(
const std::string& output_directory, const std::string& input,
std::string* output,
const CompactionServiceOptionsOverride& override_options) {
// Check for cancellation
if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
}
// 1. Deserialize Compaction Input
CompactionServiceInput compaction_input;
Status s = CompactionServiceInput::Read(input, &compaction_input);
if (!s.ok()) {
return s;
}
compaction_input.db_options.max_open_files = -1;
compaction_input.db_options.compaction_service = nullptr;
if (compaction_input.db_options.statistics) {
compaction_input.db_options.statistics.reset();
}
compaction_input.db_options.env = override_options.env;
compaction_input.db_options.file_checksum_gen_factory =
override_options.file_checksum_gen_factory;
compaction_input.db_options.statistics = override_options.statistics;
compaction_input.column_family.options.comparator =
override_options.comparator;
compaction_input.column_family.options.merge_operator =
override_options.merge_operator;
compaction_input.column_family.options.compaction_filter =
override_options.compaction_filter;
compaction_input.column_family.options.compaction_filter_factory =
override_options.compaction_filter_factory;
compaction_input.column_family.options.prefix_extractor =
override_options.prefix_extractor;
compaction_input.column_family.options.table_factory =
override_options.table_factory;
compaction_input.column_family.options.sst_partitioner_factory =
override_options.sst_partitioner_factory;
compaction_input.column_family.options.table_properties_collector_factories =
override_options.table_properties_collector_factories;
compaction_input.db_options.listeners = override_options.listeners;
// 2. Load the options
DBOptions db_options;
ConfigOptions config_options;
config_options.env = override_options.env;
std::vector<ColumnFamilyDescriptor> all_column_families;
std::vector<ColumnFamilyDescriptor> column_families;
column_families.push_back(compaction_input.column_family);
// TODO: we have to open default CF, because of an implementation limitation,
// currently we just use the same CF option from input, which is not correct
// and open may fail.
if (compaction_input.column_family.name != kDefaultColumnFamilyName) {
column_families.emplace_back(kDefaultColumnFamilyName,
compaction_input.column_family.options);
}
std::string options_file_name =
OptionsFileName(name, compaction_input.options_file_number);
DB* db;
std::vector<ColumnFamilyHandle*> handles;
s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory,
column_families, &handles, &db);
s = LoadOptionsFromFile(config_options, options_file_name, &db_options,
&all_column_families);
if (!s.ok()) {
return s;
}
// 3. Override pointer configurations in DBOptions with
// CompactionServiceOptionsOverride
db_options.env = override_options.env;
db_options.file_checksum_gen_factory =
override_options.file_checksum_gen_factory;
db_options.statistics = override_options.statistics;
db_options.listeners = override_options.listeners;
db_options.compaction_service = nullptr;
// We will close the DB after the compaction anyway.
// Open as many files as needed for the compaction.
db_options.max_open_files = -1;
// 4. Filter CFs that are needed for OpenAndCompact()
// We do not need to open all column families for the remote compaction.
// Only open default CF + target CF. If target CF == default CF, we will open
// just the default CF (Due to current limitation, DB cannot open without the
// default CF)
std::vector<ColumnFamilyDescriptor> column_families;
for (auto& cf : all_column_families) {
if (cf.name == compaction_input.cf_name) {
cf.options.comparator = override_options.comparator;
cf.options.merge_operator = override_options.merge_operator;
cf.options.compaction_filter = override_options.compaction_filter;
cf.options.compaction_filter_factory =
override_options.compaction_filter_factory;
cf.options.prefix_extractor = override_options.prefix_extractor;
cf.options.table_factory = override_options.table_factory;
cf.options.sst_partitioner_factory =
override_options.sst_partitioner_factory;
cf.options.table_properties_collector_factories =
override_options.table_properties_collector_factories;
column_families.emplace_back(cf);
} else if (cf.name == kDefaultColumnFamilyName) {
column_families.emplace_back(cf);
}
}
// 5. Open db As Secondary
DB* db;
std::vector<ColumnFamilyHandle*> handles;
s = DB::OpenAsSecondary(db_options, name, output_directory, column_families,
&handles, &db);
if (!s.ok()) {
return s;
}
assert(db);
// 6. Find the handle of the Column Family that this will compact
ColumnFamilyHandle* cfh = nullptr;
for (auto* handle : handles) {
if (compaction_input.cf_name == handle->GetName()) {
cfh = handle;
break;
}
}
assert(cfh);
// 7. Run the compaction without installation.
// Output will be stored in the directory specified by output_directory
CompactionServiceResult compaction_result;
DBImplSecondary* db_secondary = static_cast_with_check<DBImplSecondary>(db);
assert(handles.size() > 0);
s = db_secondary->CompactWithoutInstallation(
options, handles[0], compaction_input, &compaction_result);
s = db_secondary->CompactWithoutInstallation(options, cfh, compaction_input,
&compaction_result);
// 8. Serialize the result
Status serialization_status = compaction_result.Write(output);
// 9. Close the db and return
for (auto& handle : handles) {
delete handle;
}
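Illustrative only, not in this diff: a hedged sketch of the worker-side call into DB::OpenAndCompact after this rewrite. Only pointer-typed options still have to be supplied through the override struct; everything else now comes from the OPTIONS file referenced by the serialized input. db_name, output_dir and serialized_input are placeholders for values delivered by the compaction service transport.

#include "rocksdb/db.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
using namespace ROCKSDB_NAMESPACE;

Status RemoteCompactionWorkerExample(const std::string& db_name,
                                     const std::string& output_dir,
                                     const std::string& serialized_input,
                                     std::string* result) {
  CompactionServiceOptionsOverride override_opts;
  override_opts.env = Env::Default();
  override_opts.table_factory.reset(NewBlockBasedTableFactory());
  override_opts.statistics = CreateDBStatistics();
  // Opens the DB as a secondary, runs the compaction without installing the
  // result, and serializes the output files' metadata into *result.
  return DB::OpenAndCompact(OpenAndCompactOptions(), db_name, output_dir,
                            serialized_input, result, override_opts);
}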

View File

@ -656,7 +656,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
if (!io_s.ok()) {
// Check WriteToWAL status
IOStatusCheck(io_s);
WALIOStatusCheck(io_s);
}
if (!w.CallbackFailed()) {
if (!io_s.ok()) {
@ -687,7 +687,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
}
}
// Note: if we are to resume after non-OK statuses we need to revisit how
// we reacts to non-OK statuses here.
// we react to non-OK statuses here.
versions_->SetLastSequence(last_sequence);
}
MemTableInsertStatusCheck(w.status);
@ -735,17 +735,6 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
size_t total_byte_size = 0;
if (w.status.ok()) {
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : wal_write_group) {
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
SequenceNumber next_sequence = current_sequence;
for (auto* writer : wal_write_group) {
assert(writer);
@ -760,6 +749,22 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
}
}
}
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : wal_write_group) {
if (writer->CallbackFailed()) {
// When optimistic txn conflict checking fails, we should
// not record to trace.
continue;
}
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
if (w.disable_wal) {
has_unpersisted_data_.store(true, std::memory_order_relaxed);
}
@ -799,7 +804,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
if (!io_s.ok()) {
// Check WriteToWAL status
IOStatusCheck(io_s);
WALIOStatusCheck(io_s);
} else if (!w.CallbackFailed()) {
WriteStatusCheck(w.status);
}
@ -969,21 +974,17 @@ Status DBImpl::WriteImplWALOnly(
assert(w.state == WriteThread::STATE_GROUP_LEADER);
if (publish_last_seq == kDoPublishLastSeq) {
Status status;
// Currently we only use kDoPublishLastSeq in unordered_write
assert(immutable_db_options_.unordered_write);
WriteContext write_context;
if (error_handler_.IsDBStopped()) {
status = error_handler_.GetBGError();
}
// TODO(myabandeh): Make preliminary checks thread-safe so we could do them
// without paying the cost of obtaining the mutex.
if (status.ok()) {
LogContext log_context;
status = PreprocessWrite(write_options, &log_context, &write_context);
WriteStatusCheckOnLocked(status);
}
LogContext log_context;
WriteContext write_context;
Status status =
PreprocessWrite(write_options, &log_context, &write_context);
WriteStatusCheckOnLocked(status);
if (!status.ok()) {
WriteThread::WriteGroup write_group;
write_thread->EnterAsBatchGroupLeader(&w, &write_group);
@ -1009,19 +1010,6 @@ Status DBImpl::WriteImplWALOnly(
WriteThread::WriteGroup write_group;
uint64_t last_sequence;
write_thread->EnterAsBatchGroupLeader(&w, &write_group);
// Note: no need to update last_batch_group_size_ here since the batch writes
// to WAL only
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : write_group) {
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
size_t pre_release_callback_cnt = 0;
size_t total_byte_size = 0;
@ -1036,6 +1024,23 @@ Status DBImpl::WriteImplWALOnly(
}
}
// Note: no need to update last_batch_group_size_ here since the batch writes
// to WAL only
// TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
// grabs but does not seem thread-safe.
if (tracer_) {
InstrumentedMutexLock lock(&trace_mutex_);
if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
for (auto* writer : write_group) {
if (writer->CallbackFailed()) {
continue;
}
// TODO: maybe handle the tracing status?
tracer_->Write(writer->batch).PermitUncheckedError();
}
}
}
const bool concurrent_update = true;
// Update stats while we are an exclusive group leader, so we know
// that nobody else can be writing to these particular stats.
@ -1081,7 +1086,7 @@ Status DBImpl::WriteImplWALOnly(
// This error checking and return is moved up to avoid using uninitialized
// last_sequence.
if (!io_s.ok()) {
IOStatusCheck(io_s);
WALIOStatusCheck(io_s);
write_thread->ExitAsBatchGroupLeader(write_group, status);
return status;
}
@ -1179,7 +1184,7 @@ void DBImpl::WriteStatusCheck(const Status& status) {
}
}
void DBImpl::IOStatusCheck(const IOStatus& io_status) {
void DBImpl::WALIOStatusCheck(const IOStatus& io_status) {
// Is setting bg_error_ enough here? This will at least stop
// compaction and fail any further writes.
if ((immutable_db_options_.paranoid_checks && !io_status.ok() &&
@ -1187,7 +1192,8 @@ void DBImpl::IOStatusCheck(const IOStatus& io_status) {
io_status.IsIOFenced()) {
mutex_.Lock();
// Maybe change the return status to void?
error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback);
error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback,
/*wal_related=*/true);
mutex_.Unlock();
} else {
// Force writable file to be continue writable.
@ -1484,9 +1490,14 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
if (!io_s.ok()) {
break;
}
io_s = log.writer->file()->Sync(opts, immutable_db_options_.use_fsync);
if (!io_s.ok()) {
break;
// If last sync failed on a later WAL, this could be a fully synced
// and closed WAL that just needs to be recorded as synced in the
// manifest.
if (auto* f = log.writer->file()) {
io_s = f->Sync(opts, immutable_db_options_.use_fsync);
if (!io_s.ok()) {
break;
}
}
}
}
@ -1599,6 +1610,8 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
Status DBImpl::WriteRecoverableState() {
mutex_.AssertHeld();
if (!cached_recoverable_state_empty_) {
// Only for write-prepared and write-unprepared.
assert(seq_per_batch_);
bool dont_care_bool;
SequenceNumber next_seq;
if (two_write_queues_) {
@ -1788,13 +1801,13 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
if (!immutable_db_options_.atomic_flush) {
FlushRequest flush_req;
GenerateFlushRequest({cfd}, FlushReason::kWalFull, &flush_req);
SchedulePendingFlush(flush_req);
EnqueuePendingFlush(flush_req);
}
}
if (immutable_db_options_.atomic_flush) {
FlushRequest flush_req;
GenerateFlushRequest(cfds, FlushReason::kWalFull, &flush_req);
SchedulePendingFlush(flush_req);
EnqueuePendingFlush(flush_req);
}
MaybeScheduleFlushOrCompaction();
}
@ -1880,13 +1893,13 @@ Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
FlushRequest flush_req;
GenerateFlushRequest({cfd}, FlushReason::kWriteBufferManager,
&flush_req);
SchedulePendingFlush(flush_req);
EnqueuePendingFlush(flush_req);
}
}
if (immutable_db_options_.atomic_flush) {
FlushRequest flush_req;
GenerateFlushRequest(cfds, FlushReason::kWriteBufferManager, &flush_req);
SchedulePendingFlush(flush_req);
EnqueuePendingFlush(flush_req);
}
MaybeScheduleFlushOrCompaction();
}
@ -2162,12 +2175,12 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) {
AssignAtomicFlushSeq(cfds);
FlushRequest flush_req;
GenerateFlushRequest(cfds, FlushReason::kWriteBufferFull, &flush_req);
SchedulePendingFlush(flush_req);
EnqueuePendingFlush(flush_req);
} else {
for (auto* cfd : cfds) {
FlushRequest flush_req;
GenerateFlushRequest({cfd}, FlushReason::kWriteBufferFull, &flush_req);
SchedulePendingFlush(flush_req);
EnqueuePendingFlush(flush_req);
}
}
MaybeScheduleFlushOrCompaction();
@ -2240,8 +2253,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
memtable_info.cf_name = cfd->GetName();
memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
memtable_info.num_entries = cfd->mem()->num_entries();
memtable_info.num_deletes = cfd->mem()->num_deletes();
memtable_info.num_entries = cfd->mem()->NumEntries();
memtable_info.num_deletes = cfd->mem()->NumDeletion();
if (!cfd->ioptions()->persist_user_defined_timestamps &&
cfd->user_comparator()->timestamp_size() > 0) {
const Slice& newest_udt = cfd->mem()->GetNewestUDT();
@ -2325,7 +2338,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
// We may have lost data from the WritableFileBuffer in-memory buffer for
// the current log, so treat it as a fatal error and set bg_error
if (!io_s.ok()) {
error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable);
error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable,
/*wal_related=*/true);
} else {
error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
}

View File

@ -27,12 +27,14 @@ class CorruptionFS : public FileSystemWrapper {
num_writable_file_errors_(0),
corruption_trigger_(INT_MAX),
read_count_(0),
corrupt_offset_(0),
corrupt_len_(0),
rnd_(300),
fs_buffer_(fs_buffer),
verify_read_(verify_read) {}
~CorruptionFS() override {
// Assert that the corruption was reset, which means it got triggered
assert(corruption_trigger_ == INT_MAX);
assert(corruption_trigger_ == INT_MAX || corrupt_len_ > 0);
}
const char* Name() const override { return "ErrorEnv"; }
@ -48,8 +50,10 @@ class CorruptionFS : public FileSystemWrapper {
}
void SetCorruptionTrigger(const int trigger) {
MutexLock l(&mutex_);
corruption_trigger_ = trigger;
read_count_ = 0;
corrupt_fname_.clear();
}
IOStatus NewRandomAccessFile(const std::string& fname,
@ -58,25 +62,31 @@ class CorruptionFS : public FileSystemWrapper {
IODebugContext* dbg) override {
class CorruptionRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
public:
CorruptionRandomAccessFile(CorruptionFS& fs,
CorruptionRandomAccessFile(CorruptionFS& fs, const std::string& fname,
std::unique_ptr<FSRandomAccessFile>& file)
: FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
: FSRandomAccessFileOwnerWrapper(std::move(file)),
fs_(fs),
fname_(fname) {}
IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts,
Slice* result, char* scratch,
IODebugContext* dbg) const override {
IOStatus s = target()->Read(offset, len, opts, result, scratch, dbg);
if (opts.verify_and_reconstruct_read) {
fs_.MaybeResetOverlapWithCorruptedChunk(fname_, offset,
result->size());
return s;
}
MutexLock l(&fs_.mutex_);
if (s.ok() && ++fs_.read_count_ >= fs_.corruption_trigger_) {
fs_.read_count_ = 0;
fs_.corruption_trigger_ = INT_MAX;
char* data = const_cast<char*>(result->data());
std::memcpy(
data,
fs_.rnd_.RandomString(static_cast<int>(result->size())).c_str(),
result->size());
fs_.SetCorruptedChunk(fname_, offset, result->size());
}
return s;
}
@ -101,14 +111,76 @@ class CorruptionFS : public FileSystemWrapper {
return IOStatus::OK();
}
IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/,
const IOOptions& /*options*/,
IODebugContext* /*dbg*/) override {
return IOStatus::NotSupported("Prefetch");
}
private:
CorruptionFS& fs_;
std::string fname_;
};
std::unique_ptr<FSRandomAccessFile> file;
IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
EXPECT_OK(s);
result->reset(new CorruptionRandomAccessFile(*this, file));
result->reset(new CorruptionRandomAccessFile(*this, fname, file));
return s;
}
IOStatus NewSequentialFile(const std::string& fname,
const FileOptions& file_opts,
std::unique_ptr<FSSequentialFile>* result,
IODebugContext* dbg) override {
class CorruptionSequentialFile : public FSSequentialFileOwnerWrapper {
public:
CorruptionSequentialFile(CorruptionFS& fs, const std::string& fname,
std::unique_ptr<FSSequentialFile>& file)
: FSSequentialFileOwnerWrapper(std::move(file)),
fs_(fs),
fname_(fname),
offset_(0) {}
IOStatus Read(size_t len, const IOOptions& opts, Slice* result,
char* scratch, IODebugContext* dbg) override {
IOStatus s = target()->Read(len, opts, result, scratch, dbg);
if (result->size() == 0 ||
fname_.find("IDENTITY") != std::string::npos) {
return s;
}
if (opts.verify_and_reconstruct_read) {
fs_.MaybeResetOverlapWithCorruptedChunk(fname_, offset_,
result->size());
return s;
}
MutexLock l(&fs_.mutex_);
if (s.ok() && ++fs_.read_count_ >= fs_.corruption_trigger_) {
fs_.corruption_trigger_ = INT_MAX;
char* data = const_cast<char*>(result->data());
std::memcpy(
data,
fs_.rnd_.RandomString(static_cast<int>(result->size())).c_str(),
result->size());
fs_.SetCorruptedChunk(fname_, offset_, result->size());
}
offset_ += result->size();
return s;
}
private:
CorruptionFS& fs_;
std::string fname_;
size_t offset_;
};
std::unique_ptr<FSSequentialFile> file;
IOStatus s = target()->NewSequentialFile(fname, file_opts, &file, dbg);
EXPECT_OK(s);
result->reset(new CorruptionSequentialFile(*this, fname, file));
return s;
}
@ -123,12 +195,40 @@ class CorruptionFS : public FileSystemWrapper {
}
}
void SetCorruptedChunk(const std::string& fname, size_t offset, size_t len) {
assert(corrupt_fname_.empty());
corrupt_fname_ = fname;
corrupt_offset_ = offset;
corrupt_len_ = len;
}
void MaybeResetOverlapWithCorruptedChunk(const std::string& fname,
size_t offset, size_t len) {
if (fname == corrupt_fname_ &&
((offset <= corrupt_offset_ && (offset + len) > corrupt_offset_) ||
(offset >= corrupt_offset_ &&
offset < (corrupt_offset_ + corrupt_len_)))) {
corrupt_fname_.clear();
}
}
bool VerifyRetry() { return corrupt_len_ > 0 && corrupt_fname_.empty(); }
int read_count() { return read_count_; }
int corruption_trigger() { return corruption_trigger_; }
private:
int corruption_trigger_;
int read_count_;
std::string corrupt_fname_;
size_t corrupt_offset_;
size_t corrupt_len_;
Random rnd_;
bool fs_buffer_;
bool verify_read_;
port::Mutex mutex_;
};
} // anonymous namespace
@ -705,6 +805,7 @@ class DBIOCorruptionTest
DBIOCorruptionTest() : DBIOFailureTest() {
BlockBasedTableOptions bbto;
options_ = CurrentOptions();
options_.statistics = CreateDBStatistics();
base_env_ = env_;
EXPECT_NE(base_env_, nullptr);
@ -716,6 +817,7 @@ class DBIOCorruptionTest
bbto.num_file_reads_for_auto_readahead = 0;
options_.table_factory.reset(NewBlockBasedTableFactory(bbto));
options_.disable_auto_compactions = true;
options_.max_file_opening_threads = 0;
Reopen(options_);
}
@ -727,6 +829,8 @@ class DBIOCorruptionTest
Status ReopenDB() { return TryReopen(options_); }
Statistics* stats() { return options_.statistics.get(); }
protected:
std::unique_ptr<Env> env_guard_;
std::shared_ptr<CorruptionFS> fs_;
@ -749,8 +853,12 @@ TEST_P(DBIOCorruptionTest, GetReadCorruptionRetry) {
if (std::get<2>(GetParam())) {
ASSERT_OK(s);
ASSERT_EQ(val, "val1");
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else {
ASSERT_TRUE(s.IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
}
@ -773,8 +881,12 @@ TEST_P(DBIOCorruptionTest, IterReadCorruptionRetry) {
}
if (std::get<2>(GetParam())) {
ASSERT_OK(iter->status());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else {
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
delete iter;
}
@ -799,9 +911,13 @@ TEST_P(DBIOCorruptionTest, MultiGetReadCorruptionRetry) {
if (std::get<2>(GetParam())) {
ASSERT_EQ(values[0].ToString(), "val1");
ASSERT_EQ(values[1].ToString(), "val2");
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else {
ASSERT_TRUE(statuses[0].IsCorruption());
ASSERT_TRUE(statuses[1].IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
}
@ -818,6 +934,9 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
if (std::get<2>(GetParam())) {
ASSERT_OK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
std::string val;
ReadOptions ro;
@ -826,6 +945,7 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
ASSERT_EQ(val, "val1");
} else {
ASSERT_TRUE(s.IsCorruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
}
@ -838,6 +958,9 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
Status s = Flush();
if (std::get<2>(GetParam())) {
ASSERT_OK(s);
ASSERT_GT(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_GT(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
std::string val;
ReadOptions ro;
@ -846,6 +969,7 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
ASSERT_EQ(val, "val1");
} else {
ASSERT_NOK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
}
@ -862,12 +986,142 @@ TEST_P(DBIOCorruptionTest, ManifestCorruptionRetry) {
if (std::get<2>(GetParam())) {
ASSERT_OK(ReopenDB());
ASSERT_GT(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_GT(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
} else {
ASSERT_EQ(ReopenDB(), Status::Corruption());
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
SyncPoint::GetInstance()->DisableProcessing();
}
TEST_P(DBIOCorruptionTest, FooterReadCorruptionRetry) {
Random rnd(300);
bool retry = false;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"ReadFooterFromFileInternal:0", [&](void* arg) {
Slice* data = static_cast<Slice*>(arg);
if (!retry) {
std::memcpy(const_cast<char*>(data->data()),
rnd.RandomString(static_cast<int>(data->size())).c_str(),
data->size());
retry = true;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(Put("key1", "val1"));
Status s = Flush();
if (std::get<2>(GetParam())) {
ASSERT_OK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
std::string val;
ReadOptions ro;
ro.async_io = std::get<1>(GetParam());
ASSERT_OK(dbfull()->Get(ro, "key1", &val));
ASSERT_EQ(val, "val1");
} else {
ASSERT_NOK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
ASSERT_GT(stats()->getTickerCount(SST_FOOTER_CORRUPTION_COUNT), 0);
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_P(DBIOCorruptionTest, TablePropertiesCorruptionRetry) {
Random rnd(300);
bool retry = false;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"ReadTablePropertiesHelper:0", [&](void* arg) {
Slice* data = static_cast<Slice*>(arg);
if (!retry) {
std::memcpy(const_cast<char*>(data->data()),
rnd.RandomString(static_cast<int>(data->size())).c_str(),
data->size());
retry = true;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(Put("key1", "val1"));
Status s = Flush();
if (std::get<2>(GetParam())) {
ASSERT_OK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
1);
std::string val;
ReadOptions ro;
ro.async_io = std::get<1>(GetParam());
ASSERT_OK(dbfull()->Get(ro, "key1", &val));
ASSERT_EQ(val, "val1");
} else {
ASSERT_NOK(s);
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_P(DBIOCorruptionTest, DBOpenReadCorruptionRetry) {
if (!std::get<2>(GetParam())) {
return;
}
CorruptionFS* fs =
static_cast<CorruptionFS*>(env_guard_->GetFileSystem().get());
for (int sst = 0; sst < 3; ++sst) {
for (int key = 0; key < 100; ++key) {
std::stringstream ss;
ss << std::setw(3) << 100 * sst + key;
ASSERT_OK(Put("key" + ss.str(), "val" + ss.str()));
}
ASSERT_OK(Flush());
}
Close();
// DB open will create table readers unless we reduce the table cache
// capacity.
// SanitizeOptions will set max_open_files to a minimum of 20. The table
// cache is allocated with max_open_files - 10 as its capacity, so override
// max_open_files to 11 to make the table cache capacity 1. This prevents
// files from being opened during DB open and forces them to be opened
// during the subsequent reads.
SyncPoint::GetInstance()->SetCallBack(
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
int* max_open_files = (int*)arg;
*max_open_files = 11;
});
SyncPoint::GetInstance()->EnableProcessing();
// Progressively increase the IO count trigger for corruption, and verify
// that it was retried
int corruption_trigger = 1;
fs->SetCorruptionTrigger(corruption_trigger);
do {
fs->SetCorruptionTrigger(corruption_trigger);
ASSERT_OK(ReopenDB());
for (int sst = 0; sst < 3; ++sst) {
for (int key = 0; key < 100; ++key) {
std::stringstream ss;
ss << std::setw(3) << 100 * sst + key;
ASSERT_EQ(Get("key" + ss.str()), "val" + ss.str());
}
}
// Verify that the injected corruption was repaired
ASSERT_TRUE(fs->VerifyRetry());
corruption_trigger++;
} while (fs->corruption_trigger() == INT_MAX);
}
// The parameters are - 1. Use FS provided buffer, 2. Use async IO ReadOption,
// 3. Retry with verify_and_reconstruct_read IOOption
INSTANTIATE_TEST_CASE_P(DBIOCorruptionTest, DBIOCorruptionTest,

View File

@ -52,7 +52,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
user_comparator_(cmp),
merge_operator_(ioptions.merge_operator.get()),
iter_(iter),
version_(version),
blob_reader_(version, read_options.read_tier,
read_options.verify_checksums, read_options.fill_cache,
read_options.io_activity),
read_callback_(read_callback),
sequence_(s),
statistics_(ioptions.stats),
@ -65,20 +67,16 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
valid_(false),
current_entry_is_merged_(false),
is_key_seqnum_zero_(false),
prefix_same_as_start_(mutable_cf_options.prefix_extractor
? read_options.prefix_same_as_start
: false),
prefix_same_as_start_(
prefix_extractor_ ? read_options.prefix_same_as_start : false),
pin_thru_lifetime_(read_options.pin_data),
expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
read_options.total_order_seek ||
read_options.auto_prefix_mode),
read_tier_(read_options.read_tier),
fill_cache_(read_options.fill_cache),
verify_checksums_(read_options.verify_checksums),
expose_blob_index_(expose_blob_index),
allow_unprepared_value_(read_options.allow_unprepared_value),
is_blob_(false),
arena_mode_(arena_mode),
io_activity_(read_options.io_activity),
cfh_(cfh),
timestamp_ub_(read_options.timestamp),
timestamp_lb_(read_options.iter_start_ts),
@ -93,6 +91,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
status_.PermitUncheckedError();
assert(timestamp_size_ ==
user_comparator_.user_comparator()->timestamp_size());
// prefix_seek_opt_in_only should force total_order_seek wherever the caller
// is duplicating the original ReadOptions
assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
}
Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
@ -149,7 +150,7 @@ void DBIter::Next() {
PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_);
// Release temporarily pinned blocks from last operation
ReleaseTempPinnedData();
ResetBlobValue();
ResetBlobData();
ResetValueAndColumns();
local_stats_.skip_count_ += num_internal_keys_skipped_;
local_stats_.skip_count_--;
@ -192,29 +193,21 @@ void DBIter::Next() {
}
}
bool DBIter::SetBlobValueIfNeeded(const Slice& user_key,
const Slice& blob_index) {
assert(!is_blob_);
Status DBIter::BlobReader::RetrieveAndSetBlobValue(const Slice& user_key,
const Slice& blob_index) {
assert(blob_value_.empty());
if (expose_blob_index_) { // Stacked BlobDB implementation
is_blob_ = true;
return true;
}
if (!version_) {
status_ = Status::Corruption("Encountered unexpected blob index.");
valid_ = false;
return false;
return Status::Corruption("Encountered unexpected blob index.");
}
// TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to
// avoid having to copy options back and forth.
// TODO: plumb Env::IOActivity, Env::IOPriority
// TODO: plumb Env::IOPriority
ReadOptions read_options;
read_options.read_tier = read_tier_;
read_options.fill_cache = fill_cache_;
read_options.verify_checksums = verify_checksums_;
read_options.fill_cache = fill_cache_;
read_options.io_activity = io_activity_;
constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
constexpr uint64_t* bytes_read = nullptr;
@ -222,16 +215,51 @@ bool DBIter::SetBlobValueIfNeeded(const Slice& user_key,
const Status s = version_->GetBlob(read_options, user_key, blob_index,
prefetch_buffer, &blob_value_, bytes_read);
if (!s.ok()) {
return s;
}
return Status::OK();
}
bool DBIter::SetValueAndColumnsFromBlobImpl(const Slice& user_key,
const Slice& blob_index) {
const Status s = blob_reader_.RetrieveAndSetBlobValue(user_key, blob_index);
if (!s.ok()) {
status_ = s;
valid_ = false;
is_blob_ = false;
return false;
}
is_blob_ = true;
SetValueAndColumnsFromPlain(blob_reader_.GetBlobValue());
return true;
}
bool DBIter::SetValueAndColumnsFromBlob(const Slice& user_key,
const Slice& blob_index) {
assert(!is_blob_);
is_blob_ = true;
if (expose_blob_index_) {
SetValueAndColumnsFromPlain(blob_index);
return true;
}
if (allow_unprepared_value_) {
assert(value_.empty());
assert(wide_columns_.empty());
assert(lazy_blob_index_.empty());
lazy_blob_index_ = blob_index;
return true;
}
return SetValueAndColumnsFromBlobImpl(user_key, blob_index);
}
bool DBIter::SetValueAndColumnsFromEntity(Slice slice) {
assert(value_.empty());
assert(wide_columns_.empty());
@ -277,6 +305,24 @@ bool DBIter::SetValueAndColumnsFromMergeResult(const Status& merge_status,
return true;
}
bool DBIter::PrepareValue() {
assert(valid_);
if (lazy_blob_index_.empty()) {
return true;
}
assert(allow_unprepared_value_);
assert(is_blob_);
const bool result =
SetValueAndColumnsFromBlobImpl(saved_key_.GetUserKey(), lazy_blob_index_);
lazy_blob_index_.clear();
return result;
}
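A reviewer-side sketch (not part of this change) of the user-facing flow this enables, assuming a column family with blob files enabled so that some values are stored out-of-line:

#include <memory>
#include <string>
#include <vector>
#include "rocksdb/db.h"
using namespace ROCKSDB_NAMESPACE;

// Hypothetical application predicate: only values for "hot" keys are needed.
static bool ShouldLoadValue(const Slice& key) { return key.starts_with("hot_"); }

void LazyBlobScanExample(DB* db, std::vector<std::string>* loaded_values) {
  ReadOptions ro;
  ro.allow_unprepared_value = true;  // defer blob retrieval to PrepareValue()
  std::unique_ptr<Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    if (!ShouldLoadValue(it->key())) {
      continue;  // the blob is never read for this key
    }
    if (!it->PrepareValue()) {  // loads the blob value on demand
      break;                    // failure invalidates the iterator; see it->status()
    }
    loaded_values->push_back(it->value().ToString());
  }
}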
// PRE: saved_key_ has the current user key if skipping_saved_key
// POST: saved_key_ should have the next user key if valid_,
// if the current entry is a result of merge
@ -406,7 +452,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
case kTypeValuePreferredSeqno:
case kTypeBlobIndex:
case kTypeWideColumnEntity:
if (!PrepareValue()) {
if (!PrepareValueInternal()) {
return false;
}
if (timestamp_lb_) {
@ -418,12 +464,9 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
}
if (ikey_.type == kTypeBlobIndex) {
if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) {
if (!SetValueAndColumnsFromBlob(ikey_.user_key, iter_.value())) {
return false;
}
SetValueAndColumnsFromPlain(expose_blob_index_ ? iter_.value()
: blob_value_);
} else if (ikey_.type == kTypeWideColumnEntity) {
if (!SetValueAndColumnsFromEntity(iter_.value())) {
return false;
@ -443,7 +486,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
return true;
break;
case kTypeMerge:
if (!PrepareValue()) {
if (!PrepareValueInternal()) {
return false;
}
saved_key_.SetUserKey(
@ -538,6 +581,8 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
} else {
iter_.Next();
}
// This could be a long-running operation due to tombstones, etc.
ROCKSDB_THREAD_YIELD_HOOK();
} while (iter_.Valid());
valid_ = false;
@ -588,7 +633,7 @@ bool DBIter::MergeValuesNewToOld() {
iter_.Next();
break;
}
if (!PrepareValue()) {
if (!PrepareValueInternal()) {
return false;
}
@ -617,23 +662,9 @@ bool DBIter::MergeValuesNewToOld() {
iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
PERF_COUNTER_ADD(internal_merge_count, 1);
} else if (kTypeBlobIndex == ikey.type) {
if (expose_blob_index_) {
status_ =
Status::NotSupported("BlobDB does not support merge operator.");
valid_ = false;
if (!MergeWithBlobBaseValue(iter_.value(), ikey.user_key)) {
return false;
}
// hit a put, merge the put value with operands and store the
// final result in saved_value_. We are done!
if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
return false;
}
valid_ = true;
if (!MergeWithPlainBaseValue(blob_value_, ikey.user_key)) {
return false;
}
ResetBlobValue();
// iter_ is positioned after put
iter_.Next();
@ -641,6 +672,7 @@ bool DBIter::MergeValuesNewToOld() {
valid_ = false;
return false;
}
return true;
} else if (kTypeWideColumnEntity == ikey.type) {
if (!MergeWithWideColumnBaseValue(iter_.value(), ikey.user_key)) {
@ -687,7 +719,7 @@ void DBIter::Prev() {
PERF_COUNTER_ADD(iter_prev_count, 1);
PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_);
ReleaseTempPinnedData();
ResetBlobValue();
ResetBlobData();
ResetValueAndColumns();
ResetInternalKeysSkippedCounter();
bool ok = true;
@ -924,7 +956,7 @@ bool DBIter::FindValueForCurrentKey() {
return FindValueForCurrentKeyUsingSeek();
}
if (!PrepareValue()) {
if (!PrepareValueInternal()) {
return false;
}
@ -1039,21 +1071,9 @@ bool DBIter::FindValueForCurrentKey() {
}
return true;
} else if (last_not_merge_type == kTypeBlobIndex) {
if (expose_blob_index_) {
status_ =
Status::NotSupported("BlobDB does not support merge operator.");
valid_ = false;
if (!MergeWithBlobBaseValue(pinned_value_, saved_key_.GetUserKey())) {
return false;
}
if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
return false;
}
valid_ = true;
if (!MergeWithPlainBaseValue(blob_value_, saved_key_.GetUserKey())) {
return false;
}
ResetBlobValue();
return true;
} else if (last_not_merge_type == kTypeWideColumnEntity) {
@ -1078,13 +1098,9 @@ bool DBIter::FindValueForCurrentKey() {
break;
case kTypeBlobIndex:
if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
if (!SetValueAndColumnsFromBlob(saved_key_.GetUserKey(), pinned_value_)) {
return false;
}
SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
: blob_value_);
break;
case kTypeWideColumnEntity:
if (!SetValueAndColumnsFromEntity(pinned_value_)) {
@ -1171,7 +1187,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
}
return true;
}
if (!PrepareValue()) {
if (!PrepareValueInternal()) {
return false;
}
if (timestamp_size_ > 0) {
@ -1188,12 +1204,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
pinned_value_ = iter_.value();
}
if (ikey.type == kTypeBlobIndex) {
if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) {
if (!SetValueAndColumnsFromBlob(ikey.user_key, pinned_value_)) {
return false;
}
SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
: blob_value_);
} else if (ikey.type == kTypeWideColumnEntity) {
if (!SetValueAndColumnsFromEntity(pinned_value_)) {
return false;
@ -1241,7 +1254,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
ikey.type == kTypeDeletionWithTimestamp) {
break;
}
if (!PrepareValue()) {
if (!PrepareValueInternal()) {
return false;
}
@ -1259,21 +1272,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
PERF_COUNTER_ADD(internal_merge_count, 1);
} else if (ikey.type == kTypeBlobIndex) {
if (expose_blob_index_) {
status_ =
Status::NotSupported("BlobDB does not support merge operator.");
valid_ = false;
if (!MergeWithBlobBaseValue(iter_.value(), saved_key_.GetUserKey())) {
return false;
}
if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
return false;
}
valid_ = true;
if (!MergeWithPlainBaseValue(blob_value_, saved_key_.GetUserKey())) {
return false;
}
ResetBlobValue();
return true;
} else if (ikey.type == kTypeWideColumnEntity) {
@ -1340,6 +1341,35 @@ bool DBIter::MergeWithPlainBaseValue(const Slice& value,
return SetValueAndColumnsFromMergeResult(s, result_type);
}
bool DBIter::MergeWithBlobBaseValue(const Slice& blob_index,
const Slice& user_key) {
assert(!is_blob_);
if (expose_blob_index_) {
status_ =
Status::NotSupported("Legacy BlobDB does not support merge operator.");
valid_ = false;
return false;
}
const Status s = blob_reader_.RetrieveAndSetBlobValue(user_key, blob_index);
if (!s.ok()) {
status_ = s;
valid_ = false;
return false;
}
valid_ = true;
if (!MergeWithPlainBaseValue(blob_reader_.GetBlobValue(), user_key)) {
return false;
}
blob_reader_.ResetBlobValue();
return true;
}
bool DBIter::MergeWithWideColumnBaseValue(const Slice& entity,
const Slice& user_key) {
// `op_failure_scope` (an output parameter) is not provided (set to nullptr)
@ -1529,7 +1559,7 @@ void DBIter::Seek(const Slice& target) {
status_ = Status::OK();
ReleaseTempPinnedData();
ResetBlobValue();
ResetBlobData();
ResetValueAndColumns();
ResetInternalKeysSkippedCounter();
@ -1605,7 +1635,7 @@ void DBIter::SeekForPrev(const Slice& target) {
status_ = Status::OK();
ReleaseTempPinnedData();
ResetBlobValue();
ResetBlobData();
ResetValueAndColumns();
ResetInternalKeysSkippedCounter();
@ -1666,7 +1696,7 @@ void DBIter::SeekToFirst() {
status_.PermitUncheckedError();
direction_ = kForward;
ReleaseTempPinnedData();
ResetBlobValue();
ResetBlobData();
ResetValueAndColumns();
ResetInternalKeysSkippedCounter();
ClearSavedValue();
@ -1729,7 +1759,7 @@ void DBIter::SeekToLast() {
status_.PermitUncheckedError();
direction_ = kReverse;
ReleaseTempPinnedData();
ResetBlobValue();
ResetBlobData();
ResetValueAndColumns();
ResetInternalKeysSkippedCounter();
ClearSavedValue();

View File

@ -218,7 +218,34 @@ class DBIter final : public Iterator {
}
void set_valid(bool v) { valid_ = v; }
bool PrepareValue() override;
private:
class BlobReader {
public:
BlobReader(const Version* version, ReadTier read_tier,
bool verify_checksums, bool fill_cache,
Env::IOActivity io_activity)
: version_(version),
read_tier_(read_tier),
verify_checksums_(verify_checksums),
fill_cache_(fill_cache),
io_activity_(io_activity) {}
const Slice& GetBlobValue() const { return blob_value_; }
Status RetrieveAndSetBlobValue(const Slice& user_key,
const Slice& blob_index);
void ResetBlobValue() { blob_value_.Reset(); }
private:
PinnableSlice blob_value_;
const Version* version_;
ReadTier read_tier_;
bool verify_checksums_;
bool fill_cache_;
Env::IOActivity io_activity_;
};
// For all methods in this block:
// PRE: iter_->Valid() && status_.ok()
// Return false if there was an error, and status() is non-ok, valid_ = false;
@ -299,15 +326,6 @@ class DBIter final : public Iterator {
: user_comparator_.CompareWithoutTimestamp(a, b);
}
// Retrieves the blob value for the specified user key using the given blob
// index when using the integrated BlobDB implementation.
bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index);
void ResetBlobValue() {
is_blob_ = false;
blob_value_.Reset();
}
void SetValueAndColumnsFromPlain(const Slice& slice) {
assert(value_.empty());
assert(wide_columns_.empty());
@ -316,6 +334,11 @@ class DBIter final : public Iterator {
wide_columns_.emplace_back(kDefaultWideColumnName, slice);
}
bool SetValueAndColumnsFromBlobImpl(const Slice& user_key,
const Slice& blob_index);
bool SetValueAndColumnsFromBlob(const Slice& user_key,
const Slice& blob_index);
bool SetValueAndColumnsFromEntity(Slice slice);
bool SetValueAndColumnsFromMergeResult(const Status& merge_status,
@ -326,14 +349,21 @@ class DBIter final : public Iterator {
wide_columns_.clear();
}
void ResetBlobData() {
blob_reader_.ResetBlobValue();
lazy_blob_index_.clear();
is_blob_ = false;
}
// The following methods perform the actual merge operation for the
// no base value/plain base value/wide-column base value cases.
// no/plain/blob/wide-column base value cases.
// If user-defined timestamp is enabled, `user_key` includes timestamp.
bool MergeWithNoBaseValue(const Slice& user_key);
bool MergeWithPlainBaseValue(const Slice& value, const Slice& user_key);
bool MergeWithBlobBaseValue(const Slice& blob_index, const Slice& user_key);
bool MergeWithWideColumnBaseValue(const Slice& entity, const Slice& user_key);
bool PrepareValue() {
bool PrepareValueInternal() {
if (!iter_.PrepareValue()) {
assert(!iter_.status().ok());
valid_ = false;
@ -356,7 +386,7 @@ class DBIter final : public Iterator {
UserComparatorWrapper user_comparator_;
const MergeOperator* const merge_operator_;
IteratorWrapper iter_;
const Version* version_;
BlobReader blob_reader_;
ReadCallback* read_callback_;
// Max visible sequence number. It is normally the snapshot seq unless we have
// uncommitted data in db as in WriteUnCommitted.
@ -376,7 +406,6 @@ class DBIter final : public Iterator {
std::string saved_value_;
Slice pinned_value_;
// for prefix seek mode to support prev()
PinnableSlice blob_value_;
// Value of the default column
Slice value_;
// All columns (i.e. name-value pairs)
@ -410,15 +439,13 @@ class DBIter final : public Iterator {
// Expect the inner iterator to maintain a total order.
// prefix_extractor_ must be non-NULL if the value is false.
const bool expect_total_order_inner_iter_;
ReadTier read_tier_;
bool fill_cache_;
bool verify_checksums_;
// Whether the iterator is allowed to expose blob references. Set to true when
// the stacked BlobDB implementation is used, false otherwise.
bool expose_blob_index_;
bool allow_unprepared_value_;
Slice lazy_blob_index_;
bool is_blob_;
bool arena_mode_;
const Env::IOActivity io_activity_;
// List of operands for merge operator.
MergeContext merge_context_;
LocalStatistics local_stats_;
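A minimal sketch of the application-side contract these members support, assuming the ReadOptions::allow_unprepared_value flag and the Iterator::PrepareValue() API declared above (db, WantValue, and Use are hypothetical placeholders):
ReadOptions ro;
ro.allow_unprepared_value = true;  // keys may be exposed before their (blob) values are loaded
std::unique_ptr<Iterator> it(db->NewIterator(ro));
for (it->SeekToFirst(); it->Valid(); it->Next()) {
  if (!WantValue(it->key())) {
    continue;  // value (and any blob I/O) is skipped entirely for this key
  }
  if (!it->PrepareValue()) {
    break;  // load failed; it->status() holds the error and the iterator is invalidated
  }
  Use(it->key(), it->value());
}
// As usual, check it->status() after the loop.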

View File

@ -684,13 +684,14 @@ class DbMemtableKVChecksumTest : public DbKvChecksumTest {
DbMemtableKVChecksumTest() : DbKvChecksumTest() {}
protected:
const size_t kValueLenOffset = 12;
// Indices in the memtable entry that we will not corrupt.
// For memtable entry format, see comments in MemTable::Add().
// We do not corrupt the key length and value length fields in this test
// case since doing so causes a segfault and ASAN will complain.
// For this test case, the key and value are each of length 3, so the
// key length field is at index 0 and the value length field is at index 12.
const std::set<size_t> index_not_to_corrupt{0, 12};
const std::set<size_t> index_not_to_corrupt{0, kValueLenOffset};
void SkipNotToCorruptEntry() {
if (index_not_to_corrupt.find(corrupt_byte_offset_) !=
@ -737,6 +738,8 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
buf[corrupt_byte_offset_] += corrupt_byte_addend_;
++corrupt_byte_offset_;
});
// Corrupt value only so that MultiGet below can find the key.
corrupt_byte_offset_ = kValueLenOffset + 1;
SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions();
options.memtable_protection_bytes_per_key =
@ -745,12 +748,17 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
options.merge_operator = MergeOperators::CreateStringAppendOperator();
}
std::string key = "key";
SkipNotToCorruptEntry();
while (MoreBytesToCorrupt()) {
Reopen(options);
ASSERT_OK(ExecuteWrite(nullptr));
std::string val;
ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption());
ASSERT_TRUE(db_->Get(ReadOptions(), key, &val).IsCorruption());
std::vector<std::string> vals = {val};
std::vector<Status> statuses = db_->MultiGet(
ReadOptions(), {db_->DefaultColumnFamily()}, {key}, &vals, nullptr);
ASSERT_TRUE(statuses[0].IsCorruption());
Destroy(options);
SkipNotToCorruptEntry();
}

View File

@ -339,6 +339,91 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
}
}
TEST_F(DBMemTableTest, IntegrityChecks) {
// We insert keys key000000, key000001 and key000002 into skiplist at fixed
// height 1 (smallest height). Then we corrupt the second key to aey000001 to
// make it smaller. With `paranoid_memory_checks` set to true, if the
// skip list sees key000000 and then aey000001, then it will report out of
// order keys with corruption status. With `paranoid_memory_checks` set
// to false, read/scan may return wrong results.
for (bool allow_data_in_error : {false, true}) {
Options options = CurrentOptions();
options.allow_data_in_errors = allow_data_in_error;
options.paranoid_memory_checks = true;
DestroyAndReopen(options);
SyncPoint::GetInstance()->SetCallBack(
"InlineSkipList::RandomHeight::height", [](void* h) {
auto height_ptr = static_cast<int*>(h);
*height_ptr = 1;
});
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(Put(Key(0), "val0"));
ASSERT_OK(Put(Key(2), "val2"));
// p will point to the buffer for encoded key000001
char* p = nullptr;
SyncPoint::GetInstance()->SetCallBack(
"MemTable::Add:BeforeReturn:Encoded", [&](void* encoded) {
p = const_cast<char*>(static_cast<Slice*>(encoded)->data());
});
ASSERT_OK(Put(Key(1), "val1"));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
ASSERT_TRUE(p);
// Offset 0 is key size, key bytes start at offset 1.
// "key000001 -> aey000001"
p[1] = 'a';
ReadOptions rops;
std::string val;
Status s = db_->Get(rops, Key(1), &val);
ASSERT_TRUE(s.IsCorruption());
std::string key0 = Slice(Key(0)).ToString(true);
ASSERT_EQ(s.ToString().find(key0) != std::string::npos,
allow_data_in_error);
// Without `paranoid_memory_checks`, NotFound will be returned.
// This would fail an assertion in InlineSkipList::FindGreaterOrEqual().
// If we remove the assertion, this passes.
// ASSERT_TRUE(db_->Get(ReadOptions(), Key(1), &val).IsNotFound());
std::vector<std::string> vals;
std::vector<Status> statuses = db_->MultiGet(
rops, {db_->DefaultColumnFamily()}, {Key(1)}, &vals, nullptr);
ASSERT_TRUE(statuses[0].IsCorruption());
ASSERT_EQ(statuses[0].ToString().find(key0) != std::string::npos,
allow_data_in_error);
std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
ASSERT_OK(iter->status());
iter->Seek(Key(1));
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);
iter->Seek(Key(0));
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
// iterating through skip list at height at 1 should catch out-of-order keys
iter->Next();
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);
ASSERT_FALSE(iter->Valid());
iter->SeekForPrev(Key(2));
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);
// Internally DB Iter will iterate backwards (call Prev()) after
// SeekToLast() to find the correct internal key with the last user key.
// Prev() will do integrity checks and catch corruption.
iter->SeekToLast();
ASSERT_TRUE(iter->status().IsCorruption());
ASSERT_EQ(iter->status().ToString().find(key0) != std::string::npos,
allow_data_in_error);
ASSERT_FALSE(iter->Valid());
}
}
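Outside the test harness, the knobs exercised above are plain Options fields. A minimal hedged configuration sketch (the path is illustrative):
Options options;
options.create_if_missing = true;
options.paranoid_memory_checks = true;  // report out-of-order memtable keys as Corruption on reads/scans
options.allow_data_in_errors = true;    // include key data in the corruption message, as verified above
DB* db = nullptr;
Status s = DB::Open(options, "/tmp/paranoid_checks_demo", &db);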
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

View File

@ -56,6 +56,11 @@ class DBOptionsTest : public DBTestBase {
EXPECT_OK(GetStringFromMutableCFOptions(
config_options, MutableCFOptions(options), &options_str));
EXPECT_OK(StringToMap(options_str, &mutable_map));
for (auto& opt : TEST_GetImmutableInMutableCFOptions()) {
// Not yet mutable but migrated to MutableCFOptions in preparation for
// being mutable
mutable_map.erase(opt);
}
return mutable_map;
}
@ -231,21 +236,33 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) {
ASSERT_OK(dbfull()->SetOptions(
cfh, {{"table_factory.block_size", "16384"},
{"table_factory.block_restart_interval", "11"}}));
// Old c_bbto
ASSERT_EQ(c_bbto->block_size, 8192);
ASSERT_EQ(c_bbto->block_restart_interval, 7);
// New c_bbto
c_opts = dbfull()->GetOptions(cfh);
c_bbto = c_opts.table_factory->GetOptions<BlockBasedTableOptions>();
ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 11);
// Now set an option that is not mutable - options should not change
ASSERT_NOK(
dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}}));
// FIXME: find a way to make this fail again
// ASSERT_NOK(
// dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}}));
c_opts = dbfull()->GetOptions(cfh);
ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>());
ASSERT_EQ(c_bbto->no_block_cache, true);
ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 11);
// Set some that are mutable and some that are not - options should not change
ASSERT_NOK(dbfull()->SetOptions(
cfh, {{"table_factory.no_block_cache", "false"},
{"table_factory.block_size", "8192"},
{"table_factory.block_restart_interval", "7"}}));
// FIXME: find a way to make this fail again
// ASSERT_NOK(dbfull()->SetOptions(
// cfh, {{"table_factory.no_block_cache", "false"},
// {"table_factory.block_size", "8192"},
// {"table_factory.block_restart_interval", "7"}}));
c_opts = dbfull()->GetOptions(cfh);
ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>());
ASSERT_EQ(c_bbto->no_block_cache, true);
ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 11);
@ -256,6 +273,8 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) {
cfh, {{"table_factory.block_size", "8192"},
{"table_factory.does_not_exist", "true"},
{"table_factory.block_restart_interval", "7"}}));
c_opts = dbfull()->GetOptions(cfh);
ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>());
ASSERT_EQ(c_bbto->no_block_cache, true);
ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 11);
@ -271,6 +290,7 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) {
{"table_factory.block_restart_interval", "13"}}));
c_opts = dbfull()->GetOptions(cfh);
ASSERT_EQ(c_opts.blob_file_size, 32768);
c_bbto = c_opts.table_factory->GetOptions<BlockBasedTableOptions>();
ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 13);
// Set some on the table and a bad one on the ColumnFamily - options should
@ -279,6 +299,7 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) {
cfh, {{"table_factory.block_size", "1024"},
{"no_such_option", "32768"},
{"table_factory.block_restart_interval", "7"}}));
ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>());
ASSERT_EQ(c_bbto->block_size, 16384);
ASSERT_EQ(c_bbto->block_restart_interval, 13);
}
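For reference outside the test harness, a hedged sketch of the same string-keyed SetOptions path against a live DB (db is assumed to be an open handle; the values are illustrative):
Status s = db->SetOptions(db->DefaultColumnFamily(),
                          {{"table_factory.block_size", "16384"},
                           {"table_factory.block_restart_interval", "11"}});
assert(s.ok());
// Unknown entries (e.g. "table_factory.does_not_exist") cause the whole call
// to fail and leave the previously effective options in place, as exercised above.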

View File

@ -244,7 +244,7 @@ TEST_F(DBSecondaryTest, SimpleInternalCompaction) {
ASSERT_EQ(largest.user_key().ToString(), "foo");
ASSERT_EQ(result.output_level, 1);
ASSERT_EQ(result.output_path, this->secondary_path_);
ASSERT_EQ(result.num_output_records, 2);
ASSERT_EQ(result.stats.num_output_records, 2);
ASSERT_GT(result.bytes_written, 0);
ASSERT_OK(result.status);
}

View File

@ -383,12 +383,16 @@ TEST_F(DBSSTTest, DBWithSstFileManager) {
ASSERT_EQ(files_moved, 0);
Close();
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
Reopen(options);
ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
// Verify that we track all the files again after the DB is closed and opened
Close();
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
sst_file_manager.reset(NewSstFileManager(env_));
options.sst_file_manager = sst_file_manager;
sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
@ -439,6 +443,11 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
int64_t untracked_files = 0;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::OnUntrackFile",
[&](void* /*arg*/) { ++untracked_files; });
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
Options options = CurrentOptions();
@ -485,6 +494,10 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
}
ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
Close();
ASSERT_EQ(untracked_files, files_in_db.size());
untracked_files = 0;
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
Reopen(options);
ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
@ -492,6 +505,10 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
// Verify that we track all the files again after the DB is closed and opened.
Close();
ASSERT_EQ(untracked_files, files_in_db.size());
untracked_files = 0;
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
sst_file_manager.reset(NewSstFileManager(env_));
options.sst_file_manager = sst_file_manager;
@ -507,6 +524,27 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
ASSERT_EQ(files_deleted, 0);
ASSERT_EQ(files_scheduled_to_delete, 0);
Close();
ASSERT_EQ(untracked_files, files_in_db.size());
untracked_files = 0;
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options));
ASSERT_EQ(files_deleted, blob_files.size());
ASSERT_EQ(files_scheduled_to_delete, blob_files.size());
@ -649,6 +687,26 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {
}
Close();
ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options));
sfm->WaitForEmptyTrash();
ASSERT_EQ(files_deleted, 5);
@ -883,8 +941,9 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
// Create 4 files in L0
for (char v = 'a'; v <= 'd'; v++) {
if (v == 'c') {
// Maximize the change that the last log file will be preserved in trash
// before restarting the DB.
// Maximize the chance that the last log file will be preserved in trash
// before restarting the DB. (Enable slow deletion but at a very slow
// deletion rate)
// We have to set this on the 2nd to last file for it to delay deletion
// on the last file. (Quirk of DeleteScheduler::BackgroundEmptyTrash())
options.sst_file_manager->SetDeleteRateBytesPerSecond(1);
@ -1902,6 +1961,24 @@ TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
ASSERT_EQ(files_deleted, 1);
Close();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
assert(arg);
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
++files_scheduled_to_delete;
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
const std::string* const file_path =
static_cast<const std::string*>(arg);
if (EndsWith(*file_path, ".blob")) {
files_deleted++;
}
});
ASSERT_OK(DestroyDB(dbname_, options));
ASSERT_EQ(files_scheduled_to_delete, 4);

View File

@ -1826,21 +1826,30 @@ TEST_F(DBTest, GetApproximateMemTableStats) {
uint64_t count;
uint64_t size;
// Because Random::GetTLSInstance() seed is reset in DBTestBase,
// this test is deterministic.
std::string start = Key(50);
std::string end = Key(60);
Range r(start, end);
db_->GetApproximateMemTableStats(r, &count, &size);
ASSERT_GT(count, 0);
ASSERT_LE(count, N);
ASSERT_GT(size, 6000);
ASSERT_LT(size, 204800);
// When actual count is <= 10, it returns that as the minimum
EXPECT_EQ(count, 10);
EXPECT_EQ(size, 10440);
start = Key(20);
end = Key(100);
r = Range(start, end);
db_->GetApproximateMemTableStats(r, &count, &size);
EXPECT_EQ(count, 72);
EXPECT_EQ(size, 75168);
start = Key(500);
end = Key(600);
r = Range(start, end);
db_->GetApproximateMemTableStats(r, &count, &size);
ASSERT_EQ(count, 0);
ASSERT_EQ(size, 0);
EXPECT_EQ(count, 0);
EXPECT_EQ(size, 0);
ASSERT_OK(Flush());
@ -1848,8 +1857,8 @@ TEST_F(DBTest, GetApproximateMemTableStats) {
end = Key(60);
r = Range(start, end);
db_->GetApproximateMemTableStats(r, &count, &size);
ASSERT_EQ(count, 0);
ASSERT_EQ(size, 0);
EXPECT_EQ(count, 0);
EXPECT_EQ(size, 0);
for (int i = 0; i < N; i++) {
ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024)));
@ -1857,10 +1866,11 @@ TEST_F(DBTest, GetApproximateMemTableStats) {
start = Key(100);
end = Key(1020);
// Actually 20 keys in the range ^^
r = Range(start, end);
db_->GetApproximateMemTableStats(r, &count, &size);
ASSERT_GT(count, 20);
ASSERT_GT(size, 6000);
EXPECT_EQ(count, 20);
EXPECT_EQ(size, 20880);
}
TEST_F(DBTest, ApproximateSizes) {
@ -5169,10 +5179,14 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
options.max_bytes_for_level_multiplier = 4;
options.max_background_compactions = 1;
options.num_levels = 5;
options.statistics = CreateDBStatistics();
options.compression_per_level.resize(3);
// No compression for L0
options.compression_per_level[0] = kNoCompression;
// No compression for the Ln where L0 is compacted to
options.compression_per_level[1] = kNoCompression;
// Snappy compression for Ln+1
options.compression_per_level[2] = kSnappyCompression;
OnFileDeletionListener* listener = new OnFileDeletionListener();
@ -5181,7 +5195,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
DestroyAndReopen(options);
// Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
// be compressed, so total data size should be more than 80K.
// be compressed, so there shouldn't be any compression.
for (int i = 0; i < 20; i++) {
ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
}
@ -5191,10 +5205,17 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
ASSERT_EQ(NumTableFilesAtLevel(1), 0);
ASSERT_EQ(NumTableFilesAtLevel(2), 0);
ASSERT_EQ(NumTableFilesAtLevel(3), 0);
// Assuming each files' metadata is at least 50 bytes/
ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4);
ASSERT_TRUE(NumTableFilesAtLevel(0) > 0 || NumTableFilesAtLevel(4) > 0);
// Insert 400KB. Some data will be compressed
// Verify there was no compression
auto num_block_compressed =
options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
ASSERT_EQ(num_block_compressed, 0);
// Insert 400KB; some files will end up in L3. According to the
// compression settings above for each level, there will be some compression.
ASSERT_OK(options.statistics->Reset());
ASSERT_EQ(num_block_compressed, 0);
for (int i = 21; i < 120; i++) {
ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
}
@ -5202,9 +5223,14 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_EQ(NumTableFilesAtLevel(1), 0);
ASSERT_EQ(NumTableFilesAtLevel(2), 0);
ASSERT_GE(NumTableFilesAtLevel(3), 1);
ASSERT_GE(NumTableFilesAtLevel(4), 1);
// Verify there was compression
num_block_compressed =
options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
ASSERT_GT(num_block_compressed, 0);
ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4),
120U * 4000U + 50U * 24);
// Make sure data in files in L3 is not compacted by removing all files
// in L4 and calculate number of rows
ASSERT_OK(dbfull()->SetOptions({
@ -5224,6 +5250,12 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
num_keys++;
}
ASSERT_OK(iter->status());
ASSERT_EQ(NumTableFilesAtLevel(1), 0);
ASSERT_EQ(NumTableFilesAtLevel(2), 0);
ASSERT_GE(NumTableFilesAtLevel(3), 1);
ASSERT_EQ(NumTableFilesAtLevel(4), 0);
ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
}

View File

@ -10,6 +10,7 @@
#include <atomic>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <memory>
#include "db/db_test_util.h"
@ -26,6 +27,7 @@
#include "rocksdb/utilities/replayer.h"
#include "rocksdb/wal_filter.h"
#include "test_util/testutil.h"
#include "util/defer.h"
#include "util/random.h"
#include "utilities/fault_injection_env.h"
@ -34,18 +36,6 @@ namespace ROCKSDB_NAMESPACE {
class DBTest2 : public DBTestBase {
public:
DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
std::vector<FileMetaData*> GetLevelFileMetadatas(int level, int cf = 0) {
VersionSet* const versions = dbfull()->GetVersionSet();
assert(versions);
ColumnFamilyData* const cfd =
versions->GetColumnFamilySet()->GetColumnFamily(cf);
assert(cfd);
Version* const current = cfd->current();
assert(current);
VersionStorageInfo* const storage_info = current->storage_info();
assert(storage_info);
return storage_info->LevelFiles(level);
}
};
TEST_F(DBTest2, OpenForReadOnly) {
@ -5595,32 +5585,45 @@ TEST_F(DBTest2, PrefixBloomFilteredOut) {
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
bbto.whole_key_filtering = false;
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
DestroyAndReopen(options);
// Construct two L1 files with keys:
// f1:[aaa1 ccc1] f2:[ddd0]
ASSERT_OK(Put("aaa1", ""));
ASSERT_OK(Put("ccc1", ""));
ASSERT_OK(Flush());
ASSERT_OK(Put("ddd0", ""));
ASSERT_OK(Flush());
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// This test is also the primary test for prefix_seek_opt_in_only
for (bool opt_in : {false, true}) {
options.prefix_seek_opt_in_only = opt_in;
DestroyAndReopen(options);
Iterator* iter = db_->NewIterator(ReadOptions());
ASSERT_OK(iter->status());
// Construct two L1 files with keys:
// f1:[aaa1 ccc1] f2:[ddd0]
ASSERT_OK(Put("aaa1", ""));
ASSERT_OK(Put("ccc1", ""));
ASSERT_OK(Flush());
ASSERT_OK(Put("ddd0", ""));
ASSERT_OK(Flush());
CompactRangeOptions cro;
cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
// Bloom filter is filtered out by f1.
// This is just one of several valid positions following the contract.
// Positioning to ccc1 or ddd0 is also valid. This is just to validate
// the behavior of the current implementation. If the underlying implementation
// changes, the test might fail here.
iter->Seek("bbb1");
ASSERT_OK(iter->status());
ASSERT_FALSE(iter->Valid());
ReadOptions ropts;
for (bool same : {false, true}) {
ropts.prefix_same_as_start = same;
std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
ASSERT_OK(iter->status());
delete iter;
iter->Seek("bbb1");
ASSERT_OK(iter->status());
if (opt_in && !same) {
// Unbounded total order seek
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(iter->key(), "ccc1");
} else {
// Bloom filter is filtered out by f1. When same == false, this is just
// one valid position following the contract. Positioning to ccc1 or ddd0
// is also valid. This is just to validate the behavior of the current
// implementation. If the underlying implementation changes, the test might
// fail here.
ASSERT_FALSE(iter->Valid());
}
}
}
}
TEST_F(DBTest2, RowCacheSnapshot) {
@ -5985,6 +5988,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) {
// create a DB with block prefix index
BlockBasedTableOptions table_options;
Options options = CurrentOptions();
options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
// Sometimes filter is checked based on upper bound. Assert counters
// for that case. Otherwise, only check data correctness.
@ -6544,6 +6548,235 @@ TEST_P(RenameCurrentTest, Compaction) {
ASSERT_EQ("d_value", Get("d"));
}
TEST_F(DBTest2, VariousFileTemperatures) {
constexpr size_t kNumberFileTypes = static_cast<size_t>(kBlobFile) + 1U;
struct MyTestFS : public FileTemperatureTestFS {
explicit MyTestFS(const std::shared_ptr<FileSystem>& fs)
: FileTemperatureTestFS(fs) {
Reset();
}
IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
std::unique_ptr<FSWritableFile>* result,
IODebugContext* dbg) override {
IOStatus ios =
FileTemperatureTestFS::NewWritableFile(fname, opts, result, dbg);
if (ios.ok()) {
uint64_t number;
FileType type;
if (ParseFileName(GetFileName(fname), &number, "LOG", &type)) {
if (type == kTableFile) {
// Not checked here
} else if (type == kWalFile) {
if (opts.temperature != expected_wal_temperature) {
std::cerr << "Attempt to open " << fname << " with temperature "
<< temperature_to_string[opts.temperature]
<< " rather than "
<< temperature_to_string[expected_wal_temperature]
<< std::endl;
assert(false);
}
} else if (type == kDescriptorFile) {
if (opts.temperature != expected_manifest_temperature) {
std::cerr << "Attempt to open " << fname << " with temperature "
<< temperature_to_string[opts.temperature]
<< " rather than "
<< temperature_to_string[expected_manifest_temperature]
<< std::endl;
assert(false);
}
} else if (opts.temperature != expected_other_metadata_temperature) {
std::cerr << "Attempt to open " << fname << " with temperature "
<< temperature_to_string[opts.temperature]
<< " rather than "
<< temperature_to_string[expected_other_metadata_temperature]
<< std::endl;
assert(false);
}
UpdateCount(type, 1);
}
}
return ios;
}
IOStatus RenameFile(const std::string& src, const std::string& dst,
const IOOptions& options,
IODebugContext* dbg) override {
IOStatus ios = FileTemperatureTestFS::RenameFile(src, dst, options, dbg);
if (ios.ok()) {
uint64_t number;
FileType src_type;
FileType dst_type;
assert(ParseFileName(GetFileName(src), &number, "LOG", &src_type));
assert(ParseFileName(GetFileName(dst), &number, "LOG", &dst_type));
UpdateCount(src_type, -1);
UpdateCount(dst_type, 1);
}
return ios;
}
void UpdateCount(FileType type, int delta) {
size_t i = static_cast<size_t>(type);
assert(i < kNumberFileTypes);
counts[i].FetchAddRelaxed(delta);
}
std::map<FileType, size_t> PopCounts() {
std::map<FileType, size_t> ret;
for (size_t i = 0; i < kNumberFileTypes; ++i) {
int c = counts[i].ExchangeRelaxed(0);
if (c > 0) {
ret[static_cast<FileType>(i)] = c;
}
}
return ret;
}
FileOptions OptimizeForLogWrite(
const FileOptions& file_options,
const DBOptions& /*db_options*/) const override {
FileOptions opts = file_options;
if (optimize_wal_temperature != Temperature::kUnknown) {
opts.temperature = optimize_wal_temperature;
}
return opts;
}
FileOptions OptimizeForManifestWrite(
const FileOptions& file_options) const override {
FileOptions opts = file_options;
if (optimize_manifest_temperature != Temperature::kUnknown) {
opts.temperature = optimize_manifest_temperature;
}
return opts;
}
void Reset() {
optimize_manifest_temperature = Temperature::kUnknown;
optimize_wal_temperature = Temperature::kUnknown;
expected_manifest_temperature = Temperature::kUnknown;
expected_other_metadata_temperature = Temperature::kUnknown;
expected_wal_temperature = Temperature::kUnknown;
for (auto& c : counts) {
c.StoreRelaxed(0);
}
}
Temperature optimize_manifest_temperature;
Temperature optimize_wal_temperature;
Temperature expected_manifest_temperature;
Temperature expected_other_metadata_temperature;
Temperature expected_wal_temperature;
std::array<RelaxedAtomic<int>, kNumberFileTypes> counts;
};
// We don't have enough non-unknown temps to confidently distinguish that
// a specific setting caused a specific outcome, in a single run. This is a
// reasonable work-around without blowing up test time. Only returns
// non-unknown temperatures.
auto RandomTemp = [] {
static std::vector<Temperature> temps = {
Temperature::kHot, Temperature::kWarm, Temperature::kCold};
return temps[Random::GetTLSInstance()->Uniform(
static_cast<int>(temps.size()))];
};
auto test_fs = std::make_shared<MyTestFS>(env_->GetFileSystem());
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
for (bool use_optimize : {false, true}) {
std::cerr << "use_optimize: " << std::to_string(use_optimize) << std::endl;
for (bool use_temp_options : {false, true}) {
std::cerr << "use_temp_options: " << std::to_string(use_temp_options)
<< std::endl;
Options options = CurrentOptions();
// Currently require for last level temperature
options.compaction_style = kCompactionStyleUniversal;
options.env = env.get();
test_fs->Reset();
if (use_optimize) {
test_fs->optimize_manifest_temperature = RandomTemp();
test_fs->expected_manifest_temperature =
test_fs->optimize_manifest_temperature;
test_fs->optimize_wal_temperature = RandomTemp();
test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature;
}
if (use_temp_options) {
options.metadata_write_temperature = RandomTemp();
test_fs->expected_manifest_temperature =
options.metadata_write_temperature;
test_fs->expected_other_metadata_temperature =
options.metadata_write_temperature;
options.wal_write_temperature = RandomTemp();
test_fs->expected_wal_temperature = options.wal_write_temperature;
options.last_level_temperature = RandomTemp();
options.default_write_temperature = RandomTemp();
}
DestroyAndReopen(options);
Defer closer([&] { Close(); });
using FTC = std::map<FileType, size_t>;
// Files on DB startup
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1},
{kDescriptorFile, 2},
{kCurrentFile, 2},
{kIdentityFile, 1},
{kOptionsFile, 1}}));
// Temperature count map
using TCM = std::map<Temperature, size_t>;
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), TCM({}));
ASSERT_OK(Put("foo", "1"));
ASSERT_OK(Put("bar", "1"));
ASSERT_OK(Flush());
ASSERT_OK(Put("foo", "2"));
ASSERT_OK(Put("bar", "2"));
ASSERT_OK(Flush());
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(),
TCM({{options.default_write_temperature, 2}}));
ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(),
TCM({{options.last_level_temperature, 1}}));
ASSERT_OK(Put("foo", "3"));
ASSERT_OK(Put("bar", "3"));
ASSERT_OK(Flush());
// Just in memtable/WAL
ASSERT_OK(Put("dog", "3"));
{
TCM expected;
expected[options.default_write_temperature] += 1;
expected[options.last_level_temperature] += 1;
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), expected);
}
// New files during operation
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 3}, {kTableFile, 4}}));
Reopen(options);
// New files during re-open/recovery
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1},
{kTableFile, 1},
{kDescriptorFile, 1},
{kCurrentFile, 1},
{kOptionsFile, 1}}));
Destroy(options);
}
}
}
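As a user-facing summary of the options this test wires through MyTestFS, a hedged configuration sketch (the temperature choices and path are illustrative):
Options options;
options.create_if_missing = true;
// Per the comment above, universal compaction is currently required for the
// last-level temperature handling exercised here.
options.compaction_style = kCompactionStyleUniversal;
options.metadata_write_temperature = Temperature::kWarm;   // MANIFEST/CURRENT/OPTIONS/IDENTITY
options.wal_write_temperature = Temperature::kHot;         // WAL files
options.default_write_temperature = Temperature::kHot;     // newly written SSTs
options.last_level_temperature = Temperature::kCold;       // SSTs compacted to the last level
DB* db = nullptr;
Status s = DB::Open(options, "/tmp/temperature_demo", &db);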
TEST_F(DBTest2, LastLevelTemperature) {
class TestListener : public EventListener {
public:

View File

@ -366,6 +366,11 @@ Options DBTestBase::GetOptions(
table_options.block_cache = NewLRUCache(/* too small */ 1);
}
// Test anticipated new default as much as reasonably possible (and remove
// this code when obsolete)
assert(!table_options.decouple_partitioned_filters);
table_options.decouple_partitioned_filters = true;
bool can_allow_mmap = IsMemoryMappedAccessSupported();
switch (option_config) {
case kHashSkipList:
@ -1258,6 +1263,20 @@ Status DBTestBase::CountFiles(size_t* count) {
return Status::OK();
}
std::vector<FileMetaData*> DBTestBase::GetLevelFileMetadatas(int level,
int cf) {
VersionSet* const versions = dbfull()->GetVersionSet();
assert(versions);
ColumnFamilyData* const cfd =
versions->GetColumnFamilySet()->GetColumnFamily(cf);
assert(cfd);
Version* const current = cfd->current();
assert(current);
VersionStorageInfo* const storage_info = current->storage_info();
assert(storage_info);
return storage_info->LevelFiles(level);
}
Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf,
uint64_t* size) {
Range r(start, limit);

View File

@ -831,6 +831,15 @@ class FileTemperatureTestFS : public FileSystemWrapper {
return count;
}
std::map<Temperature, size_t> CountCurrentSstFilesByTemp() {
MutexLock lock(&mu_);
std::map<Temperature, size_t> ret;
for (const auto& e : current_sst_file_temperatures_) {
ret[e.second]++;
}
return ret;
}
void OverrideSstFileTemperature(uint64_t number, Temperature temp) {
MutexLock lock(&mu_);
current_sst_file_temperatures_[number] = temp;
@ -842,7 +851,7 @@ class FileTemperatureTestFS : public FileSystemWrapper {
requested_sst_file_temperatures_;
std::map<uint64_t, Temperature> current_sst_file_temperatures_;
std::string GetFileName(const std::string& fname) {
static std::string GetFileName(const std::string& fname) {
auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
// workaround only for Windows that the file path could contain both Windows
// FilePathSeparator and '/'
@ -1264,6 +1273,8 @@ class DBTestBase : public testing::Test {
Status CountFiles(size_t* count);
std::vector<FileMetaData*> GetLevelFileMetadatas(int level, int cf = 0);
Status Size(const Slice& start, const Slice& limit, uint64_t* size) {
return Size(start, limit, 0, size);
}

View File

@ -213,7 +213,6 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
options.num_levels = num_levels_;
options.write_buffer_size = 105 << 10; // 105KB
options.arena_block_size = 4 << 10;
options.target_file_size_base = 32 << 10; // 32KB
// trigger compaction if there are >= 4 files
options.level0_file_num_compaction_trigger = 4;
KeepFilterFactory* filter = new KeepFilterFactory(true);

View File

@ -1472,6 +1472,126 @@ TEST_F(DBWALTest, SyncMultipleLogs) {
ASSERT_OK(dbfull()->SyncWAL());
}
TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
Options options = CurrentOptions();
options.max_write_buffer_number = 5;
options.track_and_verify_wals_in_manifest = true;
options.max_bgerror_resume_count = 0; // manual resume
options.recycle_log_file_num = 3;
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
// Disable truncating recycled WALs to new size in posix env
// (approximating a crash)
SyncPoint::GetInstance()->SetCallBack(
"PosixWritableFile::Close",
[](void* arg) { *(static_cast<size_t*>(arg)) = 0; });
SyncPoint::GetInstance()->EnableProcessing();
// Re-open with desired options
DestroyAndReopen(options);
Defer closer([this]() { Close(); });
// Ensure WAL recycling wasn't sanitized away
ASSERT_EQ(db_->GetOptions().recycle_log_file_num,
options.recycle_log_file_num);
// Prepare external files for later ingestion
std::string sst_files_dir = dbname_ + "/sst_files/";
ASSERT_OK(DestroyDir(env_, sst_files_dir));
ASSERT_OK(env_->CreateDir(sst_files_dir));
std::string external_file1 = sst_files_dir + "file1.sst";
{
SstFileWriter sst_file_writer(EnvOptions(), options);
ASSERT_OK(sst_file_writer.Open(external_file1));
ASSERT_OK(sst_file_writer.Put("external1", "ex1"));
ExternalSstFileInfo file_info;
ASSERT_OK(sst_file_writer.Finish(&file_info));
}
std::string external_file2 = sst_files_dir + "file2.sst";
{
SstFileWriter sst_file_writer(EnvOptions(), options);
ASSERT_OK(sst_file_writer.Open(external_file2));
ASSERT_OK(sst_file_writer.Put("external2", "ex2"));
ExternalSstFileInfo file_info;
ASSERT_OK(sst_file_writer.Finish(&file_info));
}
// Populate some WALs to be recycled such that there will be extra data
// from an old incarnation of the WAL on recovery
ASSERT_OK(db_->PauseBackgroundWork());
ASSERT_OK(Put("ignore1", Random::GetTLSInstance()->RandomString(500)));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(Put("ignore2", Random::GetTLSInstance()->RandomString(500)));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(db_->ContinueBackgroundWork());
ASSERT_OK(Flush());
ASSERT_OK(Put("ignore3", Random::GetTLSInstance()->RandomString(500)));
ASSERT_OK(Flush());
// Verify expected log files (still there for recycling)
std::vector<FileAttributes> files;
int log_count = 0;
ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files));
for (const auto& f : files) {
if (EndsWith(f.name, ".log")) {
EXPECT_GT(f.size_bytes, 500);
++log_count;
}
}
EXPECT_EQ(log_count, 3);
// (Re-used recipe) Generate two inactive WALs and one active WAL, with a
// gap in sequence numbers to interfere with recovery
ASSERT_OK(db_->PauseBackgroundWork());
ASSERT_OK(Put("key1", "val1"));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(Put("key2", "val2"));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
// Need a gap in sequence numbers, so e.g. ingest external file
// with an open snapshot
{
ManagedSnapshot snapshot(db_);
ASSERT_OK(
db_->IngestExternalFile({external_file1}, IngestExternalFileOptions()));
}
ASSERT_OK(Put("key3", "val3"));
ASSERT_OK(db_->SyncWAL());
// Need an SST file that is logically after that WAL, so that dropping WAL
// data is not a valid point in time.
{
ManagedSnapshot snapshot(db_);
ASSERT_OK(
db_->IngestExternalFile({external_file2}, IngestExternalFileOptions()));
}
// Approximate a crash, with respect to recycled WAL data extending past
// the end of the current WAL data (see SyncPoint callback above)
Close();
// Verify recycled log files haven't been truncated
files.clear();
log_count = 0;
ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files));
for (const auto& f : files) {
if (EndsWith(f.name, ".log")) {
EXPECT_GT(f.size_bytes, 500);
++log_count;
}
}
EXPECT_EQ(log_count, 3);
// Verify no data loss after reopen.
Reopen(options);
EXPECT_EQ("val1", Get("key1"));
EXPECT_EQ("val2", Get("key2")); // Passes because of adjacent seqnos
EXPECT_EQ("ex1", Get("external1"));
EXPECT_EQ("val3", Get("key3")); // <- ONLY FAILURE! (Not a point in time)
EXPECT_EQ("ex2", Get("external2"));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(DBWALTest, SyncWalPartialFailure) {
class MyTestFileSystem : public FileSystemWrapper {
public:
@ -1532,7 +1652,7 @@ TEST_F(DBWALTest, SyncWalPartialFailure) {
// * one inactive WAL, not synced, and
// * one active WAL, not synced
// with a single thread, to exercise as much logic as we reasonably can.
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->PauseBackgroundWork());
ASSERT_OK(db_->PauseBackgroundWork());
ASSERT_OK(Put("key1", "val1"));
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
ASSERT_OK(db_->SyncWAL());
@ -2811,6 +2931,29 @@ TEST_F(DBWALTest, RecoveryFlushSwitchWALOnEmptyMemtable) {
ASSERT_EQ("new_v", Get("k"));
Destroy(options);
}
TEST_F(DBWALTest, WALWriteErrorNoRecovery) {
Options options = CurrentOptions();
auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
options.env = fault_fs_env.get();
options.manual_wal_flush = true;
DestroyAndReopen(options);
fault_fs->SetThreadLocalErrorContext(
FaultInjectionIOType::kWrite, 7 /* seed*/, 1 /* one_in */,
true /* retryable */, false /* has_data_loss*/);
fault_fs->EnableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);
ASSERT_OK(Put("k", "v"));
Status s;
s = db_->FlushWAL(false);
ASSERT_TRUE(s.IsIOError());
s = dbfull()->TEST_GetBGError();
ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
ASSERT_FALSE(dbfull()->TEST_IsRecoveryInProgress());
fault_fs->DisableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);
Destroy(options);
}
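The behavior asserted above has a practical consequence for callers using manual_wal_flush: a failed WAL write surfaces as a fatal background error with auto-recovery disabled, so the typical reaction is to close and reopen the DB. A hedged sketch (db, options, and dbname are assumed to be in scope):
Status s = db->FlushWAL(/*sync=*/false);
if (!s.ok()) {
  // Subsequent writes keep returning the fatal background error, so recover
  // by reopening; recovery then replays whatever was durably written to the WAL.
  delete db;
  db = nullptr;
  s = DB::Open(options, dbname, &db);
}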
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

View File

@ -172,6 +172,70 @@ TEST_F(DBBasicTestWithTimestamp, MixedCfs) {
Close();
}
TEST_F(DBBasicTestWithTimestamp, MultiGetMultipleCfs) {
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
options.avoid_flush_during_shutdown = true;
options.comparator = &test_cmp;
DestroyAndReopen(options);
Options options1 = CurrentOptions();
options1.env = env_;
options1.comparator = &test_cmp;
ColumnFamilyHandle* handle = nullptr;
Status s = db_->CreateColumnFamily(options1, "data", &handle);
ASSERT_OK(s);
std::string ts = Timestamp(1, 0);
WriteBatch wb(0, 0, 0, kTimestampSize);
ASSERT_OK(wb.Put("a", "value"));
ASSERT_OK(wb.Put(handle, "a", "value"));
const auto ts_sz_func = [kTimestampSize](uint32_t /*cf_id*/) {
return kTimestampSize;
};
ASSERT_OK(wb.UpdateTimestamps(ts, ts_sz_func));
ASSERT_OK(db_->Write(WriteOptions(), &wb));
int num_keys = 2;
std::vector<Slice> keys;
std::vector<std::string> expected_values;
for (int i = 0; i < num_keys; i++) {
keys.push_back("a");
expected_values.push_back("value");
}
std::vector<ColumnFamilyHandle*> handles;
handles.push_back(db_->DefaultColumnFamily());
handles.push_back(handle);
{
Slice read_ts_slice(ts);
ReadOptions read_opts;
read_opts.timestamp = &read_ts_slice;
std::vector<PinnableSlice> values;
values.resize(num_keys);
std::vector<Status> statuses;
statuses.resize(num_keys);
std::vector<std::string> timestamps;
timestamps.resize(num_keys);
db_->MultiGet(read_opts, num_keys, handles.data(), keys.data(),
values.data(), timestamps.data(), statuses.data());
for (int i = 0; i < num_keys; i++) {
ASSERT_OK(statuses[i]);
ASSERT_EQ(expected_values[i], values[i].ToString());
ASSERT_EQ(ts, timestamps[i]);
}
}
delete handle;
Close();
}
TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) {
Options options = CurrentOptions();
options.env = env_;
@ -768,6 +832,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) {
TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
Options options = CurrentOptions();
options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
options.env = env_;
options.create_if_missing = true;
options.prefix_extractor.reset(NewFixedPrefixTransform(3));
@ -945,6 +1010,7 @@ TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) {
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
DestroyAndReopen(options);
const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),

View File

@ -272,11 +272,23 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
void IterKey::EnlargeBuffer(size_t key_size) {
// If size is smaller than buffer size, continue using current buffer,
// or the static allocated one, as default
// or the inline one, as default
assert(key_size > buf_size_);
// Need to enlarge the buffer.
ResetBuffer();
buf_ = new char[key_size];
buf_size_ = key_size;
}
void IterKey::EnlargeSecondaryBufferIfNeeded(size_t key_size) {
// If size is smaller than buffer size, continue using current buffer,
// or the inline one, as default
if (key_size <= secondary_buf_size_) {
return;
}
// Need to enlarge the secondary buffer.
ResetSecondaryBuffer();
secondary_buf_ = new char[key_size];
secondary_buf_size_ = key_size;
}
} // namespace ROCKSDB_NAMESPACE

View File

@ -10,6 +10,7 @@
#pragma once
#include <stdio.h>
#include <array>
#include <memory>
#include <optional>
#include <string>
@ -330,17 +331,16 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
// output : <user_provided_key>
inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
size_t ts_sz) {
Slice ret = internal_key;
ret.remove_suffix(kNumInternalBytes + ts_sz);
return ret;
assert(internal_key.size() >= kNumInternalBytes + ts_sz);
return Slice(internal_key.data(),
internal_key.size() - (kNumInternalBytes + ts_sz));
}
// input [user key]: <user_provided_key | ts>
// output: <user_provided_key>
inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
Slice ret = user_key;
ret.remove_suffix(ts_sz);
return ret;
assert(user_key.size() >= ts_sz);
return Slice(user_key.data(), user_key.size() - ts_sz);
}
// input [user key]: <user_provided_key | ts>
@ -563,18 +563,28 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
// allocation for smaller keys.
// 3. It tracks user key or internal key, and allow conversion between them.
class IterKey {
static constexpr size_t kInlineBufferSize = 39;
// This is only used by user-defined timestamps in MemTable only feature,
// which only supports uint64_t timestamps.
static constexpr char kTsMin[] = "\x00\x00\x00\x00\x00\x00\x00\x00";
public:
IterKey()
: buf_(space_),
key_(buf_),
key_size_(0),
buf_size_(sizeof(space_)),
is_user_key_(true) {}
buf_size_(kInlineBufferSize),
is_user_key_(true),
secondary_buf_(space_for_secondary_buf_),
secondary_buf_size_(kInlineBufferSize) {}
// No copying allowed
IterKey(const IterKey&) = delete;
void operator=(const IterKey&) = delete;
~IterKey() { ResetBuffer(); }
~IterKey() {
ResetBuffer();
ResetSecondaryBuffer();
}
// The bool will be picked up by the next calls to SetKey
void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
@ -642,13 +652,15 @@ class IterKey {
const char* non_shared_data,
const size_t non_shared_len,
const size_t ts_sz) {
std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
std::string key_with_ts;
std::vector<Slice> key_parts_with_ts;
// This function is only used by the UDT in memtable feature, which only
// supports built-in comparators with uint64 timestamps.
assert(ts_sz == sizeof(uint64_t));
size_t next_key_slice_index = 0;
if (IsUserKey()) {
key_parts_with_ts = {Slice(key_, shared_len),
Slice(non_shared_data, non_shared_len),
Slice(kTsMin)};
key_slices_[next_key_slice_index++] = Slice(key_, shared_len);
key_slices_[next_key_slice_index++] =
Slice(non_shared_data, non_shared_len);
key_slices_[next_key_slice_index++] = Slice(kTsMin, ts_sz);
} else {
assert(shared_len + non_shared_len >= kNumInternalBytes);
// Invariant: shared_user_key_len + shared_internal_bytes_len = shared_len
@ -665,30 +677,46 @@ class IterKey {
// One Slice among the three Slices will get split into two Slices, plus
// a timestamp slice.
key_parts_with_ts.reserve(5);
bool ts_added = false;
// Add slice parts and find the right location to add the min timestamp.
MaybeAddKeyPartsWithTimestamp(
key_, shared_user_key_len,
shared_internal_bytes_len + non_shared_len < kNumInternalBytes,
shared_len + non_shared_len - kNumInternalBytes, kTsMin,
key_parts_with_ts, &ts_added);
shared_len + non_shared_len - kNumInternalBytes, ts_sz,
&next_key_slice_index, &ts_added);
MaybeAddKeyPartsWithTimestamp(
key_ + user_key_len, shared_internal_bytes_len,
non_shared_len < kNumInternalBytes,
shared_internal_bytes_len + non_shared_len - kNumInternalBytes,
kTsMin, key_parts_with_ts, &ts_added);
shared_internal_bytes_len + non_shared_len - kNumInternalBytes, ts_sz,
&next_key_slice_index, &ts_added);
MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len,
non_shared_len >= kNumInternalBytes,
non_shared_len - kNumInternalBytes, kTsMin,
key_parts_with_ts, &ts_added);
non_shared_len - kNumInternalBytes, ts_sz,
&next_key_slice_index, &ts_added);
assert(ts_added);
}
SetKeyImpl(next_key_slice_index,
/* total_bytes= */ shared_len + non_shared_len + ts_sz);
}
Slice new_key(SliceParts(&key_parts_with_ts.front(),
static_cast<int>(key_parts_with_ts.size())),
&key_with_ts);
SetKey(new_key);
Slice SetKeyWithPaddedMinTimestamp(const Slice& key, size_t ts_sz) {
// This function is only used by the UDT in memtable feature, which only
// supports built-in comparators with uint64 timestamps.
assert(ts_sz == sizeof(uint64_t));
size_t num_key_slices = 0;
if (is_user_key_) {
key_slices_[0] = key;
key_slices_[1] = Slice(kTsMin, ts_sz);
num_key_slices = 2;
} else {
assert(key.size() >= kNumInternalBytes);
size_t user_key_size = key.size() - kNumInternalBytes;
key_slices_[0] = Slice(key.data(), user_key_size);
key_slices_[1] = Slice(kTsMin, ts_sz);
key_slices_[2] = Slice(key.data() + user_key_size, kNumInternalBytes);
num_key_slices = 3;
}
return SetKeyImpl(num_key_slices, key.size() + ts_sz);
}
Slice SetKey(const Slice& key, bool copy = true) {
@ -719,15 +747,6 @@ class IterKey {
return Slice(key_, key_n);
}
// Copy the key into IterKey own buf_
void OwnKey() {
assert(IsKeyPinned() == true);
Reserve(key_size_);
memcpy(buf_, key_, key_size_);
key_ = buf_;
}
// Update the sequence number in the internal key. Guarantees not to
// invalidate slices to the key (and the user key).
void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
@ -739,10 +758,15 @@ class IterKey {
ts->size());
}
uint64_t newval = (seq << 8) | t;
EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
if (key_ == buf_) {
EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
} else {
assert(key_ == secondary_buf_);
EncodeFixed64(&secondary_buf_[key_size_ - kNumInternalBytes], newval);
}
}
bool IsKeyPinned() const { return (key_ != buf_); }
bool IsKeyPinned() const { return key_ != buf_ && key_ != secondary_buf_; }
// If `ts` is provided, user_key should not contain timestamp,
// and `ts` is appended after user_key.
@ -807,8 +831,24 @@ class IterKey {
const char* key_;
size_t key_size_;
size_t buf_size_;
char space_[39]; // Avoid allocation for short keys
char space_[kInlineBufferSize]; // Avoid allocation for short keys
bool is_user_key_;
// The variables below are only used by the user-defined timestamps in
// MemTable only feature for iterating keys in an index block or a data block.
//
// We alternate between buf_ and secondary_buf_ to hold the key, and key_ is
// updated to point to whichever one is in use. This avoids an extra copy when
// we need to reuse some shared bytes from the previous key (delta encoding)
// while padding a min timestamp at the right location.
char space_for_secondary_buf_[kInlineBufferSize]; // Avoid allocation for
// short keys
char* secondary_buf_;
size_t secondary_buf_size_;
// Used to track the pieces that together make up the whole key. These pieces
// are then copied in order into either buf_ or secondary_buf_, depending on
// where the previous key is held.
std::array<Slice, 5> key_slices_;
// End of variables used by user-defined timestamps in MemTable only feature.
Slice SetKeyImpl(const Slice& key, bool copy) {
size_t size = key.size();
@ -825,18 +865,64 @@ class IterKey {
return Slice(key_, key_size_);
}
Slice SetKeyImpl(size_t num_key_slices, size_t total_bytes) {
assert(num_key_slices <= 5);
char* buf_start = nullptr;
if (key_ == buf_) {
// If the previous key is in buf_, we copy key_slices_ in order into
// secondary_buf_.
EnlargeSecondaryBufferIfNeeded(total_bytes);
buf_start = secondary_buf_;
key_ = secondary_buf_;
} else {
// Copy key_slices_ in order into buf_.
EnlargeBufferIfNeeded(total_bytes);
buf_start = buf_;
key_ = buf_;
}
#ifndef NDEBUG
size_t actual_total_bytes = 0;
#endif // NDEBUG
for (size_t i = 0; i < num_key_slices; i++) {
size_t key_slice_size = key_slices_[i].size();
memcpy(buf_start, key_slices_[i].data(), key_slice_size);
buf_start += key_slice_size;
#ifndef NDEBUG
actual_total_bytes += key_slice_size;
#endif // NDEBUG
}
#ifndef NDEBUG
assert(actual_total_bytes == total_bytes);
#endif // NDEBUG
key_size_ = total_bytes;
return Slice(key_, key_size_);
}
void ResetBuffer() {
if (key_ == buf_) {
key_size_ = 0;
}
if (buf_ != space_) {
delete[] buf_;
buf_ = space_;
}
buf_size_ = sizeof(space_);
key_size_ = 0;
buf_size_ = kInlineBufferSize;
}
void ResetSecondaryBuffer() {
if (key_ == secondary_buf_) {
key_size_ = 0;
}
if (secondary_buf_ != space_for_secondary_buf_) {
delete[] secondary_buf_;
secondary_buf_ = space_for_secondary_buf_;
}
secondary_buf_size_ = kInlineBufferSize;
}
// Enlarge the buffer size if needed based on key_size.
// By default, static allocated buffer is used. Once there is a key
// larger than the static allocated buffer, another buffer is dynamically
// By default, inline buffer is used. Once there is a key
// larger than the inline buffer, another buffer is dynamically
// allocated, until a larger key buffer is requested. In that case, we
// reallocate buffer and delete the old one.
void EnlargeBufferIfNeeded(size_t key_size) {
@ -847,23 +933,27 @@ class IterKey {
}
}
void EnlargeSecondaryBufferIfNeeded(size_t key_size);
void EnlargeBuffer(size_t key_size);
void MaybeAddKeyPartsWithTimestamp(const char* slice_data,
const size_t slice_sz, bool add_timestamp,
const size_t left_sz,
const std::string& min_timestamp,
std::vector<Slice>& key_parts,
const size_t left_sz, const size_t ts_sz,
size_t* next_key_slice_idx,
bool* ts_added) {
assert(next_key_slice_idx);
if (add_timestamp && !*ts_added) {
assert(slice_sz >= left_sz);
key_parts.emplace_back(slice_data, left_sz);
key_parts.emplace_back(min_timestamp);
key_parts.emplace_back(slice_data + left_sz, slice_sz - left_sz);
key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, left_sz);
key_slices_[(*next_key_slice_idx)++] = Slice(kTsMin, ts_sz);
key_slices_[(*next_key_slice_idx)++] =
Slice(slice_data + left_sz, slice_sz - left_sz);
*ts_added = true;
} else {
key_parts.emplace_back(slice_data, slice_sz);
key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, slice_sz);
}
assert(*next_key_slice_idx <= 5);
}
};
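To make the buf_/secondary_buf_ alternation described above concrete, here is a standalone illustrative sketch of the same idea (not the IterKey code itself; the fixed 64-byte buffers and the DecodeNext name are assumptions):
// Assemble "shared prefix from the previous key + new suffix + 8-byte min
// timestamp" directly into whichever buffer does not currently hold the
// previous key, so the shared bytes never need an intermediate copy.
#include <cstddef>
#include <cstring>

struct TwoBufferKey {
  char a[64] = {};
  char b[64] = {};
  char* cur = a;       // buffer holding the most recently decoded key
  size_t cur_len = 0;

  // shared: prefix bytes reused from the previous key;
  // suffix: non-shared bytes read from the block.
  void DecodeNext(size_t shared, const char* suffix, size_t suffix_len) {
    static const char kTsMin[8] = {};   // padded min timestamp
    char* dst = (cur == a) ? b : a;     // write into the other buffer
    std::memcpy(dst, cur, shared);      // prefix is still readable from cur
    std::memcpy(dst + shared, suffix, suffix_len);
    std::memcpy(dst + shared + suffix_len, kTsMin, sizeof(kTsMin));
    cur = dst;                          // flip buffers
    cur_len = shared + suffix_len + sizeof(kTsMin);
  }
};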
@ -937,22 +1027,13 @@ struct RangeTombstone {
// User-defined timestamp is enabled, `sk` and `ek` should be user key
// with timestamp, `ts` will replace the timestamps in `sk` and
// `ek`.
// When `logical_strip_timestamp` is true, the timestamps in `sk` and `ek`
// will be replaced with min timestamp.
RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts,
bool logical_strip_timestamp)
: seq_(sn) {
RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts) : seq_(sn) {
const size_t ts_sz = ts.size();
assert(ts_sz > 0);
pinned_start_key_.reserve(sk.size());
pinned_end_key_.reserve(ek.size());
if (logical_strip_timestamp) {
AppendUserKeyWithMinTimestamp(&pinned_start_key_, sk, ts_sz);
AppendUserKeyWithMinTimestamp(&pinned_end_key_, ek, ts_sz);
} else {
AppendUserKeyWithDifferentTimestamp(&pinned_start_key_, sk, ts);
AppendUserKeyWithDifferentTimestamp(&pinned_end_key_, ek, ts);
}
AppendUserKeyWithDifferentTimestamp(&pinned_start_key_, sk, ts);
AppendUserKeyWithDifferentTimestamp(&pinned_end_key_, ek, ts);
start_key_ = pinned_start_key_;
end_key_ = pinned_end_key_;
ts_ = Slice(pinned_start_key_.data() + sk.size() - ts_sz, ts_sz);

View File

@ -381,7 +381,7 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
// BackgroundErrorReason reason) will be called to handle other error cases
// such as delegating to SstFileManager to handle no space error.
void ErrorHandler::SetBGError(const Status& bg_status,
BackgroundErrorReason reason) {
BackgroundErrorReason reason, bool wal_related) {
db_mutex_->AssertHeld();
Status tmp_status = bg_status;
IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
@ -389,8 +389,8 @@ void ErrorHandler::SetBGError(const Status& bg_status,
if (bg_io_err.ok()) {
return;
}
ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s",
bg_io_err.ToString().c_str());
ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s, reason %d",
bg_io_err.ToString().c_str(), static_cast<int>(reason));
RecordStats({ERROR_HANDLER_BG_ERROR_COUNT, ERROR_HANDLER_BG_IO_ERROR_COUNT},
{} /* int_histograms */);
@ -412,6 +412,31 @@ void ErrorHandler::SetBGError(const Status& bg_status,
recover_context_ = context;
return;
}
if (wal_related) {
assert(reason == BackgroundErrorReason::kWriteCallback ||
reason == BackgroundErrorReason::kMemTable ||
reason == BackgroundErrorReason::kFlush);
}
if (db_options_.manual_wal_flush && wal_related && bg_io_err.IsIOError()) {
// With manual_wal_flush, a WAL write failure can drop buffered WAL writes.
// Memtables and WAL then become inconsistent. A successful memtable flush
// on one CF can cause CFs to be inconsistent upon restart. Before we fix
// the bug in auto recovery from WAL write failures that can flush one CF
// at a time, we set the error severity to fatal to disallow auto recovery.
// TODO: remove parameter `wal_related` once we can automatically recover
// from WAL write failures.
bool auto_recovery = false;
Status bg_err(new_bg_io_err, Status::Severity::kFatalError);
CheckAndSetRecoveryAndBGError(bg_err);
ROCKS_LOG_WARN(db_options_.info_log,
"ErrorHandler: A potentially WAL error happened, set "
"background IO error as fatal error\n");
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
&bg_err, db_mutex_, &auto_recovery);
recover_context_ = context;
return;
}
if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
(bg_io_err.GetScope() == IOStatus::IOErrorScope::kIOErrorScopeFile ||
bg_io_err.GetRetryable())) {

Some files were not shown because too many files have changed in this diff.