diff --git a/HISTORY.md b/HISTORY.md
index 8bfd738766..005e6f4812 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -35,6 +35,7 @@
 * Iterator performance is improved for `DeleteRange()` users. Internally, the iterator will skip to the end of a range tombstone when possible, instead of looping through each key and checking individually whether it is range deleted.
 * Eliminated some allocations and copies in the blob read path. Also, `PinnableSlice` now only points to the blob value and pins the backing resource (cache entry or buffer) in all cases, instead of containing a copy of the blob value. See #10625 and #10647.
 * In case of scans with async_io enabled, a few optimizations have been added to issue more asynchronous requests in parallel in order to avoid synchronous prefetching.
+* `DeleteRange()` users should see improved get/iterator performance from the mutable memtable (see #10547).
 
 ## 7.6.0 (08/19/2022)
 ### New Features
diff --git a/db/memtable.cc b/db/memtable.cc
index 53f40bb980..2acf9b6da0 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -560,16 +560,28 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
         read_seq);
   }
 
-  auto* unfragmented_iter = new MemTableIterator(
-      *this, read_options, nullptr /* arena */, true /* use_range_del_table */);
-  auto fragmented_tombstone_list =
-      std::make_shared<FragmentedRangeTombstoneList>(
-          std::unique_ptr<InternalIterator>(unfragmented_iter),
-          comparator_.comparator);
+  // take the current cache
+  std::shared_ptr<FragmentedRangeTombstoneListCache> cache =
+      std::atomic_load_explicit(cached_range_tombstone_.Access(),
+                                std::memory_order_relaxed);
+  // construct the fragmented tombstone list if necessary
+  if (!cache->initialized.load(std::memory_order_acquire)) {
+    cache->reader_mutex.lock();
+    if (!cache->tombstones) {
+      auto* unfragmented_iter =
+          new MemTableIterator(*this, read_options, nullptr /* arena */,
+                               true /* use_range_del_table */);
+      cache->tombstones = std::make_unique<FragmentedRangeTombstoneList>(
+          FragmentedRangeTombstoneList(
+              std::unique_ptr<InternalIterator>(unfragmented_iter),
+              comparator_.comparator));
+      cache->initialized.store(true, std::memory_order_release);
+    }
+    cache->reader_mutex.unlock();
+  }
 
-  auto* fragmented_iter = new FragmentedRangeTombstoneIterator(
-      fragmented_tombstone_list, comparator_.comparator, read_seq);
-  return fragmented_iter;
+  return new FragmentedRangeTombstoneIterator(cache, comparator_.comparator,
+                                              read_seq);
 }
 
 void MemTable::ConstructFragmentedRangeTombstones() {
@@ -819,6 +831,30 @@ Status MemTable::Add(SequenceNumber s, ValueType type,
     }
   }
   if (type == kTypeRangeDeletion) {
+    auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
+    size_t size = cached_range_tombstone_.Size();
+    if (allow_concurrent) {
+      range_del_mutex_.lock();
+    }
+    for (size_t i = 0; i < size; ++i) {
+      std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
+          cached_range_tombstone_.AccessAtCore(i);
+      auto new_local_cache_ref = std::make_shared<
+          const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+      // It is okay for some reader to load the old cache during invalidation,
+      // as the new sequence number is not published yet.
+      // Each core will have a shared_ptr to a shared_ptr to the cached
+      // fragmented range tombstones, so that the ref count is maintained
+      // locally per core using the per-core shared_ptr.
+      std::atomic_store_explicit(
+          local_cache_ref_ptr,
+          std::shared_ptr<FragmentedRangeTombstoneListCache>(
+              new_local_cache_ref, new_cache.get()),
+          std::memory_order_relaxed);
+    }
+    if (allow_concurrent) {
+      range_del_mutex_.unlock();
+    }
     is_range_del_table_empty_.store(false, std::memory_order_relaxed);
   }
   UpdateOldestKeyTime();
diff --git a/db/memtable.h b/db/memtable.h
index 74b3c64bbb..86c2160af9 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -536,6 +536,11 @@ class MemTable {
                               size_t protection_bytes_per_key,
                               bool allow_data_in_errors = false);
 
+  // makes sure there is a single range tombstone writer invalidating the cache
+  std::mutex range_del_mutex_;
+  CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
+      cached_range_tombstone_;
+
 private:
  enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
 
diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc
index 8c90ff5b61..356eee181c 100644
--- a/db/range_tombstone_fragmenter.cc
+++ b/db/range_tombstone_fragmenter.cc
@@ -251,6 +251,22 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
   Invalidate();
 }
 
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+    const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones_cache,
+    const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+    SequenceNumber _lower_bound)
+    : tombstone_start_cmp_(icmp.user_comparator()),
+      tombstone_end_cmp_(icmp.user_comparator()),
+      icmp_(&icmp),
+      ucmp_(icmp.user_comparator()),
+      tombstones_cache_ref_(tombstones_cache),
+      tombstones_(tombstones_cache_ref_->tombstones.get()),
+      upper_bound_(_upper_bound),
+      lower_bound_(_lower_bound) {
+  assert(tombstones_ != nullptr);
+  Invalidate();
+}
+
 void FragmentedRangeTombstoneIterator::SeekToFirst() {
   pos_ = tombstones_->begin();
   seq_pos_ = tombstones_->seq_begin();
diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h
index f323db5d75..0c8cbf1817 100644
--- a/db/range_tombstone_fragmenter.h
+++ b/db/range_tombstone_fragmenter.h
@@ -17,6 +17,15 @@
 #include "table/internal_iterator.h"
 
 namespace ROCKSDB_NAMESPACE {
+struct FragmentedRangeTombstoneList;
+
+struct FragmentedRangeTombstoneListCache {
+  // ensures only the first reader needs to initialize the tombstone list
+  std::mutex reader_mutex;
+  std::unique_ptr<FragmentedRangeTombstoneList> tombstones = nullptr;
+  // readers check this flag first to avoid locking reader_mutex
+  std::atomic<bool> initialized = false;
+};
 
 struct FragmentedRangeTombstoneList {
  public:
@@ -113,6 +122,10 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
       const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
       const InternalKeyComparator& icmp, SequenceNumber upper_bound,
       SequenceNumber lower_bound = 0);
+  FragmentedRangeTombstoneIterator(
+      const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones,
+      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+      SequenceNumber lower_bound = 0);
 
   void SeekToFirst() override;
   void SeekToLast() override;
@@ -260,6 +273,7 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
   const InternalKeyComparator* icmp_;
   const Comparator* ucmp_;
   std::shared_ptr<const FragmentedRangeTombstoneList> tombstones_ref_;
+  std::shared_ptr<FragmentedRangeTombstoneListCache> tombstones_cache_ref_;
   const FragmentedRangeTombstoneList* tombstones_;
   SequenceNumber upper_bound_;
   SequenceNumber lower_bound_;
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index b7fcdd786a..1e7c19044f 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -469,7 +469,7 @@ class DB {
   // a `Status::InvalidArgument` is returned.
   //
   // This feature is now usable in production, with the following caveats:
-  // 1) Accumulating many range tombstones in the memtable will degrade read
+  // 1) Accumulating too many range tombstones in the memtable will degrade read
   // performance; this can be avoided by manually flushing occasionally.
   // 2) Limiting the maximum number of open files in the presence of range
   // tombstones can degrade read performance. To avoid this problem, set
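
The diff above caches the fragmented range tombstone list for the mutable memtable: readers lazily build the list inside the current cache object with double-checked locking (an acquire load of initialized, then reader_mutex), and hold a shared_ptr to the cache so a concurrent invalidation cannot free it from under them, while a writer that adds a range tombstone invalidates the cache by atomically swapping a fresh, empty cache into every per-core slot. The sketch below is a simplified, self-contained C++ illustration of that pattern only, not RocksDB code; names such as TombstoneCache, MemtableLike, GetOrBuild, BuildFragmentedList, and kNumShards are invented for illustration, and a plain std::array stands in for RocksDB's CoreLocalArray.

// Sketch only: a per-shard cache of a lazily built "fragmented list",
// invalidated by writers and rebuilt on demand by readers.
#include <array>
#include <atomic>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

struct FragmentedList {
  std::vector<std::string> fragments;  // stand-in for the real fragmented data
};

struct TombstoneCache {
  std::mutex reader_mutex;               // serializes the first reader's build
  std::unique_ptr<FragmentedList> list;  // built lazily by a reader
  std::atomic<bool> initialized{false};  // checked before taking reader_mutex
};

class MemtableLike {
 public:
  static constexpr size_t kNumShards = 4;  // stand-in for CoreLocalArray

  MemtableLike() { Invalidate(); }

  // Reader path: load the current cache for this shard, build the list once
  // if needed (double-checked locking), and return a shared_ptr that keeps
  // the list alive even if a writer swaps in a new cache concurrently.
  std::shared_ptr<TombstoneCache> GetOrBuild(size_t shard) {
    std::shared_ptr<TombstoneCache> cache = std::atomic_load_explicit(
        &slots_[shard % kNumShards], std::memory_order_relaxed);
    if (!cache->initialized.load(std::memory_order_acquire)) {
      std::lock_guard<std::mutex> guard(cache->reader_mutex);
      if (!cache->list) {
        cache->list = BuildFragmentedList();
        cache->initialized.store(true, std::memory_order_release);
      }
    }
    return cache;
  }

  // Writer path: publish one fresh, empty cache into every shard slot. Each
  // slot gets its own control block via the shared_ptr aliasing constructor,
  // so reader ref-count traffic stays shard-local while the underlying
  // TombstoneCache is shared and kept alive by the inner shared_ptr owned by
  // each slot's control block.
  void Invalidate() {
    auto new_cache = std::make_shared<TombstoneCache>();
    std::lock_guard<std::mutex> guard(writer_mutex_);
    for (auto& slot : slots_) {
      auto local_ref =
          std::make_shared<const std::shared_ptr<TombstoneCache>>(new_cache);
      std::atomic_store_explicit(
          &slot, std::shared_ptr<TombstoneCache>(local_ref, new_cache.get()),
          std::memory_order_relaxed);
    }
  }

 private:
  // Placeholder for the expensive fragmentation work done by the first reader.
  std::unique_ptr<FragmentedList> BuildFragmentedList() {
    auto list = std::make_unique<FragmentedList>();
    list->fragments = {"[a, c) @ 5", "[e, g) @ 7"};
    return list;
  }

  std::mutex writer_mutex_;  // a single writer invalidates at a time
  std::array<std::shared_ptr<TombstoneCache>, kNumShards> slots_;
};

The aliasing step in Invalidate() is what the comment in MemTable::Add refers to: each slot owns its own control block, so readers on different cores bump different reference counts instead of contending on one shared count, and the shared cache object is released only after every per-core reference and every outstanding reader reference is gone.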