Optimize GetApproximateSizes() to use fewer CPU cycles.

Summary:
CPU profiling reveals GetApproximateSizes() as a performance bottleneck. The current implementation is sub-optimal: it scans every file in every level to compute the result.

We can take advantage of the fact that all levels above 0 are sorted in increasing order of key ranges and use binary search to locate the file containing the starting key. This reduces the number of comparisons required to compute the result.
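For reference, a minimal caller-side sketch of the API this change speeds up (the key ranges and the helper function name are hypothetical, and it assumes the default-column-family overload of DB::GetApproximateSizes):

#include <cstdint>
#include "rocksdb/db.h"

// Hypothetical usage: estimate the on-disk bytes covered by two key ranges.
// Each call maps to DBImpl::GetApproximateSizes below, which in turn calls
// VersionSet::ApproximateSize once per range.
void QueryRangeSizes(rocksdb::DB* db) {
  rocksdb::Range ranges[2];
  ranges[0] = rocksdb::Range("a", "c");  // keys in [a, c)
  ranges[1] = rocksdb::Range("x", "z");  // keys in [x, z)

  uint64_t sizes[2];
  db->GetApproximateSizes(ranges, 2, sizes);
  // sizes[i] now holds the approximate number of SST bytes a scan of
  // ranges[i] would touch.
}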

Test Plan: We have good test coverage. Run the tests.

Reviewers: sdong, igor, rven, dynamike

Subscribers: dynamike, maykov, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D37755
krad 2015-04-29 15:36:21 -07:00
parent fd96b55402
commit d4540654e9
3 changed files with 102 additions and 39 deletions

db/db_impl.cc

@@ -3587,7 +3587,6 @@ void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
 void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
                                  const Range* range, int n, uint64_t* sizes) {
-  // TODO(opt): better implementation
   Version* v;
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
@@ -3599,12 +3598,9 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
   for (int i = 0; i < n; i++) {
     // Convert user_key into a corresponding internal key.
-    InternalKey k1, k2;
-    k1.SetMaxPossibleForUserKey(range[i].start);
-    k2.SetMaxPossibleForUserKey(range[i].limit);
-    uint64_t start = versions_->ApproximateOffsetOf(v, k1);
-    uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
-    sizes[i] = (limit >= start ? limit - start : 0);
+    InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+    sizes[i] = versions_->ApproximateSize(v, k1.Encode(), k2.Encode());
   }
 
   {

db/version_set.cc

@@ -2802,40 +2802,101 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_num,
   return result;
 }
 
-uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
-  uint64_t result = 0;
-  const auto* vstorage = v->storage_info();
-  for (int level = 0; level < vstorage->num_levels(); level++) {
-    const std::vector<FileMetaData*>& files = vstorage->LevelFiles(level);
-    for (size_t i = 0; i < files.size(); i++) {
-      if (v->cfd_->internal_comparator().Compare(files[i]->largest, ikey) <=
-          0) {
-        // Entire file is before "ikey", so just add the file size
-        result += files[i]->fd.GetFileSize();
-      } else if (v->cfd_->internal_comparator().Compare(files[i]->smallest,
-                                                        ikey) > 0) {
-        // Entire file is after "ikey", so ignore
-        if (level > 0) {
-          // Files other than level 0 are sorted by meta->smallest, so
-          // no further files in this level will contain data for
-          // "ikey".
-          break;
-        }
-      } else {
-        // "ikey" falls in the range for this table.  Add the
-        // approximate offset of "ikey" within the table.
-        TableReader* table_reader_ptr;
-        Iterator* iter = v->cfd_->table_cache()->NewIterator(
-            ReadOptions(), env_options_, v->cfd_->internal_comparator(),
-            files[i]->fd, &table_reader_ptr);
-        if (table_reader_ptr != nullptr) {
-          result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
-        }
-        delete iter;
-      }
-    }
-  }
-  return result;
-}
+uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start,
+                                     const Slice& end) {
+  // pre-condition
+  assert(v->cfd_->internal_comparator().Compare(start, end) <= 0);
+
+  uint64_t size = 0;
+  const auto* vstorage = v->storage_info();
+
+  for (int level = 0; level < vstorage->num_non_empty_levels(); level++) {
+    const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
+    if (!files_brief.num_files) {
+      // empty level, skip exploration
+      continue;
+    }
+
+    if (!level) {
+      // level 0 files are not in sorted order, handle this case explicitly
+      size += ApproximateSizeLevel0(v, files_brief, start, end);
+      continue;
+    }
+
+    assert(level > 0);
+    assert(files_brief.num_files > 0);
+
+    // identify the file position for starting key
+    const uint64_t idx_start =
+        FindFileInRange(v->cfd_->internal_comparator(), files_brief, start,
+                        /*start=*/0, files_brief.num_files - 1);
+    assert(idx_start < files_brief.num_files);
+
+    // scan all files from the starting position until the ending position
+    // inferred from the sorted order
+    for (uint64_t i = idx_start; i < files_brief.num_files; i++) {
+      uint64_t val;
+      val = ApproximateSize(v, files_brief.files[i], end);
+      if (!val) {
+        // the files after this will not have the range
+        break;
+      }
+      size += val;
+
+      if (i == idx_start) {
+        // subtract the bytes needed to be scanned to get to the starting
+        // key
+        val = ApproximateSize(v, files_brief.files[i], start);
+        assert(size >= val);
+        size -= val;
+      }
+    }
+  }
+
+  return size;
+}
+
+uint64_t VersionSet::ApproximateSizeLevel0(Version* v,
+                                           const LevelFilesBrief& files_brief,
+                                           const Slice& key_start,
+                                           const Slice& key_end) {
+  // level 0 files are not in sorted order, we need to iterate through
+  // the list to compute the total bytes that require scanning
+  uint64_t size = 0;
+  for (size_t i = 0; i < files_brief.num_files; i++) {
+    const uint64_t start = ApproximateSize(v, files_brief.files[i], key_start);
+    const uint64_t end = ApproximateSize(v, files_brief.files[i], key_end);
+    assert(end >= start);
+    size += end - start;
+  }
+  return size;
+}
+
+uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
+                                     const Slice& key) {
+  // pre-condition
+  assert(v);
+
+  uint64_t result = 0;
+  if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) {
+    // Entire file is before "key", so just add the file size
+    result = f.fd.GetFileSize();
+  } else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) {
+    // Entire file is after "key", so ignore
+    result = 0;
+  } else {
+    // "key" falls in the range for this table.  Add the
+    // approximate offset of "key" within the table.
+    TableReader* table_reader_ptr;
+    Iterator* iter = v->cfd_->table_cache()->NewIterator(
+        ReadOptions(), env_options_, v->cfd_->internal_comparator(), f.fd,
+        &table_reader_ptr);
+    if (table_reader_ptr != nullptr) {
+      result = table_reader_ptr->ApproximateOffsetOf(key);
+    }
+    delete iter;
+  }
+  return result;
+}

db/version_set.h

@@ -618,9 +618,8 @@ class VersionSet {
   // Add all files listed in any live version to *live.
   void AddLiveFiles(std::vector<FileDescriptor>* live_list);
 
-  // Return the approximate offset in the database of the data for
-  // "key" as of version "v".
-  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+  // Return the approximate size of data to be scanned for range [start, end)
+  uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end);
 
   // Return the size of the current manifest file
   uint64_t manifest_file_size() const { return manifest_file_size_; }
@@ -657,6 +656,13 @@ class VersionSet {
   }
   };
 
+  // ApproximateSize helper
+  uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief,
+                                 const Slice& start, const Slice& end);
+
+  uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
+                           const Slice& key);
+
   // Save current contents to *log
   Status WriteSnapshot(log::Writer* log);