Reduce number of allocations when compressing and simplify the code.

Previously, compression allocated at least once, twice when the large hash table was needed, and three times when a scratch buffer was also used. With this approach we always allocate exactly once.

name                     old speed       new speed       delta
BM_UFlat/0  [html    ]   2.45GB/s ± 0%   2.45GB/s ± 0%   -0.13%  (p=0.000 n=11+11)
BM_UFlat/1  [urls    ]   1.19GB/s ± 0%   1.22GB/s ± 0%   +2.48%  (p=0.000 n=11+11)
BM_UFlat/2  [jpg     ]   17.2GB/s ± 2%   17.3GB/s ± 1%     ~     (p=0.193 n=11+11)
BM_UFlat/3  [jpg_200 ]   1.52GB/s ± 0%   1.51GB/s ± 0%   -0.78%  (p=0.000 n=10+9)
BM_UFlat/4  [pdf     ]   12.5GB/s ± 1%   12.5GB/s ± 1%     ~     (p=0.881 n=9+9)
BM_UFlat/5  [html4   ]   1.86GB/s ± 0%   1.86GB/s ± 0%     ~     (p=0.123 n=11+11)
BM_UFlat/6  [txt1    ]    793MB/s ± 0%    799MB/s ± 0%   +0.78%  (p=0.000 n=11+9)
BM_UFlat/7  [txt2    ]    739MB/s ± 0%    744MB/s ± 0%   +0.77%  (p=0.000 n=11+11)
BM_UFlat/8  [txt3    ]    839MB/s ± 0%    845MB/s ± 0%   +0.71%  (p=0.000 n=11+11)
BM_UFlat/9  [txt4    ]    678MB/s ± 0%    685MB/s ± 0%   +1.01%  (p=0.000 n=11+11)
BM_UFlat/10 [pb      ]   3.08GB/s ± 0%   3.12GB/s ± 0%   +1.21%  (p=0.000 n=11+11)
BM_UFlat/11 [gaviota ]    975MB/s ± 0%    976MB/s ± 0%   +0.11%  (p=0.000 n=11+11)
BM_UFlat/12 [cp      ]   1.73GB/s ± 1%   1.74GB/s ± 1%   +0.46%  (p=0.010 n=11+11)
BM_UFlat/13 [c       ]   1.53GB/s ± 0%   1.53GB/s ± 0%     ~     (p=0.987 n=11+10)
BM_UFlat/14 [lsp     ]   1.65GB/s ± 0%   1.63GB/s ± 1%   -1.04%  (p=0.000 n=11+11)
BM_UFlat/15 [xls     ]   1.08GB/s ± 0%   1.15GB/s ± 0%   +6.12%  (p=0.000 n=10+11)
BM_UFlat/16 [xls_200 ]    944MB/s ± 0%    920MB/s ± 3%   -2.51%  (p=0.000 n=9+11)
BM_UFlat/17 [bin     ]   1.86GB/s ± 0%   1.87GB/s ± 0%   +0.68%  (p=0.000 n=10+11)
BM_UFlat/18 [bin_200 ]   1.91GB/s ± 3%   1.92GB/s ± 5%     ~     (p=0.356 n=11+11)
BM_UFlat/19 [sum     ]   1.31GB/s ± 0%   1.40GB/s ± 0%   +6.53%  (p=0.000 n=11+11)
BM_UFlat/20 [man     ]   1.42GB/s ± 0%   1.42GB/s ± 0%   +0.33%  (p=0.000 n=10+10)
2018-10-16 12:28:52 -07:00 · 2018-10-16 12:28:52 -07:00 · 53a38e5e33
parent df5548c0b3
commit 53a38e5e33
3 changed files with 55 additions and 62 deletions
--- a/snappy-internal.h
+++ b/snappy-internal.h
@ -36,19 +36,26 @@
 namespace snappy {
 namespace internal {

+// Working memory performs a single allocation to hold all scratch space
+// required for compression.
 class WorkingMemory {
 public:
-  WorkingMemory() : large_table_(NULL) { }
-  ~WorkingMemory() { delete[] large_table_; }
+  explicit WorkingMemory(size_t input_size);
+  ~WorkingMemory();

  // Allocates and clears a hash table using memory in "*this",
  // stores the number of buckets in "*table_size" and returns a pointer to
  // the base of the hash table.
-  uint16* GetHashTable(size_t input_size, int* table_size);
+  uint16* GetHashTable(size_t fragment_size, int* table_size) const;
+  char* GetScratchInput() const { return input_; }
+  char* GetScratchOutput() const { return output_; }

 private:
-  uint16 small_table_[1<<10];    // 2KB
-  uint16* large_table_;          // Allocated only when needed
+  char* mem_;      // the allocated memory, never nullptr
+  size_t size_;    // the size of the allocated memory, never 0
+  uint16* table_;  // the pointer to the hashtable
+  char* input_;    // the pointer to the input scratch buffer
+  char* output_;   // the pointer to the output scratch buffer

  // No copying
  WorkingMemory(const WorkingMemory&);
--- a/snappy.cc
+++ b/snappy.cc
@ -418,31 +418,41 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
  }
 }

-namespace internal {
-uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
-  // Use smaller hash table when input.size() is smaller, since we
-  // fill the table, incurring O(hash table size) overhead for
-  // compression, and if the input is short, we won't need that
-  // many hash table entries anyway.
+namespace {
+uint32 CalculateTableSize(uint32 input_size) {
  assert(kMaxHashTableSize >= 256);
-  size_t htsize = 256;
-  while (htsize < kMaxHashTableSize && htsize < input_size) {
-    htsize <<= 1;
+  if (input_size > kMaxHashTableSize) {
+    return kMaxHashTableSize;
  }
-
-  uint16* table;
-  if (htsize <= ARRAYSIZE(small_table_)) {
-    table = small_table_;
-  } else {
-    if (large_table_ == NULL) {
-      large_table_ = new uint16[kMaxHashTableSize];
-    }
-    table = large_table_;
+  if (input_size < 256) {
+    return 256;
  }
+  return 1u << (32 - __builtin_clz(input_size - 1));
+}
+}  // namespace

+namespace internal {
+WorkingMemory::WorkingMemory(size_t input_size) {
+  const size_t max_fragment_size = std::min(input_size, kBlockSize);
+  const size_t table_size = CalculateTableSize(max_fragment_size);
+  size_ = table_size * sizeof(*table_) + max_fragment_size +
+          MaxCompressedLength(max_fragment_size);
+  mem_ = std::allocator<char>().allocate(size_);
+  table_ = reinterpret_cast<uint16*>(mem_);
+  input_ = mem_ + table_size * sizeof(*table_);
+  output_ = input_ + max_fragment_size;
+}
+
+WorkingMemory::~WorkingMemory() {
+  std::allocator<char>().deallocate(mem_, size_);
+}
+
+uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+                                    int* table_size) const {
+  const size_t htsize = CalculateTableSize(fragment_size);
+  memset(table_, 0, htsize * sizeof(*table_));
  *table_size = htsize;
-  memset(table, 0, htsize * sizeof(*table));
-  return table;
+  return table_;
 }
 }  // end namespace internal

@ -942,17 +952,6 @@ bool GetUncompressedLength(Source* source, uint32* result) {
  return decompressor.ReadUncompressedLength(result);
 }

-struct Deleter {
-  Deleter() : size_(0) {}
-  explicit Deleter(size_t size) : size_(size) {}
-
-  void operator()(char* ptr) const {
-    std::allocator<char>().deallocate(ptr, size_);
-  }
-
-  size_t size_;
-};
-
 size_t Compress(Source* reader, Sink* writer) {
  size_t written = 0;
  size_t N = reader->Available();
@ -962,9 +961,7 @@ size_t Compress(Source* reader, Sink* writer) {
  writer->Append(ulength, p-ulength);
  written += (p - ulength);

-  internal::WorkingMemory wmem;
-  std::unique_ptr<char, Deleter> scratch;
-  std::unique_ptr<char, Deleter> scratch_output;
+  internal::WorkingMemory wmem(N);

  while (N > 0) {
    // Get next block to compress (without copying if possible)
@ -980,26 +977,19 @@ size_t Compress(Source* reader, Sink* writer) {
      pending_advance = num_to_read;
      fragment_size = num_to_read;
    } else {
-      // Read into scratch buffer
-      if (scratch == NULL) {
-        // If this is the last iteration, we want to allocate N bytes
-        // of space, otherwise the max possible kBlockSize space.
-        // num_to_read contains exactly the correct value
-        scratch = {
-            std::allocator<char>().allocate(num_to_read), Deleter(num_to_read)};
-      }
-      memcpy(scratch.get(), fragment, bytes_read);
+      char* scratch = wmem.GetScratchInput();
+      memcpy(scratch, fragment, bytes_read);
      reader->Skip(bytes_read);

      while (bytes_read < num_to_read) {
        fragment = reader->Peek(&fragment_size);
        size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
-        memcpy(scratch.get() + bytes_read, fragment, n);
+        memcpy(scratch + bytes_read, fragment, n);
        bytes_read += n;
        reader->Skip(n);
      }
      assert(bytes_read == num_to_read);
-      fragment = scratch.get();
+      fragment = scratch;
      fragment_size = num_to_read;
    }
    assert(fragment_size == num_to_read);
@ -1013,17 +1003,13 @@ size_t Compress(Source* reader, Sink* writer) {

    // Need a scratch buffer for the output, in case the byte sink doesn't
    // have room for us directly.
-    if (scratch_output == NULL) {
-      scratch_output =
-          {std::allocator<char>().allocate(max_output), Deleter(max_output)};
-    } else {
-      // Since we encode kBlockSize regions followed by a region
-      // which is <= kBlockSize in length, a previously allocated
-      // scratch_output[] region is big enough for this iteration.
-    }
-    char* dest = writer->GetAppendBuffer(max_output, scratch_output.get());
-    char* end = internal::CompressFragment(fragment, fragment_size,
-                                           dest, table, table_size);
+
+    // Since we encode kBlockSize regions followed by a region
+    // which is <= kBlockSize in length, a previously allocated
+    // scratch_output[] region is big enough for this iteration.
+    char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+    char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+                                           table_size);
    writer->Append(dest, end - dest);
    written += (end - dest);

--- a/snappy_unittest.cc
+++ b/snappy_unittest.cc
@ -445,7 +445,7 @@ static void VerifyNonBlockedCompression(const string& input) {
  Varint::Append32(&prefix, input.size());

  // Setup compression table
-  snappy::internal::WorkingMemory wmem;
+  snappy::internal::WorkingMemory wmem(input.size());
  int table_size;
  uint16* table = wmem.GetHashTable(input.size(), &table_size);