Reduce number of allocations when compressing and simplify the code.

Before this change we allocated at least once per call: the output scratch
buffer was always allocated, the large hash table added a second allocation,
and the input scratch buffer a third. With this approach we always allocate
exactly once (a sketch of the resulting single-allocation layout follows the
benchmark numbers below).

  name                                          old speed               new speed               delta
  BM_UFlat/0      [html             ]           2.45GB/s ± 0%           2.45GB/s ± 0%   -0.13%        (p=0.000 n=11+11)
  BM_UFlat/1      [urls             ]           1.19GB/s ± 0%           1.22GB/s ± 0%   +2.48%        (p=0.000 n=11+11)
  BM_UFlat/2      [jpg              ]           17.2GB/s ± 2%           17.3GB/s ± 1%     ~           (p=0.193 n=11+11)
  BM_UFlat/3      [jpg_200          ]           1.52GB/s ± 0%           1.51GB/s ± 0%   -0.78%         (p=0.000 n=10+9)
  BM_UFlat/4      [pdf              ]           12.5GB/s ± 1%           12.5GB/s ± 1%     ~             (p=0.881 n=9+9)
  BM_UFlat/5      [html4            ]           1.86GB/s ± 0%           1.86GB/s ± 0%     ~           (p=0.123 n=11+11)
  BM_UFlat/6      [txt1             ]            793MB/s ± 0%            799MB/s ± 0%   +0.78%         (p=0.000 n=11+9)
  BM_UFlat/7      [txt2             ]            739MB/s ± 0%            744MB/s ± 0%   +0.77%        (p=0.000 n=11+11)
  BM_UFlat/8      [txt3             ]            839MB/s ± 0%            845MB/s ± 0%   +0.71%        (p=0.000 n=11+11)
  BM_UFlat/9      [txt4             ]            678MB/s ± 0%            685MB/s ± 0%   +1.01%        (p=0.000 n=11+11)
  BM_UFlat/10     [pb               ]           3.08GB/s ± 0%           3.12GB/s ± 0%   +1.21%        (p=0.000 n=11+11)
  BM_UFlat/11     [gaviota          ]            975MB/s ± 0%            976MB/s ± 0%   +0.11%        (p=0.000 n=11+11)
  BM_UFlat/12     [cp               ]           1.73GB/s ± 1%           1.74GB/s ± 1%   +0.46%        (p=0.010 n=11+11)
  BM_UFlat/13     [c                ]           1.53GB/s ± 0%           1.53GB/s ± 0%     ~           (p=0.987 n=11+10)
  BM_UFlat/14     [lsp              ]           1.65GB/s ± 0%           1.63GB/s ± 1%   -1.04%        (p=0.000 n=11+11)
  BM_UFlat/15     [xls              ]           1.08GB/s ± 0%           1.15GB/s ± 0%   +6.12%        (p=0.000 n=10+11)
  BM_UFlat/16     [xls_200          ]            944MB/s ± 0%            920MB/s ± 3%   -2.51%         (p=0.000 n=9+11)
  BM_UFlat/17     [bin              ]           1.86GB/s ± 0%           1.87GB/s ± 0%   +0.68%        (p=0.000 n=10+11)
  BM_UFlat/18     [bin_200          ]           1.91GB/s ± 3%           1.92GB/s ± 5%     ~           (p=0.356 n=11+11)
  BM_UFlat/19     [sum              ]           1.31GB/s ± 0%           1.40GB/s ± 0%   +6.53%        (p=0.000 n=11+11)
  BM_UFlat/20     [man              ]           1.42GB/s ± 0%           1.42GB/s ± 0%   +0.33%        (p=0.000 n=10+10)
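
For reference, here is a minimal standalone sketch of the one-allocation layout
this change introduces. The constants kBlockSize, kMaxHashTableSize and the
MaxCompressedLength bound are copied from snappy; the SketchWorkingMemory class
and main() are made up for illustration and are not part of the patch. The real
constructor sizes the hash table via CalculateTableSize(fragment); for a full
64 KiB block that equals kMaxHashTableSize, which the sketch uses for brevity.

  #include <algorithm>
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>
  #include <memory>

  // Constants as defined by snappy.
  constexpr size_t kBlockSize = 1 << 16;         // input is compressed in 64 KiB fragments
  constexpr size_t kMaxHashTableSize = 1 << 14;  // at most 16384 hash buckets

  // Worst-case compressed size of n input bytes (snappy's documented bound).
  constexpr size_t MaxCompressedLength(size_t n) { return 32 + n + n / 6; }

  // One allocation, carved into three regions: hash table, input scratch,
  // output scratch. (Sketch only; hypothetical names, not the real class.)
  class SketchWorkingMemory {
   public:
    explicit SketchWorkingMemory(size_t input_size) {
      const size_t fragment = std::min(input_size, kBlockSize);
      const size_t table_bytes = kMaxHashTableSize * sizeof(uint16_t);
      size_ = table_bytes + fragment + MaxCompressedLength(fragment);
      mem_.reset(new char[size_]);                     // the single allocation
      table_ = reinterpret_cast<uint16_t*>(&mem_[0]);  // [0, table_bytes)
      input_ = &mem_[table_bytes];                     // input scratch
      output_ = input_ + fragment;                     // output scratch
    }
    size_t size() const { return size_; }

   private:
    std::unique_ptr<char[]> mem_;
    size_t size_ = 0;
    uint16_t* table_ = nullptr;
    char* input_ = nullptr;
    char* output_ = nullptr;
  };

  int main() {
    // For any input of a full block or more, the single allocation is
    // 2*16384 + 65536 + (32 + 65536 + 65536/6) = 174794 bytes.
    SketchWorkingMemory wmem(1 << 20);
    std::printf("%zu\n", wmem.size());  // prints 174794
  }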
Author: alkis (2018-10-16 12:28:52 -07:00), committed by Victor Costan
parent df5548c0b3
commit 53a38e5e33
3 changed files with 55 additions and 62 deletions


@@ -36,19 +36,26 @@
 namespace snappy {
 namespace internal {
 
+// Working memory performs a single allocation to hold all scratch space
+// required for compression.
 class WorkingMemory {
  public:
-  WorkingMemory() : large_table_(NULL) { }
-  ~WorkingMemory() { delete[] large_table_; }
+  explicit WorkingMemory(size_t input_size);
+  ~WorkingMemory();
 
   // Allocates and clears a hash table using memory in "*this",
   // stores the number of buckets in "*table_size" and returns a pointer to
   // the base of the hash table.
-  uint16* GetHashTable(size_t input_size, int* table_size);
+  uint16* GetHashTable(size_t fragment_size, int* table_size) const;
+  char* GetScratchInput() const { return input_; }
+  char* GetScratchOutput() const { return output_; }
 
  private:
-  uint16 small_table_[1<<10];    // 2KB
-  uint16* large_table_;          // Allocated only when needed
+  char* mem_;      // the allocated memory, never nullptr
+  size_t size_;    // the size of the allocated memory, never 0
+  uint16* table_;  // the pointer to the hashtable
+  char* input_;    // the pointer to the input scratch buffer
+  char* output_;   // the pointer to the output scratch buffer
 
   // No copying
   WorkingMemory(const WorkingMemory&);


@@ -418,31 +418,41 @@ bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
   }
 }
 
-namespace internal {
-uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
-  // Use smaller hash table when input.size() is smaller, since we
-  // fill the table, incurring O(hash table size) overhead for
-  // compression, and if the input is short, we won't need that
-  // many hash table entries anyway.
+namespace {
+uint32 CalculateTableSize(uint32 input_size) {
   assert(kMaxHashTableSize >= 256);
-  size_t htsize = 256;
-  while (htsize < kMaxHashTableSize && htsize < input_size) {
-    htsize <<= 1;
+  if (input_size > kMaxHashTableSize) {
+    return kMaxHashTableSize;
   }
-
-  uint16* table;
-  if (htsize <= ARRAYSIZE(small_table_)) {
-    table = small_table_;
-  } else {
-    if (large_table_ == NULL) {
-      large_table_ = new uint16[kMaxHashTableSize];
-    }
-    table = large_table_;
+  if (input_size < 256) {
+    return 256;
   }
+  return 1u << (32 - __builtin_clz(input_size - 1));
+}
+}  // namespace
 
+namespace internal {
+WorkingMemory::WorkingMemory(size_t input_size) {
+  const size_t max_fragment_size = std::min(input_size, kBlockSize);
+  const size_t table_size = CalculateTableSize(max_fragment_size);
+  size_ = table_size * sizeof(*table_) + max_fragment_size +
+          MaxCompressedLength(max_fragment_size);
+  mem_ = std::allocator<char>().allocate(size_);
+  table_ = reinterpret_cast<uint16*>(mem_);
+  input_ = mem_ + table_size * sizeof(*table_);
+  output_ = input_ + max_fragment_size;
+}
+
+WorkingMemory::~WorkingMemory() {
+  std::allocator<char>().deallocate(mem_, size_);
+}
+
+uint16* WorkingMemory::GetHashTable(size_t fragment_size,
+                                    int* table_size) const {
+  const size_t htsize = CalculateTableSize(fragment_size);
+  memset(table_, 0, htsize * sizeof(*table_));
   *table_size = htsize;
-  memset(table, 0, htsize * sizeof(*table));
-  return table;
+  return table_;
 }
 }  // end namespace internal
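
As a quick check of the table-size rounding above, here is a standalone
re-implementation with a few spot checks. CalculateTableSizeSketch and
kMaxHashTableSizeSketch are hypothetical names for this example; it assumes
kMaxHashTableSize is 1 << 14 (its value in snappy-internal.h) and a GCC/Clang
toolchain for __builtin_clz, which the patch itself relies on.

  #include <cassert>
  #include <cstdint>

  constexpr uint32_t kMaxHashTableSizeSketch = 1 << 14;  // 16384, as in snappy

  uint32_t CalculateTableSizeSketch(uint32_t input_size) {
    if (input_size > kMaxHashTableSizeSketch) return kMaxHashTableSizeSketch;
    if (input_size < 256) return 256;
    // Round up to the next power of two: for 256 <= x <= 16384,
    // 1u << (32 - clz(x - 1)) is the smallest power of two >= x.
    return 1u << (32 - __builtin_clz(input_size - 1));
  }

  int main() {
    assert(CalculateTableSizeSketch(1) == 256);        // small inputs floor at 256 buckets
    assert(CalculateTableSizeSketch(256) == 256);      // exact power of two stays put
    assert(CalculateTableSizeSketch(257) == 512);      // otherwise rounds up
    assert(CalculateTableSizeSketch(70000) == 16384);  // capped at kMaxHashTableSize
    return 0;
  }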
@@ -942,17 +952,6 @@ bool GetUncompressedLength(Source* source, uint32* result) {
   return decompressor.ReadUncompressedLength(result);
 }
 
-struct Deleter {
-  Deleter() : size_(0) {}
-  explicit Deleter(size_t size) : size_(size) {}
-
-  void operator()(char* ptr) const {
-    std::allocator<char>().deallocate(ptr, size_);
-  }
-
-  size_t size_;
-};
-
 size_t Compress(Source* reader, Sink* writer) {
   size_t written = 0;
   size_t N = reader->Available();
@@ -962,9 +961,7 @@ size_t Compress(Source* reader, Sink* writer) {
   writer->Append(ulength, p-ulength);
   written += (p - ulength);
 
-  internal::WorkingMemory wmem;
-  std::unique_ptr<char, Deleter> scratch;
-  std::unique_ptr<char, Deleter> scratch_output;
+  internal::WorkingMemory wmem(N);
 
   while (N > 0) {
     // Get next block to compress (without copying if possible)
@@ -980,26 +977,19 @@ size_t Compress(Source* reader, Sink* writer) {
       pending_advance = num_to_read;
       fragment_size = num_to_read;
     } else {
-      // Read into scratch buffer
-      if (scratch == NULL) {
-        // If this is the last iteration, we want to allocate N bytes
-        // of space, otherwise the max possible kBlockSize space.
-        // num_to_read contains exactly the correct value
-        scratch = {
-            std::allocator<char>().allocate(num_to_read), Deleter(num_to_read)};
-      }
-      memcpy(scratch.get(), fragment, bytes_read);
+      char* scratch = wmem.GetScratchInput();
+      memcpy(scratch, fragment, bytes_read);
       reader->Skip(bytes_read);
 
       while (bytes_read < num_to_read) {
         fragment = reader->Peek(&fragment_size);
         size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
-        memcpy(scratch.get() + bytes_read, fragment, n);
+        memcpy(scratch + bytes_read, fragment, n);
         bytes_read += n;
         reader->Skip(n);
       }
       assert(bytes_read == num_to_read);
-      fragment = scratch.get();
+      fragment = scratch;
       fragment_size = num_to_read;
     }
     assert(fragment_size == num_to_read);
@@ -1013,17 +1003,13 @@ size_t Compress(Source* reader, Sink* writer) {
 
     // Need a scratch buffer for the output, in case the byte sink doesn't
     // have room for us directly.
-    if (scratch_output == NULL) {
-      scratch_output =
-          {std::allocator<char>().allocate(max_output), Deleter(max_output)};
-    } else {
-      // Since we encode kBlockSize regions followed by a region
-      // which is <= kBlockSize in length, a previously allocated
-      // scratch_output[] region is big enough for this iteration.
-    }
-    char* dest = writer->GetAppendBuffer(max_output, scratch_output.get());
-    char* end = internal::CompressFragment(fragment, fragment_size,
-                                           dest, table, table_size);
+
+    // Since we encode kBlockSize regions followed by a region
+    // which is <= kBlockSize in length, a previously allocated
+    // scratch_output[] region is big enough for this iteration.
+    char* dest = writer->GetAppendBuffer(max_output, wmem.GetScratchOutput());
+    char* end = internal::CompressFragment(fragment, fragment_size, dest, table,
+                                           table_size);
     writer->Append(dest, end - dest);
     written += (end - dest);


@@ -445,7 +445,7 @@ static void VerifyNonBlockedCompression(const string& input) {
   Varint::Append32(&prefix, input.size());
 
   // Setup compression table
-  snappy::internal::WorkingMemory wmem;
+  snappy::internal::WorkingMemory wmem(input.size());
   int table_size;
   uint16* table = wmem.GetHashTable(input.size(), &table_size);