Use sized deallocation when releasing Zippy's scratch buffers.

name                                          old time/op             new time/op             delta
BM_UFlat/0      [html             ]            41.7µs ± 0%             41.7µs ± 0%    ~             (p=0.222 n=5+5)
BM_UFlat/1      [urls             ]             587µs ± 0%              574µs ± 0%  -2.31%          (p=0.008 n=5+5)
BM_UFlat/2      [jpg              ]            7.24µs ± 2%             7.25µs ± 2%    ~             (p=0.690 n=5+5)
BM_UFlat/3      [jpg_200          ]              130ns ± 0%              131ns ± 1%    ~             (p=0.556 n=4+5)
BM_UFlat/4      [pdf              ]            8.21µs ± 0%             8.24µs ± 1%    ~             (p=0.278 n=5+5)
BM_UFlat/5      [html4            ]             219µs ± 0%              220µs ± 0%  +0.45%          (p=0.008 n=5+5)
BM_UFlat/6      [txt1             ]             192µs ± 0%              190µs ± 0%  -0.86%          (p=0.008 n=5+5)
BM_UFlat/7      [txt2             ]             169µs ± 0%              168µs ± 0%  -0.54%          (p=0.008 n=5+5)
BM_UFlat/8      [txt3             ]             509µs ± 0%              505µs ± 0%  -0.66%          (p=0.008 n=5+5)
BM_UFlat/9      [txt4             ]             710µs ± 0%              702µs ± 0%  -1.14%          (p=0.008 n=5+5)
BM_UFlat/10     [pb               ]            38.2µs ± 0%             37.9µs ± 0%  -0.82%          (p=0.008 n=5+5)
BM_UFlat/11     [gaviota          ]             189µs ± 0%              189µs ± 0%    ~             (p=0.746 n=5+5)
BM_UFlat/12     [cp               ]            14.2µs ± 0%             14.2µs ± 1%    ~             (p=0.421 n=5+5)
BM_UFlat/13     [c                ]            7.29µs ± 0%             7.34µs ± 1%  +0.69%          (p=0.016 n=5+5)
BM_UFlat/14     [lsp              ]            2.27µs ± 0%             2.28µs ± 0%  +0.34%          (p=0.008 n=5+5)
BM_UFlat/15     [xls              ]             954µs ± 0%              900µs ± 0%  -5.67%          (p=0.008 n=5+5)
BM_UFlat/16     [xls_200          ]              213ns ± 1%              217ns ± 2%    ~             (p=0.056 n=5+5)
BM_UFlat/17     [bin              ]             276µs ± 0%              274µs ± 0%  -0.94%          (p=0.008 n=5+5)
BM_UFlat/18     [bin_200          ]              101ns ± 1%              101ns ± 1%    ~             (p=0.524 n=5+5)
BM_UFlat/19     [sum              ]            29.3µs ± 0%             27.3µs ± 0%  -6.98%          (p=0.008 n=5+5)
BM_UFlat/20     [man              ]            2.95µs ± 0%             2.95µs ± 0%    ~             (p=0.651 n=5+5)

For microbenchmarks, the overhead of allocating/deallocating should be
small (the relevant metadata for TCMalloc's PageMap will be in cache),
but this helps demonstrate that the refactoring does not adversely
impact performance.
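
(Background, not part of the commit: "sized deallocation" means handing the allocation's size back to the allocator when freeing. A delete[] on a plain char* generally cannot pass the size, since no per-array size is stored for trivially destructible element types, so TCMalloc has to recover it from its PageMap; std::allocator<char>().deallocate(ptr, n) already knows n and can forward it to the C++14 sized operator delete. The sketch below is only an illustration of that premise, with illustrative names: it replaces the global allocation functions so the chosen path is visible, and whether the sized overload is actually selected depends on the compiler and standard library, e.g. Clang's -fsized-deallocation.)

  #include <cstdio>
  #include <cstdlib>
  #include <memory>
  #include <new>

  // Replacement global allocation functions that log which deallocation
  // path is taken.  Illustration only; not part of the Snappy/Zippy code.
  void* operator new(std::size_t n) {
    if (void* p = std::malloc(n)) return p;
    throw std::bad_alloc();
  }
  void operator delete(void* p) noexcept {
    std::puts("unsized operator delete");
    std::free(p);
  }
  void operator delete(void* p, std::size_t n) noexcept {
    std::printf("sized operator delete: %zu bytes\n", n);
    std::free(p);
  }

  int main() {
    constexpr std::size_t kBytes = 1 << 16;
    // std::allocator is told the size at deallocation time and may forward
    // it to the sized operator delete above (toolchain-dependent).
    char* p = std::allocator<char>().allocate(kBytes);
    std::allocator<char>().deallocate(p, kBytes);
    return 0;
  }
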
Author: ckennelly, 2018-10-15 13:26:59 -07:00 (committed by Victor Costan)
Parent: 1b7466e143
Commit: df5548c0b3
1 changed file with 21 additions and 11 deletions

@@ -942,6 +942,17 @@ bool GetUncompressedLength(Source* source, uint32* result) {
   return decompressor.ReadUncompressedLength(result);
 }
+struct Deleter {
+  Deleter() : size_(0) {}
+  explicit Deleter(size_t size) : size_(size) {}
+  void operator()(char* ptr) const {
+    std::allocator<char>().deallocate(ptr, size_);
+  }
+  size_t size_;
+};
 size_t Compress(Source* reader, Sink* writer) {
   size_t written = 0;
   size_t N = reader->Available();
@@ -952,8 +963,8 @@ size_t Compress(Source* reader, Sink* writer) {
   written += (p - ulength);
   internal::WorkingMemory wmem;
-  char* scratch = NULL;
-  char* scratch_output = NULL;
+  std::unique_ptr<char, Deleter> scratch;
+  std::unique_ptr<char, Deleter> scratch_output;
   while (N > 0) {
     // Get next block to compress (without copying if possible)
@@ -974,20 +985,21 @@ size_t Compress(Source* reader, Sink* writer) {
         // If this is the last iteration, we want to allocate N bytes
         // of space, otherwise the max possible kBlockSize space.
        // num_to_read contains exactly the correct value
-        scratch = new char[num_to_read];
+        scratch = {
+            std::allocator<char>().allocate(num_to_read), Deleter(num_to_read)};
       }
-      memcpy(scratch, fragment, bytes_read);
+      memcpy(scratch.get(), fragment, bytes_read);
       reader->Skip(bytes_read);
       while (bytes_read < num_to_read) {
         fragment = reader->Peek(&fragment_size);
         size_t n = std::min<size_t>(fragment_size, num_to_read - bytes_read);
-        memcpy(scratch + bytes_read, fragment, n);
+        memcpy(scratch.get() + bytes_read, fragment, n);
         bytes_read += n;
         reader->Skip(n);
       }
       assert(bytes_read == num_to_read);
-      fragment = scratch;
+      fragment = scratch.get();
       fragment_size = num_to_read;
     }
     assert(fragment_size == num_to_read);
@@ -1002,13 +1014,14 @@ size_t Compress(Source* reader, Sink* writer) {
     // Need a scratch buffer for the output, in case the byte sink doesn't
     // have room for us directly.
     if (scratch_output == NULL) {
-      scratch_output = new char[max_output];
+      scratch_output =
+          {std::allocator<char>().allocate(max_output), Deleter(max_output)};
     } else {
       // Since we encode kBlockSize regions followed by a region
       // which is <= kBlockSize in length, a previously allocated
       // scratch_output[] region is big enough for this iteration.
     }
-    char* dest = writer->GetAppendBuffer(max_output, scratch_output);
+    char* dest = writer->GetAppendBuffer(max_output, scratch_output.get());
     char* end = internal::CompressFragment(fragment, fragment_size,
                                            dest, table, table_size);
     writer->Append(dest, end - dest);
@@ -1020,9 +1033,6 @@ size_t Compress(Source* reader, Sink* writer) {
   Report("snappy_compress", written, uncompressed_size);
-  delete[] scratch;
-  delete[] scratch_output;
   return written;
 }
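
(For reference, a minimal standalone sketch of the pattern the patch adopts, with illustrative names rather than the actual Snappy code: the custom deleter records how many bytes were allocated so std::allocator<char>::deallocate() can return the exact size to the allocator, and the default constructor exists so the unique_ptr can start out empty, just as scratch and scratch_output do above.)

  #include <cstddef>
  #include <memory>

  // Illustrative re-creation of the patch's scratch-buffer pattern (not
  // the Snappy source): the deleter carries the allocation size so the
  // sized deallocation path can be used when the buffer is released.
  struct SizedDeleter {
    SizedDeleter() : size_(0) {}  // lets the unique_ptr be declared empty
    explicit SizedDeleter(std::size_t size) : size_(size) {}
    void operator()(char* ptr) const {
      std::allocator<char>().deallocate(ptr, size_);
    }
    std::size_t size_;
  };

  using ScratchBuffer = std::unique_ptr<char, SizedDeleter>;

  ScratchBuffer MakeScratch(std::size_t bytes) {
    return ScratchBuffer(std::allocator<char>().allocate(bytes),
                         SizedDeleter(bytes));
  }

  int main() {
    ScratchBuffer scratch;                         // starts empty, as in Compress()
    if (!scratch) scratch = MakeScratch(1 << 16);  // allocate lazily
    scratch.get()[0] = '\0';                       // raw access goes through .get()
    return 0;                                      // sized deallocation on destruction
  }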