2013-10-17 00:33:49 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "util/blob_store.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
// BlobChunk
|
|
|
|
bool BlobChunk::ImmediatelyBefore(const BlobChunk& chunk) const {
|
|
|
|
// overlapping!?
|
|
|
|
assert(!Overlap(chunk));
|
|
|
|
// size == 0 is a marker, not a block
|
|
|
|
return size != 0 &&
|
|
|
|
bucket_id == chunk.bucket_id &&
|
|
|
|
offset + size == chunk.offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool BlobChunk::Overlap(const BlobChunk &chunk) const {
|
|
|
|
return size != 0 && chunk.size != 0 && bucket_id == chunk.bucket_id &&
|
|
|
|
((offset >= chunk.offset && offset < chunk.offset + chunk.size) ||
|
|
|
|
(chunk.offset >= offset && chunk.offset < offset + size));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Blob
|
|
|
|
// Serializes the blob as a flat sequence of fixed32 triples
// (bucket_id, offset, size), one triple per chunk.
string Blob::ToString() const {
  string encoded;
  for (const auto& chunk : chunks) {
    PutFixed32(&encoded, chunk.bucket_id);
    PutFixed32(&encoded, chunk.offset);
    PutFixed32(&encoded, chunk.size);
  }
  return encoded;
}
|
|
|
|
|
|
|
|
// Deserializes a Blob from the encoding produced by Blob::ToString():
// a flat sequence of fixed32 triples (bucket_id, offset, size).
// BUGFIX: the previous loop condition spun forever on a trailing partial
// field (1-3 leftover bytes) because the inner loop could not advance `i`
// while the outer loop kept running; such a malformed tail is now ignored.
Blob::Blob(const std::string& blob) {
  for (uint32_t i = 0; i + sizeof(uint32_t) <= blob.size(); ) {
    uint32_t t[3] = {0};
    // decode up to three fields; fields past the end of the input stay zero
    for (int j = 0; j < 3 && i + sizeof(uint32_t) <= blob.size();
         ++j, i += sizeof(uint32_t)) {
      t[j] = DecodeFixed32(blob.data() + i);
    }
    chunks.push_back(BlobChunk(t[0], t[1], t[2]));
  }
}
|
|
|
|
|
|
|
|
// FreeList
|
|
|
|
// Returns every chunk of `blob` to the free list. A chunk that directly
// continues the list's last entry is merged into it instead of being
// appended, which keeps the list compact.
Status FreeList::Free(const Blob& blob) {
  for (const auto& chunk : blob.chunks) {
    free_blocks_ += chunk.size;
    const bool extends_tail =
        !fifo_free_chunks_.empty() &&
        fifo_free_chunks_.back().ImmediatelyBefore(chunk);
    if (extends_tail) {
      // coalesce with the tail entry
      fifo_free_chunks_.back().size += chunk.size;
    } else {
      fifo_free_chunks_.push_back(chunk);
    }
  }
  return Status::OK();
}
|
|
|
|
|
|
|
|
// Carves `blocks` blocks off the front of the free list and records them as
// chunks in *blob. Returns Status::Incomplete when the free list does not
// hold enough blocks; in that case nothing is modified.
Status FreeList::Allocate(uint32_t blocks, Blob* blob) {
  // refuse the request outright if the free list cannot cover it
  if (blocks > free_blocks_) {
    return Status::Incomplete("");
  }

  free_blocks_ -= blocks;
  blob->chunks.clear();

  uint32_t needed = blocks;
  while (needed > 0) {
    assert(!fifo_free_chunks_.empty());
    BlobChunk& head = fifo_free_chunks_.front();
    if (needed >= head.size) {
      // consume the whole head chunk
      blob->chunks.push_back(head);
      needed -= head.size;
      fifo_free_chunks_.pop_front();
    } else {
      // carve just the needed blocks off the front of the head chunk
      blob->chunks.push_back(BlobChunk(head.bucket_id, head.offset, needed));
      head.offset += needed;
      head.size -= needed;
      needed = 0;
    }
  }
  assert(needed == 0);

  return Status::OK();
}
|
|
|
|
|
|
|
|
bool FreeList::Overlap(const Blob &blob) const {
|
|
|
|
for (auto chunk : blob.chunks) {
|
2013-10-23 00:44:00 +00:00
|
|
|
for (auto itr = fifo_free_chunks_.begin();
|
|
|
|
itr != fifo_free_chunks_.end();
|
|
|
|
++itr) {
|
|
|
|
if (itr->Overlap(chunk)) {
|
2013-10-17 00:33:49 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// BlobStore
|
|
|
|
// Creates the backing directory if needed, pre-allocates the fixed-size
// array of bucket file handles (capacity max_buckets) and opens the first
// bucket so the store is immediately usable.
BlobStore::BlobStore(const string& directory,
                     uint64_t block_size,
                     uint32_t blocks_per_bucket,
                     uint32_t max_buckets,
                     Env* env) :
  directory_(directory),
  block_size_(block_size),
  blocks_per_bucket_(blocks_per_bucket),
  env_(env),
  max_buckets_(max_buckets) {
  env_->CreateDirIfMissing(directory_);

  // bucket files are accessed through plain Read/Write calls, not mmap
  storage_options_.use_mmap_writes = false;
  storage_options_.use_mmap_reads = false;

  buckets_size_ = 0;
  // fixed-capacity array of bucket file handles; buckets_size_ tracks how
  // many slots are currently in use
  buckets_ = new unique_ptr<RandomRWFile>[max_buckets_];

  // NOTE(review): both CreateDirIfMissing and CreateNewBucket return a
  // Status that is ignored here -- a failure leaves the store without a
  // usable bucket; confirm callers can tolerate that.
  CreateNewBucket();
}
|
|
|
|
|
|
|
|
BlobStore::~BlobStore() {
  // TODO we don't care about recovery for now
  // Deleting the array destroys each unique_ptr, closing the bucket files;
  // anything not explicitly Sync()ed may be lost.
  delete [] buckets_;
}
|
|
|
|
|
2013-10-23 00:44:00 +00:00
|
|
|
// Allocates enough blocks for `value`, writes the data into them and
// returns the block locations in *blob. On any write failure the allocated
// blocks are released again and the error is propagated. The tail of the
// last block is zero-filled so stale data from a previous occupant never
// leaks into a later Get.
Status BlobStore::Put(const Slice& value, Blob* blob) {
  // convert size to number of blocks
  Status s = Allocate((value.size() + block_size_ - 1) / block_size_, blob);
  if (!s.ok()) {
    return s;
  }
  auto size_left = (uint64_t) value.size();

  uint64_t offset = 0; // in bytes, not blocks
  for (auto chunk : blob->chunks) {
    uint64_t write_size = min(chunk.size * block_size_, size_left);
    assert(chunk.bucket_id < buckets_size_);
    s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_,
                                               Slice(value.data() + offset,
                                                     write_size));
    if (!s.ok()) {
      Delete(*blob);
      return s;
    }
    offset += write_size;
    size_left -= write_size;
    if (write_size < chunk.size * block_size_) {
      // if we have any space left in the block, fill it up with zeros
      string zero_string(chunk.size * block_size_ - write_size, 0);
      s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_ +
                                                 write_size,
                                                 Slice(zero_string));
      // BUGFIX: the status of this padding write used to be silently
      // dropped; a failed padding write now releases the blob and
      // propagates the error like any other write failure
      if (!s.ok()) {
        Delete(*blob);
        return s;
      }
    }
  }

  if (size_left > 0) {
    Delete(*blob);
    return Status::Corruption("Tried to write more data than fits in the blob");
  }

  return Status::OK();
}
|
|
|
|
|
|
|
|
// Reads the data referenced by `blob` into *value. Returns Corruption on a
// short read and clears *value on any failure.
Status BlobStore::Get(const Blob& blob,
                      string* value) const {
  {
    // assert that it doesn't overlap with free list
    // it will get compiled out for release
    MutexLock l(&free_list_mutex_);
    assert(!free_list_.Overlap(blob));
  }

  // allocate the full block-aligned size up front; trailing padding is
  // trimmed below
  value->resize(blob.Size() * block_size_);

  uint64_t offset = 0; // in bytes, not blocks
  for (auto chunk : blob.chunks) {
    Slice result;
    assert(chunk.bucket_id < buckets_size_);
    Status s;
    s = buckets_[chunk.bucket_id].get()->Read(chunk.offset * block_size_,
                                              chunk.size * block_size_,
                                              &result,
                                              &value->at(offset));
    if (!s.ok()) {
      value->clear();
      return s;
    }
    // a short read means the bucket file is truncated or corrupt
    if (result.size() < chunk.size * block_size_) {
      value->clear();
      return Status::Corruption("Could not read in from file");
    }
    offset += chunk.size * block_size_;
  }

  // remove the '\0's at the end of the string
  // NOTE(review): this truncates at the FIRST '\0', so values containing
  // embedded zero bytes come back cut short -- confirm callers only store
  // zero-free data
  value->erase(find(value->begin(), value->end(), '\0'), value->end());

  return Status::OK();
}
|
|
|
|
|
|
|
|
// Returns the blob's blocks to the free list (under the free-list mutex)
// so they can be reused. The on-disk bytes are not touched.
Status BlobStore::Delete(const Blob& blob) {
  MutexLock l(&free_list_mutex_);
  return free_list_.Free(blob);
}
|
|
|
|
|
2013-10-23 00:44:00 +00:00
|
|
|
// Flushes every open bucket file to stable storage, stopping at the first
// failure and returning its status.
Status BlobStore::Sync() {
  for (size_t i = 0; i < buckets_size_; ++i) {
    auto status = buckets_[i]->Sync();
    if (!status.ok()) {
      return status;
    }
  }
  return Status::OK();
}
|
|
|
|
|
2013-10-17 00:33:49 +00:00
|
|
|
// Allocates `blocks` blocks into *blob from the free list. If the free
// list is exhausted, grows the store by one bucket and retries exactly
// once; the status of the final attempt is returned.
Status BlobStore::Allocate(uint32_t blocks, Blob* blob) {
  MutexLock l(&free_list_mutex_);

  Status status = free_list_.Allocate(blocks, blob);
  if (status.ok()) {
    return status;
  }

  // free list could not satisfy the request -- add a bucket and retry
  status = CreateNewBucket();
  if (!status.ok()) {
    return status;
  }

  return free_list_.Allocate(blocks, blob);
}
|
|
|
|
|
2013-10-23 00:44:00 +00:00
|
|
|
// called with free_list_mutex_ held
|
2013-10-17 00:33:49 +00:00
|
|
|
// Opens the next bucket file ("<directory>/<id>.bs"), pre-allocates its
// space where the filesystem supports it, and hands the whole bucket to
// the free list. Returns NotSupported once max_buckets_ is reached.
Status BlobStore::CreateNewBucket() {
  MutexLock l(&buckets_mutex_);

  if (buckets_size_ >= max_buckets_) {
    return Status::NotSupported("Max size exceeded\n");
  }

  int new_bucket_id = buckets_size_;

  // BUGFIX: the filename used to be built with sprintf() into a fixed
  // 200-byte stack buffer, which could overflow for a long directory_
  // path; build it with std::string instead
  string fname = directory_ + "/" + to_string(new_bucket_id) + ".bs";

  Status s = env_->NewRandomRWFile(fname,
                                   &buckets_[new_bucket_id],
                                   storage_options_);
  if (!s.ok()) {
    return s;
  }

  // whether Allocate succeeds or not, does not affect the overall correctness
  // of this function - calling Allocate is really optional
  // (also, tmpfs does not support allocate)
  buckets_[new_bucket_id].get()->Allocate(0, block_size_ * blocks_per_bucket_);

  buckets_size_ = new_bucket_id + 1;

  // the entire new bucket starts out free
  return free_list_.Free(Blob(new_bucket_id, 0, blocks_per_bucket_));
}
|
|
|
|
|
|
|
|
} // namespace rocksdb
|