2013-10-16 21:59:46 +00:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
2013-10-05 05:32:05 +00:00
|
|
|
#pragma once
|
2014-01-02 19:26:57 +00:00
|
|
|
|
2013-05-24 19:52:45 +00:00
|
|
|
#include <atomic>
|
2012-03-09 00:23:21 +00:00
|
|
|
#include <deque>
|
2014-07-03 22:47:02 +00:00
|
|
|
#include <limits>
|
2011-03-18 22:37:00 +00:00
|
|
|
#include <set>
|
2014-01-17 05:56:26 +00:00
|
|
|
#include <utility>
|
2013-06-05 18:22:38 +00:00
|
|
|
#include <vector>
|
2014-02-26 18:03:34 +00:00
|
|
|
#include <string>
|
2013-12-31 02:33:57 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "db/log_writer.h"
|
|
|
|
#include "db/snapshot.h"
|
2014-01-24 22:30:28 +00:00
|
|
|
#include "db/column_family.h"
|
2013-11-12 19:53:26 +00:00
|
|
|
#include "db/version_edit.h"
|
2014-01-02 19:26:57 +00:00
|
|
|
#include "memtable_list.h"
|
|
|
|
#include "port/port.h"
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/memtablerep.h"
|
|
|
|
#include "rocksdb/transaction_log.h"
|
2014-01-14 22:49:31 +00:00
|
|
|
#include "util/autovector.h"
|
2014-07-03 22:47:02 +00:00
|
|
|
#include "util/stop_watch.h"
|
2014-02-27 19:38:55 +00:00
|
|
|
#include "util/thread_local.h"
|
2014-09-05 00:40:41 +00:00
|
|
|
#include "util/scoped_arena_iterator.h"
|
2014-01-30 04:40:41 +00:00
|
|
|
#include "db/internal_stats.h"
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define a DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With the new push model, overhead on the Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 18:20:25 +00:00
|
|
|
#include "db/write_controller.h"
|
2012-08-14 22:20:36 +00:00
|
|
|
|
2013-10-04 04:49:15 +00:00
|
|
|
namespace rocksdb {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
class MemTable;
|
|
|
|
class TableCache;
|
|
|
|
class Version;
|
|
|
|
class VersionEdit;
|
|
|
|
class VersionSet;
|
2014-01-10 01:52:11 +00:00
|
|
|
class CompactionFilterV2;
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created to serve as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to the memtable list and version set, which add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
2014-06-02 23:38:00 +00:00
|
|
|
class Arena;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
class DBImpl : public DB {
|
|
|
|
public:
|
2014-02-05 21:12:23 +00:00
|
|
|
// Constructs a DBImpl from DB-wide options (DBOptions, not per-column-family
// options) and a name string.
// NOTE(review): dbname is presumably the database path/name -- confirm
// against the implementation file.
DBImpl(const DBOptions& options, const std::string& dbname);
|
2011-03-18 22:37:00 +00:00
|
|
|
virtual ~DBImpl();
|
|
|
|
|
|
|
|
// Implementations of the DB interface
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Put;
|
|
|
|
virtual Status Put(const WriteOptions& options,
|
2014-02-11 01:04:44 +00:00
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
const Slice& value);
|
|
|
|
using DB::Merge;
|
|
|
|
virtual Status Merge(const WriteOptions& options,
|
2014-02-11 01:04:44 +00:00
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& value);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Delete;
|
|
|
|
virtual Status Delete(const WriteOptions& options,
|
2014-02-11 01:04:44 +00:00
|
|
|
ColumnFamilyHandle* column_family, const Slice& key);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Write;
|
2011-03-18 22:37:00 +00:00
|
|
|
// DB interface implementation: takes the write options and a pointer to a
// WriteBatch of updates, and reports the outcome as a Status.
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Get;
|
2011-03-18 22:37:00 +00:00
|
|
|
virtual Status Get(const ReadOptions& options,
|
2014-02-11 01:04:44 +00:00
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
2011-03-18 22:37:00 +00:00
|
|
|
std::string* value);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::MultiGet;
|
|
|
|
virtual std::vector<Status> MultiGet(
|
|
|
|
const ReadOptions& options,
|
2014-02-11 01:04:44 +00:00
|
|
|
const std::vector<ColumnFamilyHandle*>& column_family,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
const std::vector<Slice>& keys, std::vector<std::string>* values);
|
2013-07-06 01:49:18 +00:00
|
|
|
|
2014-01-02 17:08:12 +00:00
|
|
|
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
|
2014-01-06 21:31:06 +00:00
|
|
|
const std::string& column_family,
|
2014-02-11 01:04:44 +00:00
|
|
|
ColumnFamilyHandle** handle);
|
|
|
|
// Takes a column family handle and reports the outcome as a Status.
// NOTE(review): presumably removes the column family the handle refers to;
// handle ownership after the call is not visible here -- confirm in the
// implementation.
virtual Status DropColumnFamily(ColumnFamilyHandle* column_family);
|
2014-01-02 17:08:12 +00:00
|
|
|
|
2013-07-26 19:57:01 +00:00
|
|
|
// Returns false if key doesn't exist in the database and true if it may.
|
|
|
|
// If value_found is not passed in as null, then return the value if found in
|
|
|
|
// memory. On return, value_found will be set to true if the value was found,
|
|
|
|
// and to false otherwise.
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::KeyMayExist;
|
2013-07-26 19:57:01 +00:00
|
|
|
virtual bool KeyMayExist(const ReadOptions& options,
|
2014-02-11 01:04:44 +00:00
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
std::string* value, bool* value_found = nullptr);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::NewIterator;
|
|
|
|
virtual Iterator* NewIterator(const ReadOptions& options,
|
2014-02-11 01:04:44 +00:00
|
|
|
ColumnFamilyHandle* column_family);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
virtual Status NewIterators(
|
|
|
|
const ReadOptions& options,
|
2014-03-08 00:12:34 +00:00
|
|
|
const std::vector<ColumnFamilyHandle*>& column_families,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
std::vector<Iterator*>* iterators);
|
2011-03-18 22:37:00 +00:00
|
|
|
// Returns a pointer to a const Snapshot.
// NOTE(review): the column-family commit message says the snapshot spans all
// column families "for now" -- confirm this still holds.
virtual const Snapshot* GetSnapshot();
|
|
|
|
// Takes a pointer to a const Snapshot and returns nothing.
// NOTE(review): presumably the counterpart of GetSnapshot() -- confirm in the
// implementation.
virtual void ReleaseSnapshot(const Snapshot* snapshot);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::GetProperty;
|
2014-02-11 01:04:44 +00:00
|
|
|
virtual bool GetProperty(ColumnFamilyHandle* column_family,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
const Slice& property, std::string* value);
|
2014-07-28 22:28:53 +00:00
|
|
|
using DB::GetIntProperty;
|
|
|
|
virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& property, uint64_t* value) override;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::GetApproximateSizes;
|
2014-02-11 01:04:44 +00:00
|
|
|
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
const Range* range, int n, uint64_t* sizes);
|
|
|
|
using DB::CompactRange;
|
2014-02-11 01:04:44 +00:00
|
|
|
virtual Status CompactRange(ColumnFamilyHandle* column_family,
|
2014-01-24 17:27:29 +00:00
|
|
|
const Slice* begin, const Slice* end,
|
2014-07-17 00:39:18 +00:00
|
|
|
bool reduce_level = false, int target_level = -1,
|
|
|
|
uint32_t target_path_id = 0);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
|
|
|
|
using DB::NumberLevels;
|
2014-02-11 01:04:44 +00:00
|
|
|
virtual int NumberLevels(ColumnFamilyHandle* column_family);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::MaxMemCompactionLevel;
|
2014-02-11 01:04:44 +00:00
|
|
|
virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Level0StopWriteTrigger;
|
2014-02-11 01:04:44 +00:00
|
|
|
virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family);
|
[RocksDB] BackupableDB
Summary:
In this diff I present you BackupableDB v1. You can easily use it to backup your DB and it will do incremental snapshots for you.
Let's first describe how you would use BackupableDB. It's inheriting StackableDB interface so you can easily construct it with your DB object -- it will add a method RollTheSnapshot() to the DB object. When you call RollTheSnapshot(), current snapshot of the DB will be stored in the backup dir. To restore, you can just call RestoreDBFromBackup() on a BackupableDB (which is a static method) and it will restore all files from the backup dir. In the next version, it will even support automatic backuping every X minutes.
There are multiple things you can configure:
1. backup_env and db_env can be different, which is awesome because then you can easily backup to HDFS or wherever you feel like.
2. sync - if true, it *guarantees* backup consistency on machine reboot
3. number of snapshots to keep - this will keep last N snapshots around if you want, for some reason, be able to restore from an earlier snapshot. All the backuping is done in incremental fashion - if we already have 00010.sst, we will not copy it again. *IMPORTANT* -- This is based on assumption that 00010.sst never changes - two files named 00010.sst from the same DB will always be exactly the same. Is this true? I always copy manifest, current and log files.
4. You can decide if you want to flush the memtables before you backup, or you're fine with backing up the log files -- either way, you get a complete and consistent view of the database at a time of backup.
5. More things you can find in BackupableDBOptions
Here is the directory structure I use:
backup_dir/CURRENT_SNAPSHOT - just 4 bytes holding the latest snapshot
0, 1, 2, ... - files containing serialized version of each snapshot - containing a list of files
files/*.sst - sst files shared between snapshots - if one snapshot references 00010.sst and another one needs to backup it from the DB, it will just reference the same file
files/ 0/, 1/, 2/, ... - snapshot directories containing private snapshot files - current, manifest and log files
All the files are ref counted and deleted immediately when they get out of scope.
Some other stuff in this diff:
1. Added GetEnv() method to the DB. Discussed with @haobo and we agreed that it seems right thing to do.
2. Fixed StackableDB interface. The way it was set up before, I was not able to implement BackupableDB.
Test Plan:
I have a unittest, but please don't look at this yet. I just hacked it up to help me with debugging. I will write a lot of good tests and update the diff.
Also, `make asan_check`
Reviewers: dhruba, haobo, emayanke
Reviewed By: dhruba
CC: leveldb, haobo
Differential Revision: https://reviews.facebook.net/D14295
2013-12-09 22:06:52 +00:00
|
|
|
virtual const std::string& GetName() const;
|
2013-11-25 20:39:23 +00:00
|
|
|
virtual Env* GetEnv() const;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::GetOptions;
|
2014-02-11 01:04:44 +00:00
|
|
|
virtual const Options& GetOptions(ColumnFamilyHandle* column_family) const;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Flush;
|
|
|
|
virtual Status Flush(const FlushOptions& options,
|
2014-02-11 01:04:44 +00:00
|
|
|
ColumnFamilyHandle* column_family);
|
2014-04-15 20:39:26 +00:00
|
|
|
|
|
|
|
virtual SequenceNumber GetLatestSequenceNumber() const;
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
2012-09-15 00:11:35 +00:00
|
|
|
virtual Status DisableFileDeletions();
|
2014-01-02 11:33:42 +00:00
|
|
|
virtual Status EnableFileDeletions(bool force);
|
2014-08-26 23:26:29 +00:00
|
|
|
virtual int IsFileDeletionsEnabled() const;
|
2013-11-08 23:23:46 +00:00
|
|
|
// All the returned filenames start with "/"
|
2012-11-06 19:21:57 +00:00
|
|
|
virtual Status GetLiveFiles(std::vector<std::string>&,
|
2013-10-03 21:38:32 +00:00
|
|
|
uint64_t* manifest_file_size,
|
|
|
|
bool flush_memtable = true);
|
2013-08-06 19:54:37 +00:00
|
|
|
virtual Status GetSortedWalFiles(VectorLogPtr& files);
|
2014-04-15 20:39:26 +00:00
|
|
|
|
2014-02-28 19:50:36 +00:00
|
|
|
virtual Status GetUpdatesSince(
|
|
|
|
SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
|
|
|
|
const TransactionLogIterator::ReadOptions&
|
|
|
|
read_options = TransactionLogIterator::ReadOptions());
|
2013-08-22 21:32:53 +00:00
|
|
|
virtual Status DeleteFile(std::string name);
|
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compression options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
|
2014-04-15 20:39:26 +00:00
|
|
|
#endif // ROCKSDB_LITE
|
2012-11-26 21:56:45 +00:00
|
|
|
|
2014-03-20 21:18:29 +00:00
|
|
|
// checks if all live files exist on file system and that their file sizes
|
|
|
|
// match to our in-memory records
|
|
|
|
virtual Status CheckConsistency();
|
|
|
|
|
2013-12-03 14:39:07 +00:00
|
|
|
virtual Status GetDbIdentity(std::string& identity);
|
|
|
|
|
2014-02-01 00:45:20 +00:00
|
|
|
Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
|
2014-07-17 00:39:18 +00:00
|
|
|
int output_level, uint32_t output_path_id,
|
|
|
|
const Slice* begin, const Slice* end);
|
2014-01-15 00:19:09 +00:00
|
|
|
|
2014-04-15 22:59:34 +00:00
|
|
|
#ifndef ROCKSDB_LITE
|
2011-03-18 22:37:00 +00:00
|
|
|
// Extra methods (for testing) that are not in the public DB interface
|
2014-04-15 20:39:26 +00:00
|
|
|
// Implemented in db_impl_debug.cc
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2013-06-05 18:22:38 +00:00
|
|
|
// Compact any files in the named level that overlap [*begin, *end]
|
2014-02-07 22:47:16 +00:00
|
|
|
Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
|
|
|
|
ColumnFamilyHandle* column_family = nullptr);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2013-10-14 22:12:15 +00:00
|
|
|
// Force current memtable contents to be flushed.
|
2014-03-18 19:25:08 +00:00
|
|
|
Status TEST_FlushMemTable(bool wait = true);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2012-06-23 02:30:03 +00:00
|
|
|
// Wait for memtable compaction
|
2014-02-07 22:47:16 +00:00
|
|
|
Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
|
2012-06-23 02:30:03 +00:00
|
|
|
|
|
|
|
// Wait for any compaction
|
|
|
|
Status TEST_WaitForCompact();
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Return an internal iterator over the current state of the database.
|
|
|
|
// The keys of this iterator are internal keys (see format.h).
|
|
|
|
// The returned iterator should be deleted when no longer needed.
|
2014-09-05 00:40:41 +00:00
|
|
|
Iterator* TEST_NewInternalIterator(
|
|
|
|
Arena* arena, ColumnFamilyHandle* column_family = nullptr);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2011-03-22 18:32:49 +00:00
|
|
|
// Return the maximum overlapping data (in bytes) at next level for any
|
|
|
|
// file at a level >= 1.
|
2014-02-07 22:47:16 +00:00
|
|
|
int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family =
|
|
|
|
nullptr);
|
2011-03-22 18:32:49 +00:00
|
|
|
|
2013-01-11 01:18:50 +00:00
|
|
|
// Return the current manifest file no.
|
|
|
|
uint64_t TEST_Current_Manifest_FileNo();
|
2013-05-06 18:41:01 +00:00
|
|
|
|
|
|
|
// Triggers a background call for testing.
|
|
|
|
void TEST_PurgeObsoleteteWAL();
|
|
|
|
|
2013-10-17 20:33:39 +00:00
|
|
|
// get total level0 file size. Only for testing.
|
2014-01-16 00:18:04 +00:00
|
|
|
uint64_t TEST_GetLevel0TotalSize();
|
2013-10-17 20:33:39 +00:00
|
|
|
|
2013-11-07 02:46:28 +00:00
|
|
|
void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
|
|
|
|
{
|
|
|
|
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
|
|
|
|
}
|
|
|
|
|
2014-02-07 22:47:16 +00:00
|
|
|
void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family,
|
|
|
|
std::vector<std::vector<FileMetaData>>* metadata);
|
2014-02-12 18:43:27 +00:00
|
|
|
|
2014-04-29 17:27:58 +00:00
|
|
|
Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
|
|
|
|
SequenceNumber* sequence);
|
|
|
|
|
|
|
|
Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
|
2014-09-05 22:20:05 +00:00
|
|
|
|
|
|
|
void TEST_LockMutex();
|
|
|
|
|
|
|
|
void TEST_UnlockMutex();
|
|
|
|
|
|
|
|
// REQUIRES: mutex locked
|
|
|
|
void* TEST_BeginWrite();
|
|
|
|
|
|
|
|
// REQUIRES: mutex locked
|
|
|
|
// pass the pointer that you got from TEST_BeginWrite()
|
|
|
|
void TEST_EndWrite(void* w);
|
2014-04-15 20:39:26 +00:00
|
|
|
#endif  // ROCKSDB_LITE
|
|
|
|
|
2014-07-02 16:54:20 +00:00
|
|
|
// Structure to store information for candidate files to delete.
|
|
|
|
struct CandidateFileInfo {
  // Name of the candidate file.
  std::string file_name;
  // Path id that was supplied together with the file name.
  uint32_t path_id;

  // `name` is received by value and moved into `file_name`, so a caller
  // that passes a temporary (or std::move()s its string) pays for no
  // additional copy.
  CandidateFileInfo(std::string name, uint32_t path)
      : file_name(std::move(name)), path_id(path) {}

  // Two entries compare equal only when both the file name and the path id
  // match.
  bool operator==(const CandidateFileInfo& other) const {
    return file_name == other.file_name && path_id == other.path_id;
  }
};
|
|
|
|
|
2013-12-20 17:57:58 +00:00
|
|
|
// needed for CleanupIteratorState
|
2013-11-15 02:03:57 +00:00
|
|
|
struct DeletionState {
|
|
|
|
inline bool HaveSomethingToDelete() const {
|
2013-12-31 02:33:57 +00:00
|
|
|
return candidate_files.size() ||
|
2013-11-15 02:03:57 +00:00
|
|
|
sst_delete_files.size() ||
|
|
|
|
log_delete_files.size();
|
|
|
|
}
|
2013-11-27 22:56:20 +00:00
|
|
|
|
2013-11-15 02:03:57 +00:00
|
|
|
// a list of all files that we'll consider deleting
|
|
|
|
// (every once in a while this is filled up with all files
|
|
|
|
// in the DB directory)
|
2014-07-02 16:54:20 +00:00
|
|
|
std::vector<CandidateFileInfo> candidate_files;
|
2013-11-15 02:03:57 +00:00
|
|
|
|
|
|
|
// the list of all live sst files that cannot be deleted
|
2014-07-02 16:54:20 +00:00
|
|
|
std::vector<FileDescriptor> sst_live;
|
2013-11-15 02:03:57 +00:00
|
|
|
|
|
|
|
// a list of sst files that we need to delete
|
|
|
|
std::vector<FileMetaData*> sst_delete_files;
|
|
|
|
|
|
|
|
// a list of log files that we need to delete
|
|
|
|
std::vector<uint64_t> log_delete_files;
|
|
|
|
|
2013-11-27 22:56:20 +00:00
|
|
|
// a list of memtables to be free
|
2014-01-02 19:26:57 +00:00
|
|
|
autovector<MemTable*> memtables_to_free;
|
2013-11-27 22:56:20 +00:00
|
|
|
|
2014-02-27 19:38:55 +00:00
|
|
|
autovector<SuperVersion*> superversions_to_free;
|
2013-12-20 17:57:58 +00:00
|
|
|
|
2014-01-30 23:23:13 +00:00
|
|
|
SuperVersion* new_superversion; // if nullptr no new superversion
|
2013-12-20 17:57:58 +00:00
|
|
|
|
2013-11-15 02:03:57 +00:00
|
|
|
// the current manifest_file_number, log_number and prev_log_number
|
|
|
|
// that corresponds to the set of files in 'live'.
|
2014-03-18 04:50:15 +00:00
|
|
|
uint64_t manifest_file_number, pending_manifest_file_number, log_number,
|
|
|
|
prev_log_number;
|
2013-11-15 02:03:57 +00:00
|
|
|
|
2014-01-02 19:26:57 +00:00
|
|
|
explicit DeletionState(bool create_superversion = false) {
|
2013-11-15 02:03:57 +00:00
|
|
|
manifest_file_number = 0;
|
2014-03-18 04:50:15 +00:00
|
|
|
pending_manifest_file_number = 0;
|
2013-11-15 02:03:57 +00:00
|
|
|
log_number = 0;
|
|
|
|
prev_log_number = 0;
|
2014-01-30 23:23:13 +00:00
|
|
|
new_superversion = create_superversion ? new SuperVersion() : nullptr;
|
2013-12-20 17:57:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
~DeletionState() {
|
|
|
|
// free pending memtables
|
|
|
|
for (auto m : memtables_to_free) {
|
|
|
|
delete m;
|
|
|
|
}
|
2014-02-27 19:38:55 +00:00
|
|
|
// free superversions
|
|
|
|
for (auto s : superversions_to_free) {
|
|
|
|
delete s;
|
|
|
|
}
|
2013-12-20 17:57:58 +00:00
|
|
|
// if new_superversion was not used, it will be non-nullptr and needs
|
|
|
|
// to be freed here
|
|
|
|
delete new_superversion;
|
2013-11-15 02:03:57 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Returns the list of live files in 'live' and the list
|
2013-12-31 02:33:57 +00:00
|
|
|
// of all files in the filesystem in 'candidate_files'.
|
2013-11-15 02:03:57 +00:00
|
|
|
// If force == false and the last call was less than
|
2014-09-05 18:48:17 +00:00
|
|
|
// db_options_.delete_obsolete_files_period_micros microseconds ago,
|
2013-11-15 02:03:57 +00:00
|
|
|
// it will not fill up the deletion_state
|
|
|
|
void FindObsoleteFiles(DeletionState& deletion_state,
|
|
|
|
bool force,
|
|
|
|
bool no_full_scan = false);
|
|
|
|
|
|
|
|
// Diffs the files listed in filenames and those that do not
|
|
|
|
// belong to live files are possibly removed. Also, removes all the
|
|
|
|
// files in sst_delete_files and log_delete_files.
|
|
|
|
// It is not necessary to hold the mutex when invoking this method.
|
|
|
|
void PurgeObsoleteFiles(DeletionState& deletion_state);
|
|
|
|
|
2014-02-11 01:04:44 +00:00
|
|
|
ColumnFamilyHandle* DefaultColumnFamily() const;
|
|
|
|
|
2012-12-18 21:05:39 +00:00
|
|
|
protected:
|
2012-11-06 03:18:49 +00:00
|
|
|
Env* const env_;
|
|
|
|
const std::string dbname_;
|
2013-01-20 10:07:13 +00:00
|
|
|
unique_ptr<VersionSet> versions_;
|
2014-09-05 18:48:17 +00:00
|
|
|
const DBOptions db_options_;
|
2014-07-28 19:05:36 +00:00
|
|
|
Statistics* stats_;
|
2012-11-06 03:18:49 +00:00
|
|
|
|
2014-02-03 23:28:03 +00:00
|
|
|
Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
|
2014-09-05 00:40:41 +00:00
|
|
|
SuperVersion* super_version, Arena* arena);
|
2013-02-15 23:28:24 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
private:
|
|
|
|
friend class DB;
|
2014-03-27 18:59:37 +00:00
|
|
|
friend class InternalStats;
|
2014-04-15 20:39:26 +00:00
|
|
|
#ifndef ROCKSDB_LITE
|
2014-01-17 05:56:26 +00:00
|
|
|
friend class TailingIterator;
|
2014-05-30 21:31:55 +00:00
|
|
|
friend class ForwardIterator;
|
2014-04-15 20:39:26 +00:00
|
|
|
#endif
|
2014-03-09 05:12:13 +00:00
|
|
|
friend struct SuperVersion;
|
2012-03-09 00:23:21 +00:00
|
|
|
struct CompactionState;
|
2014-09-05 22:20:05 +00:00
|
|
|
|
2014-08-12 05:10:32 +00:00
|
|
|
struct WriteContext;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
Status NewDB();
|
|
|
|
|
|
|
|
// Recover the descriptor from persistent storage. May do a significant
|
|
|
|
// amount of work to recover recently logged updates. Any changes to
|
|
|
|
// be made to the descriptor are added to *edit.
|
2014-01-22 18:59:07 +00:00
|
|
|
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
|
|
bool read_only = false, bool error_if_log_file_exist = false);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
void MaybeIgnoreError(Status* s) const;
|
|
|
|
|
2012-11-26 21:56:45 +00:00
|
|
|
const Status CreateArchivalDirectory();
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Delete any unneeded files and stale in-memory entries.
|
|
|
|
void DeleteObsoleteFiles();
|
|
|
|
|
2013-10-14 22:12:15 +00:00
|
|
|
// Flush the in-memory write buffer to storage. Switches to a new
|
2011-03-18 22:37:00 +00:00
|
|
|
// log-file/memtable and writes a new descriptor iff successful.
|
2014-01-31 01:48:42 +00:00
|
|
|
Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress,
|
2014-03-10 05:01:13 +00:00
|
|
|
DeletionState& deletion_state,
|
|
|
|
LogBuffer* log_buffer);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Refactor Recover() code
Summary:
This diff does two things:
* Rethinks how we call Recover() with read_only option. Before, we call it with pointer to memtable where we'd like to apply those changes to. This memtable is set in db_impl_readonly.cc and it's actually DBImpl::mem_. Why don't we just apply updates to mem_ right away? It seems more intuitive.
* Changes when we apply updates to manifest. Before, the process is to recover all the logs, flush it to sst files and then do one giant commit that atomically adds all recovered sst files and sets the next log number. This works good enough, but causes some small troubles for my column family approach, since I can't have one VersionEdit apply to more than single column family[1]. The change here is to commit the files recovered from logs right away. Here is the state of the world before the change:
1. Recover log 5, add new sst files to edit
2. Recover log 7, add new sst files to edit
3. Recover log 8, add new sst files to edit
4. Commit all added sst files to manifest and mark log files 5, 7 and 8 as recovered (via SetLogNumber(9) function)
After the change, we'll do:
1. Recover log 5, commit the new sst files and set log 5 as recovered
2. Recover log 7, commit the new sst files and set log 7 as recovered
3. Recover log 8, commit the new sst files and set log 8 as recovered
The added (small) benefit is that if we fail after (2), the new recovery will only have to recover log 8. In previous case, we'll have to restart the recovery from the beginning. The bigger benefit will be to enable easier integration of multiple column families in Recovery code path.
[1] I'm happy to discuss this decision, but I believe this is the cleanest way to go. It also makes backward compatibility much easier. We don't have a requirement of adding multiple column families atomically.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15237
2014-01-22 18:45:26 +00:00
|
|
|
Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
|
|
|
|
bool read_only);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2012-10-19 21:00:53 +00:00
|
|
|
// The following two methods are used to flush a memtable to
|
|
|
|
// storage. The first one is used at database recovery time (when the
|
|
|
|
// database is opened) and is heavyweight because it holds the mutex
|
|
|
|
// for the entire period. The second method WriteLevel0Table supports
|
|
|
|
// concurrent flush memtables to storage.
|
2014-02-04 01:48:07 +00:00
|
|
|
Status WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
|
|
|
|
VersionEdit* edit);
|
2014-02-06 23:42:16 +00:00
|
|
|
Status WriteLevel0Table(ColumnFamilyData* cfd, autovector<MemTable*>& mems,
|
2014-03-11 00:25:10 +00:00
|
|
|
VersionEdit* edit, uint64_t* filenumber,
|
2014-03-10 05:01:13 +00:00
|
|
|
LogBuffer* log_buffer);
|
2014-09-05 22:20:05 +00:00
|
|
|
// Information kept for every waiting writer
|
|
|
|
struct Writer {
|
|
|
|
Status status;
|
|
|
|
WriteBatch* batch;
|
|
|
|
bool sync;
|
|
|
|
bool disableWAL;
|
|
|
|
bool in_batch_group;
|
|
|
|
bool done;
|
|
|
|
uint64_t timeout_hint_us;
|
|
|
|
port::CondVar cv;
|
|
|
|
|
|
|
|
explicit Writer(port::Mutex* mu) : cv(mu) {}
|
|
|
|
};
|
|
|
|
|
2014-08-12 05:10:32 +00:00
|
|
|
// Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
|
|
|
|
// thread should grab the mutex_ and be the first on writers queue.
|
|
|
|
// BeginWrite is used for it.
|
|
|
|
// Be aware! Writer's job can be done by other thread (see DBImpl::Write
|
|
|
|
// for examples), so check it via w.done before applying changes.
|
|
|
|
//
|
|
|
|
// Writer* w: writer to be placed in the queue
|
|
|
|
// uint64_t expiration_time: maximum time to be in the queue
|
|
|
|
// See also: EndWrite
|
|
|
|
Status BeginWrite(Writer* w, uint64_t expiration_time);
|
|
|
|
|
|
|
|
// After doing write job, we need to remove already used writers from
|
|
|
|
// writers_ queue and notify head of the queue about it.
|
|
|
|
// EndWrite is used for this.
|
|
|
|
//
|
|
|
|
// Writer* w: Writer, that was added by BeginWrite function
|
|
|
|
// Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
|
|
|
|
// does)
|
|
|
|
// we should pass last_writer as a parameter to
|
|
|
|
// EndWrite
|
|
|
|
// (if you don't touch other writers, just pass w)
|
|
|
|
// Status status: Status of write operation
|
|
|
|
// See also: BeginWrite
|
|
|
|
void EndWrite(Writer* w, Writer* last_writer, Status status);
|
|
|
|
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 18:20:25 +00:00
|
|
|
void DelayWrite(uint64_t expiration_time);
|
|
|
|
|
|
|
|
Status MakeRoomForWrite(ColumnFamilyData* cfd, WriteContext* context,
|
2014-07-03 22:47:02 +00:00
|
|
|
uint64_t expiration_time);
|
2014-04-07 18:29:48 +00:00
|
|
|
|
2014-08-12 05:10:32 +00:00
|
|
|
Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
|
|
|
|
WriteContext* context);
|
|
|
|
|
2014-01-14 22:49:31 +00:00
|
|
|
void BuildBatchGroup(Writer** last_writer,
|
|
|
|
autovector<WriteBatch*>* write_batch_group);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2012-07-06 18:42:09 +00:00
|
|
|
// Force current memtable contents to be flushed.
|
2014-01-31 01:48:42 +00:00
|
|
|
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
|
2012-07-06 18:42:09 +00:00
|
|
|
|
2013-10-14 22:12:15 +00:00
|
|
|
// Wait for memtable flushed
|
2014-01-31 01:48:42 +00:00
|
|
|
Status WaitForFlushMemTable(ColumnFamilyData* cfd);
|
2012-07-06 18:42:09 +00:00
|
|
|
|
2014-07-03 23:28:03 +00:00
|
|
|
void RecordFlushIOStats();
|
|
|
|
void RecordCompactionIOStats();
|
|
|
|
|
2013-10-14 22:12:15 +00:00
|
|
|
void MaybeScheduleFlushOrCompaction();
|
2013-09-13 21:38:37 +00:00
|
|
|
static void BGWorkCompaction(void* db);
|
|
|
|
static void BGWorkFlush(void* db);
|
|
|
|
void BackgroundCallCompaction();
|
|
|
|
void BackgroundCallFlush();
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
2014-03-04 22:32:55 +00:00
|
|
|
Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state,
|
|
|
|
LogBuffer* log_buffer);
|
2014-03-10 05:01:13 +00:00
|
|
|
Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state,
|
|
|
|
LogBuffer* log_buffer);
|
2013-09-02 06:23:40 +00:00
|
|
|
void CleanupCompaction(CompactionState* compact, Status status);
|
2013-11-08 23:23:46 +00:00
|
|
|
Status DoCompactionWork(CompactionState* compact,
|
2014-03-10 05:01:13 +00:00
|
|
|
DeletionState& deletion_state,
|
|
|
|
LogBuffer* log_buffer);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2014-04-07 22:03:15 +00:00
|
|
|
// This function is called as part of compaction. It enables Flush process to
|
|
|
|
// preempt compaction, since it's higher priority
|
|
|
|
// Returns: micros spent executing
|
|
|
|
uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
|
|
|
|
DeletionState& deletion_state,
|
|
|
|
LogBuffer* log_buffer);
|
|
|
|
|
2014-01-10 01:52:11 +00:00
|
|
|
// Call compaction filter if is_compaction_v2 is not true. Then iterate
|
|
|
|
// through input and compact the kv-pairs
|
|
|
|
Status ProcessKeyValueCompaction(
|
2014-06-19 16:31:14 +00:00
|
|
|
bool is_snapshot_supported,
|
2014-01-10 01:52:11 +00:00
|
|
|
SequenceNumber visible_at_tip,
|
|
|
|
SequenceNumber earliest_snapshot,
|
|
|
|
SequenceNumber latest_snapshot,
|
|
|
|
DeletionState& deletion_state,
|
|
|
|
bool bottommost_level,
|
|
|
|
int64_t& imm_micros,
|
|
|
|
Iterator* input,
|
|
|
|
CompactionState* compact,
|
|
|
|
bool is_compaction_v2,
|
|
|
|
LogBuffer* log_buffer);
|
|
|
|
|
|
|
|
// Call compaction_filter_v2->Filter() on kv-pairs in compact
|
|
|
|
void CallCompactionFilterV2(CompactionState* compact,
|
|
|
|
CompactionFilterV2* compaction_filter_v2);
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
Status OpenCompactionOutputFile(CompactionState* compact);
|
|
|
|
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
|
2014-04-03 04:49:51 +00:00
|
|
|
Status InstallCompactionResults(CompactionState* compact,
|
|
|
|
LogBuffer* log_buffer);
|
2012-10-19 21:00:53 +00:00
|
|
|
void AllocateCompactionOutputFileNumbers(CompactionState* compact);
|
|
|
|
void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
|
2012-11-29 00:42:36 +00:00
|
|
|
|
2014-04-15 20:39:26 +00:00
|
|
|
#ifdef ROCKSDB_LITE
|
|
|
|
void PurgeObsoleteWALFiles() {
  // Intentionally empty: this function is used for archiving WAL files,
  // which is not needed in ROCKSDB_LITE.
}
|
|
|
|
#else
|
2012-11-26 21:56:45 +00:00
|
|
|
void PurgeObsoleteWALFiles();
|
2012-11-30 01:28:37 +00:00
|
|
|
|
2014-03-24 04:49:14 +00:00
|
|
|
Status GetSortedWalsOfType(const std::string& path,
|
|
|
|
VectorLogPtr& log_files,
|
|
|
|
WalFileType type);
|
2012-11-30 01:28:37 +00:00
|
|
|
|
2013-08-06 19:54:37 +00:00
|
|
|
// Requires: all_logs should be sorted with earliest log file first
|
|
|
|
// Retains all log files in all_logs which contain updates with seq no.
|
|
|
|
// Greater Than or Equal to the requested SequenceNumber.
|
|
|
|
Status RetainProbableWalFiles(VectorLogPtr& all_logs,
|
|
|
|
const SequenceNumber target);
|
2012-11-30 01:28:37 +00:00
|
|
|
|
2013-08-06 19:54:37 +00:00
|
|
|
Status ReadFirstRecord(const WalFileType type, const uint64_t number,
|
2014-04-29 17:27:58 +00:00
|
|
|
SequenceNumber* sequence);
|
2012-11-30 01:28:37 +00:00
|
|
|
|
2014-04-29 17:27:58 +00:00
|
|
|
Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
|
2014-04-15 20:39:26 +00:00
|
|
|
#endif // ROCKSDB_LITE
|
2013-05-10 22:21:04 +00:00
|
|
|
|
2013-05-28 19:35:43 +00:00
|
|
|
void PrintStatistics();
|
|
|
|
|
2013-10-05 05:32:05 +00:00
|
|
|
// dump rocksdb.stats to LOG
|
2013-05-10 22:21:04 +00:00
|
|
|
void MaybeDumpStats();
|
|
|
|
|
Add a new mem-table representation based on cuckoo hash.
Summary:
= Major Changes =
* Add a new mem-table representation, HashCuckooRep, which is based on cuckoo hash.
Cuckoo hash uses multiple hash functions. This allows each key to have multiple
possible locations in the mem-table.
- Put: When insert a key, it will try to find whether one of its possible
locations is vacant and store the key. If none of its possible
locations are available, then it will kick out a victim key and
store at that location. The kicked-out victim key will then be
stored at a vacant space of its possible locations or kick-out
another victim. In this diff, the kick-out path (known as
cuckoo-path) is found using BFS, which guarantees to be the shortest.
- Get: Simply tries all possible locations of a key --- this guarantees
worst-case constant time complexity.
- Time complexity: O(1) for Get, and average O(1) for Put if the
fullness of the mem-table is below 80%.
- Default using two hash functions, the number of hash functions used
by the cuckoo-hash may dynamically increase if it fails to find a
short-enough kick-out path.
- Currently, HashCuckooRep does not support iteration and snapshots,
as our current main purpose of this is to optimize point access.
= Minor Changes =
* Add IsSnapshotSupported() to DB to indicate whether the current DB
supports snapshots. If it returns false, then DB::GetSnapshot() will
always return nullptr.
Test Plan:
Run existing tests. Will develop a test specifically for cuckoo hash in
the next diff.
Reviewers: sdong, haobo
Reviewed By: sdong
CC: leveldb, dhruba, igor
Differential Revision: https://reviews.facebook.net/D16155
2014-04-30 00:13:46 +00:00
|
|
|
// Return true if the current db supports snapshot. If the current
|
|
|
|
// DB does not support snapshot, then calling GetSnapshot() will always
|
|
|
|
// return nullptr.
|
|
|
|
//
|
|
|
|
// @see GetSnapshot()
|
|
|
|
virtual bool IsSnapshotSupported() const;
|
|
|
|
|
2013-06-30 06:21:36 +00:00
|
|
|
// Return the minimum empty level that could hold the total data in the
|
|
|
|
// input level. Return the input level, if such level could not be found.
|
2014-02-01 00:45:20 +00:00
|
|
|
int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level);
|
2013-06-30 06:21:36 +00:00
|
|
|
|
2013-09-04 20:13:08 +00:00
|
|
|
// Move the files in the input level to the target level.
|
|
|
|
// If target_level < 0, automatically calculate the minimum level that could
|
|
|
|
// hold the data set.
|
2014-02-01 00:45:20 +00:00
|
|
|
Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
|
2013-06-30 06:21:36 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// table_cache_ provides its own synchronization
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are not used as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underlying cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
2014-02-05 17:07:55 +00:00
|
|
|
std::shared_ptr<Cache> table_cache_;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2013-02-15 23:28:24 +00:00
|
|
|
// Lock over the persistent DB state. Non-nullptr iff successfully acquired.
|
2011-03-18 22:37:00 +00:00
|
|
|
FileLock* db_lock_;
|
|
|
|
|
|
|
|
// State below is protected by mutex_
|
|
|
|
port::Mutex mutex_;
|
|
|
|
port::AtomicPointer shutting_down_;
|
2014-06-03 00:23:55 +00:00
|
|
|
// This condition variable is signaled on these conditions:
|
|
|
|
// * whenever bg_compaction_scheduled_ goes down to 0
|
|
|
|
// * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't
|
|
|
|
// made any progress
|
|
|
|
// * whenever a compaction made any progress
|
|
|
|
// * whenever bg_flush_scheduled_ value decreases (i.e. whenever a flush is
|
|
|
|
// done, even if it didn't make any progress)
|
|
|
|
// * whenever there is an error in background flush or compaction
|
|
|
|
port::CondVar bg_cv_;
|
2011-06-22 02:36:45 +00:00
|
|
|
uint64_t logfile_number_;
|
2013-01-20 10:07:13 +00:00
|
|
|
unique_ptr<log::Writer> log_;
|
2014-04-15 16:57:25 +00:00
|
|
|
bool log_empty_;
|
2014-02-11 01:04:44 +00:00
|
|
|
ColumnFamilyHandleImpl* default_cf_handle_;
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 19:57:29 +00:00
|
|
|
InternalStats* default_cf_internal_stats_;
|
2014-01-28 19:05:04 +00:00
|
|
|
unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
|
2014-04-30 18:33:40 +00:00
|
|
|
// Bookkeeping record for one log file: its file number, an accumulated size,
// and a flag marking whether it is currently getting flushed.
struct LogFileNumberSize {
  // A new record starts with size 0 and getting_flushed == false.
  explicit LogFileNumberSize(uint64_t _number) : number(_number) {}

  // Adds new_size to the accumulated size of this log file.
  void AddSize(uint64_t new_size) { size = size + new_size; }

  uint64_t number;
  uint64_t size = 0;
  bool getting_flushed = false;
};
|
|
|
|
std::deque<LogFileNumberSize> alive_log_files_;
|
|
|
|
uint64_t total_log_size_;
|
|
|
|
// only used for dynamically adjusting max_total_wal_size. it is a sum of
|
|
|
|
// [write_buffer_size * max_write_buffer_number] over all column families
|
|
|
|
uint64_t max_total_in_memory_state_;
|
2014-06-07 00:26:23 +00:00
|
|
|
// If true, we have only one (default) column family. We use this to optimize
|
|
|
|
// some code-paths
|
|
|
|
bool single_column_family_mode_;
|
2013-12-20 17:57:58 +00:00
|
|
|
|
2014-01-27 19:02:21 +00:00
|
|
|
std::unique_ptr<Directory> db_directory_;
|
|
|
|
|
2012-03-09 00:23:21 +00:00
|
|
|
// Queue of writers.
|
|
|
|
std::deque<Writer*> writers_;
|
2013-03-28 22:19:28 +00:00
|
|
|
WriteBatch tmp_batch_;
|
2012-03-09 00:23:21 +00:00
|
|
|
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define a DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 18:20:25 +00:00
|
|
|
WriteController write_controller_;
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
SnapshotList snapshots_;
|
|
|
|
|
2014-04-29 17:27:58 +00:00
|
|
|
// cache for ReadFirstRecord() calls
|
|
|
|
std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
|
|
|
|
port::Mutex read_first_record_cache_mutex_;
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Set of table files to protect from deletion because they are
|
|
|
|
// part of ongoing compactions.
|
2014-07-02 16:54:20 +00:00
|
|
|
// map from pending file number ID to their path IDs.
|
|
|
|
FileNumToPathIdMap pending_outputs_;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's destructor can go ahead and destroy all the state inside DB, including the info_log object held in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
2014-03-11 17:36:30 +00:00
|
|
|
// At least one compaction or flush job is pending but not yet scheduled
|
|
|
|
// because of the max background thread limit.
|
|
|
|
bool bg_schedule_needed_;
|
|
|
|
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-21 23:10:39 +00:00
|
|
|
// count how many background compactions are running or have been scheduled
|
2012-10-19 21:00:53 +00:00
|
|
|
int bg_compaction_scheduled_;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-21 23:10:39 +00:00
|
|
|
// If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
|
|
|
|
// compactions (if manual_compaction_ is not null). This mechanism enables
|
|
|
|
// manual compactions to wait until all other compactions are finished.
|
|
|
|
int bg_manual_only_;
|
|
|
|
|
2013-09-13 21:38:37 +00:00
|
|
|
// number of background memtable flush jobs, submitted to the HIGH pool
|
|
|
|
int bg_flush_scheduled_;
|
|
|
|
|
2011-06-07 14:40:26 +00:00
|
|
|
// Information for a manual compaction
|
|
|
|
struct ManualCompaction {
|
2014-02-01 00:45:20 +00:00
|
|
|
ColumnFamilyData* cfd;
|
2014-01-15 00:19:09 +00:00
|
|
|
int input_level;
|
|
|
|
int output_level;
|
2014-07-17 00:39:18 +00:00
|
|
|
uint32_t output_path_id;
|
2011-10-05 23:30:28 +00:00
|
|
|
bool done;
|
2014-01-22 20:46:24 +00:00
|
|
|
Status status;
|
2012-10-19 21:00:53 +00:00
|
|
|
bool in_progress; // compaction request being processed?
|
2013-02-15 23:28:24 +00:00
|
|
|
const InternalKey* begin; // nullptr means beginning of key range
|
|
|
|
const InternalKey* end; // nullptr means end of key range
|
2011-10-05 23:30:28 +00:00
|
|
|
InternalKey tmp_storage; // Used to keep track of compaction progress
|
2011-06-07 14:40:26 +00:00
|
|
|
};
|
|
|
|
ManualCompaction* manual_compaction_;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Have we encountered a background error in paranoid mode?
|
|
|
|
Status bg_error_;
|
|
|
|
|
2012-09-15 00:11:35 +00:00
|
|
|
// shall we disable deletion of obsolete files
|
2014-01-02 11:33:42 +00:00
|
|
|
// if 0 the deletion is enabled.
|
|
|
|
// if non-zero, files will not be getting deleted
|
|
|
|
// This enables two different threads to call
|
|
|
|
// EnableFileDeletions() and DisableFileDeletions()
|
|
|
|
// without any synchronization
|
|
|
|
int disable_delete_obsolete_files_;
|
2012-09-15 00:11:35 +00:00
|
|
|
|
2012-10-16 15:53:46 +00:00
|
|
|
// last time when DeleteObsoleteFiles was invoked
|
|
|
|
uint64_t delete_obsolete_files_last_run_;
|
|
|
|
|
2013-05-06 18:41:01 +00:00
|
|
|
// last time when PurgeObsoleteWALFiles ran.
|
|
|
|
uint64_t purge_wal_files_last_run_;
|
|
|
|
|
2013-05-10 22:21:04 +00:00
|
|
|
// last time stats were dumped to LOG
|
2013-05-24 19:52:45 +00:00
|
|
|
std::atomic<uint64_t> last_stats_dump_time_microsec_;
|
2013-05-10 22:21:04 +00:00
|
|
|
|
2013-11-07 02:46:28 +00:00
|
|
|
// obsolete files will be deleted every this many seconds if ttl deletion is
|
|
|
|
// enabled and archive size_limit is disabled.
|
|
|
|
uint64_t default_interval_to_delete_obsolete_WAL_;
|
|
|
|
|
2012-11-06 19:21:57 +00:00
|
|
|
bool flush_on_destroy_; // Used when disableWAL is true.
|
|
|
|
|
2012-08-17 23:06:05 +00:00
|
|
|
static const int KEEP_LOG_FILE_NUM = 1000;
|
2014-07-03 22:47:02 +00:00
|
|
|
static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
|
2012-09-06 00:44:13 +00:00
|
|
|
std::string db_absolute_path_;
|
2012-08-17 23:06:05 +00:00
|
|
|
|
2013-03-15 00:00:04 +00:00
|
|
|
// The options to access storage files
|
2014-09-04 23:18:36 +00:00
|
|
|
const EnvOptions env_options_;
|
2013-03-15 00:00:04 +00:00
|
|
|
|
2013-06-30 06:21:36 +00:00
|
|
|
// A value of true temporarily disables scheduling of background work
|
|
|
|
bool bg_work_gate_closed_;
|
|
|
|
|
|
|
|
// Guard against multiple concurrent refitting
|
|
|
|
bool refitting_level_;
|
|
|
|
|
2014-02-27 19:38:55 +00:00
|
|
|
// Indicate DB was opened successfully
|
|
|
|
bool opened_successfully_;
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// No copying allowed
|
|
|
|
DBImpl(const DBImpl&);
|
|
|
|
void operator=(const DBImpl&);
|
|
|
|
|
2013-03-21 22:59:47 +00:00
|
|
|
// Return the earliest snapshot where seqno is visible.
|
|
|
|
// Store the snapshot right before that, if any, in prev_snapshot
|
|
|
|
inline SequenceNumber findEarliestVisibleSnapshot(
|
|
|
|
SequenceNumber in,
|
|
|
|
std::vector<SequenceNumber>& snapshots,
|
|
|
|
SequenceNumber* prev_snapshot);
|
2013-07-06 01:49:18 +00:00
|
|
|
|
2013-12-20 17:57:58 +00:00
|
|
|
// Background threads call this function, which is just a wrapper around
|
2014-01-29 21:28:50 +00:00
|
|
|
// the cfd->InstallSuperVersion() function. Background threads carry
|
2013-12-20 17:57:58 +00:00
|
|
|
// deletion_state which can have new_superversion already allocated.
|
2014-01-24 22:30:28 +00:00
|
|
|
void InstallSuperVersion(ColumnFamilyData* cfd,
|
|
|
|
DeletionState& deletion_state);
|
2013-12-20 17:57:58 +00:00
|
|
|
|
2014-08-05 18:27:34 +00:00
|
|
|
// Find Super version and reference it. Based on options, it might return
|
|
|
|
// the thread local cached one.
|
|
|
|
inline SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
|
|
|
|
|
|
|
|
// Un-reference the super version and return it to thread local cache if
|
|
|
|
// needed. If it is the last reference of the super version, clean it up
|
|
|
|
// after un-referencing it.
|
|
|
|
inline void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
|
|
|
|
SuperVersion* sv);
|
|
|
|
|
2014-04-15 20:39:26 +00:00
|
|
|
#ifndef ROCKSDB_LITE
|
2014-02-15 01:02:10 +00:00
|
|
|
using DB::GetPropertiesOfAllTables;
|
|
|
|
virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
|
|
|
|
TablePropertiesCollection* props)
|
2014-02-14 00:28:21 +00:00
|
|
|
override;
|
2014-04-15 20:39:26 +00:00
|
|
|
#endif // ROCKSDB_LITE
|
2014-02-14 00:28:21 +00:00
|
|
|
|
2013-07-26 19:57:01 +00:00
|
|
|
// Function that Get and KeyMayExist call with no_io true or false
|
|
|
|
// Note: 'value_found' from KeyMayExist propagates here
|
2014-02-11 01:04:44 +00:00
|
|
|
Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, std::string* value,
|
|
|
|
bool* value_found = nullptr);
|
2014-08-05 18:27:34 +00:00
|
|
|
|
|
|
|
bool GetIntPropertyInternal(ColumnFamilyHandle* column_family,
|
|
|
|
DBPropertyType property_type,
|
|
|
|
bool need_out_of_mutex, uint64_t* value);
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// Sanitize db options. The caller should delete result.info_log if
|
|
|
|
// it is not equal to src.info_log.
|
|
|
|
extern Options SanitizeOptions(const std::string& db,
|
|
|
|
const InternalKeyComparator* icmp,
|
|
|
|
const Options& src);
|
2014-02-05 00:31:18 +00:00
|
|
|
extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
|
2013-10-30 17:52:33 +00:00
|
|
|
|
2014-08-07 17:05:04 +00:00
|
|
|
// Clamp a user-supplied option value into the range [minvalue, maxvalue].
// *ptr is compared after conversion to V. The upper bound is applied first
// and the lower bound second, so if minvalue > maxvalue the result is
// minvalue.
template <class T, class V>
static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
  if (static_cast<V>(*ptr) > maxvalue) {
    *ptr = maxvalue;
  }
  if (static_cast<V>(*ptr) < minvalue) {
    *ptr = minvalue;
  }
}
|
|
|
|
|
2014-08-13 20:45:13 +00:00
|
|
|
// Dump db file summary, implemented in util/
|
|
|
|
extern void DumpDBFileSummary(const DBOptions& options,
|
|
|
|
const std::string& dbname);
|
|
|
|
|
2013-10-04 04:49:15 +00:00
|
|
|
} // namespace rocksdb
|