2013-10-16 21:59:46 +00:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
2013-10-05 05:32:05 +00:00
|
|
|
#pragma once
|
2013-05-24 19:52:45 +00:00
|
|
|
#include <atomic>
|
2012-03-09 00:23:21 +00:00
|
|
|
#include <deque>
|
2011-03-18 22:37:00 +00:00
|
|
|
#include <set>
|
2014-01-17 05:56:26 +00:00
|
|
|
#include <utility>
|
2013-06-05 18:22:38 +00:00
|
|
|
#include <vector>
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "db/log_writer.h"
|
|
|
|
#include "db/snapshot.h"
|
2014-01-24 22:30:28 +00:00
|
|
|
#include "db/column_family.h"
|
2013-11-12 19:53:26 +00:00
|
|
|
#include "db/version_edit.h"
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/memtablerep.h"
|
|
|
|
#include "rocksdb/transaction_log.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "port/port.h"
|
2012-08-14 22:20:36 +00:00
|
|
|
#include "util/stats_logger.h"
|
2012-10-19 21:00:53 +00:00
|
|
|
#include "memtablelist.h"
|
2014-01-14 22:49:31 +00:00
|
|
|
#include "util/autovector.h"
|
2014-01-30 04:40:41 +00:00
|
|
|
#include "db/internal_stats.h"
|
2012-08-14 22:20:36 +00:00
|
|
|
|
2013-10-04 04:49:15 +00:00
|
|
|
namespace rocksdb {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
class MemTable;
|
|
|
|
class TableCache;
|
|
|
|
class Version;
|
|
|
|
class VersionEdit;
|
|
|
|
class VersionSet;
|
|
|
|
|
|
|
|
class DBImpl : public DB {
|
|
|
|
public:
|
|
|
|
DBImpl(const Options& options, const std::string& dbname);
|
|
|
|
virtual ~DBImpl();
|
|
|
|
|
|
|
|
// Implementations of the DB interface
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Put;
|
|
|
|
virtual Status Put(const WriteOptions& options,
|
|
|
|
const ColumnFamilyHandle& column_family, const Slice& key,
|
|
|
|
const Slice& value);
|
|
|
|
using DB::Merge;
|
|
|
|
virtual Status Merge(const WriteOptions& options,
|
|
|
|
const ColumnFamilyHandle& column_family,
|
|
|
|
const Slice& key, const Slice& value);
|
|
|
|
using DB::Delete;
|
|
|
|
virtual Status Delete(const WriteOptions& options,
|
|
|
|
const ColumnFamilyHandle& column_family,
|
|
|
|
const Slice& key);
|
|
|
|
using DB::Write;
|
2011-03-18 22:37:00 +00:00
|
|
|
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::Get;
|
2011-03-18 22:37:00 +00:00
|
|
|
virtual Status Get(const ReadOptions& options,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
const ColumnFamilyHandle& column_family, const Slice& key,
|
2011-03-18 22:37:00 +00:00
|
|
|
std::string* value);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::MultiGet;
|
|
|
|
virtual std::vector<Status> MultiGet(
|
|
|
|
const ReadOptions& options,
|
|
|
|
const std::vector<ColumnFamilyHandle>& column_family,
|
|
|
|
const std::vector<Slice>& keys, std::vector<std::string>* values);
|
2013-07-06 01:49:18 +00:00
|
|
|
|
2014-01-02 17:08:12 +00:00
|
|
|
virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
|
2014-01-06 21:31:06 +00:00
|
|
|
const std::string& column_family,
|
2014-01-02 17:08:12 +00:00
|
|
|
ColumnFamilyHandle* handle);
|
|
|
|
virtual Status DropColumnFamily(const ColumnFamilyHandle& column_family);
|
|
|
|
|
2013-07-26 19:57:01 +00:00
|
|
|
// Returns false if key doesn't exist in the database and true if it may.
|
|
|
|
// If value_found is not passed in as null, then return the value if found in
|
|
|
|
// memory. On return, if value was found, then value_found will be set to
|
|
|
|
// true, otherwise false.
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::KeyMayExist;
|
2013-07-26 19:57:01 +00:00
|
|
|
virtual bool KeyMayExist(const ReadOptions& options,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
const ColumnFamilyHandle& column_family,
|
|
|
|
const Slice& key, std::string* value,
|
2013-07-26 19:57:01 +00:00
|
|
|
bool* value_found = nullptr);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::NewIterator;
|
|
|
|
virtual Iterator* NewIterator(const ReadOptions& options,
|
|
|
|
const ColumnFamilyHandle& column_family);
|
|
|
|
virtual Status NewIterators(
|
|
|
|
const ReadOptions& options,
|
|
|
|
const std::vector<ColumnFamilyHandle>& column_family,
|
|
|
|
std::vector<Iterator*>* iterators);
|
2011-03-18 22:37:00 +00:00
|
|
|
virtual const Snapshot* GetSnapshot();
|
|
|
|
virtual void ReleaseSnapshot(const Snapshot* snapshot);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::GetProperty;
|
|
|
|
virtual bool GetProperty(const ColumnFamilyHandle& column_family,
|
|
|
|
const Slice& property, std::string* value);
|
|
|
|
using DB::GetApproximateSizes;
|
|
|
|
virtual void GetApproximateSizes(const ColumnFamilyHandle& column_family,
|
|
|
|
const Range* range, int n, uint64_t* sizes);
|
|
|
|
using DB::CompactRange;
|
2014-01-24 17:27:29 +00:00
|
|
|
virtual Status CompactRange(const ColumnFamilyHandle& column_family,
|
|
|
|
const Slice* begin, const Slice* end,
|
2014-01-22 20:46:24 +00:00
|
|
|
bool reduce_level = false, int target_level = -1);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
|
|
|
|
using DB::NumberLevels;
|
|
|
|
virtual int NumberLevels(const ColumnFamilyHandle& column_family);
|
|
|
|
using DB::MaxMemCompactionLevel;
|
|
|
|
virtual int MaxMemCompactionLevel(const ColumnFamilyHandle& column_family);
|
|
|
|
using DB::Level0StopWriteTrigger;
|
|
|
|
virtual int Level0StopWriteTrigger(const ColumnFamilyHandle& column_family);
|
[RocksDB] BackupableDB
Summary:
In this diff I present you BackupableDB v1. You can easily use it to backup your DB and it will do incremental snapshots for you.
Let's first describe how you would use BackupableDB. It's inheriting StackableDB interface so you can easily construct it with your DB object -- it will add a method RollTheSnapshot() to the DB object. When you call RollTheSnapshot(), current snapshot of the DB will be stored in the backup dir. To restore, you can just call RestoreDBFromBackup() on a BackupableDB (which is a static method) and it will restore all files from the backup dir. In the next version, it will even support automatic backuping every X minutes.
There are multiple things you can configure:
1. backup_env and db_env can be different, which is awesome because then you can easily backup to HDFS or wherever you feel like.
2. sync - if true, it *guarantees* backup consistency on machine reboot
3. number of snapshots to keep - this will keep last N snapshots around if you want, for some reason, be able to restore from an earlier snapshot. All the backuping is done in incremental fashion - if we already have 00010.sst, we will not copy it again. *IMPORTANT* -- This is based on assumption that 00010.sst never changes - two files named 00010.sst from the same DB will always be exactly the same. Is this true? I always copy manifest, current and log files.
4. You can decide if you want to flush the memtables before you backup, or you're fine with backing up the log files -- either way, you get a complete and consistent view of the database at a time of backup.
5. More things you can find in BackupableDBOptions
Here is the directory structure I use:
backup_dir/CURRENT_SNAPSHOT - just 4 bytes holding the latest snapshot
0, 1, 2, ... - files containing serialized version of each snapshot - containing a list of files
files/*.sst - sst files shared between snapshots - if one snapshot references 00010.sst and another one needs to backup it from the DB, it will just reference the same file
files/ 0/, 1/, 2/, ... - snapshot directories containing private snapshot files - current, manifest and log files
All the files are ref counted and deleted immediately when they go out of scope.
Some other stuff in this diff:
1. Added GetEnv() method to the DB. Discussed with @haobo and we agreed that it seems right thing to do.
2. Fixed StackableDB interface. The way it was set up before, I was not able to implement BackupableDB.
Test Plan:
I have a unittest, but please don't look at this yet. I just hacked it up to help me with debugging. I will write a lot of good tests and update the diff.
Also, `make asan_check`
Reviewers: dhruba, haobo, emayanke
Reviewed By: dhruba
CC: leveldb, haobo
Differential Revision: https://reviews.facebook.net/D14295
2013-12-09 22:06:52 +00:00
|
|
|
virtual const std::string& GetName() const;
|
2013-11-25 20:39:23 +00:00
|
|
|
virtual Env* GetEnv() const;
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
using DB::GetOptions;
|
|
|
|
virtual const Options& GetOptions(const ColumnFamilyHandle& column_family)
|
|
|
|
const;
|
|
|
|
using DB::Flush;
|
|
|
|
virtual Status Flush(const FlushOptions& options,
|
|
|
|
const ColumnFamilyHandle& column_family);
|
2012-09-15 00:11:35 +00:00
|
|
|
virtual Status DisableFileDeletions();
|
2014-01-02 11:33:42 +00:00
|
|
|
virtual Status EnableFileDeletions(bool force);
|
2013-11-08 23:23:46 +00:00
|
|
|
// All the returned filenames start with "/"
|
2012-11-06 19:21:57 +00:00
|
|
|
virtual Status GetLiveFiles(std::vector<std::string>&,
|
2013-10-03 21:38:32 +00:00
|
|
|
uint64_t* manifest_file_size,
|
|
|
|
bool flush_memtable = true);
|
2013-08-06 19:54:37 +00:00
|
|
|
virtual Status GetSortedWalFiles(VectorLogPtr& files);
|
2013-10-25 02:09:02 +00:00
|
|
|
virtual SequenceNumber GetLatestSequenceNumber() const;
|
2012-11-30 01:28:37 +00:00
|
|
|
virtual Status GetUpdatesSince(SequenceNumber seq_number,
|
2013-01-24 18:54:26 +00:00
|
|
|
unique_ptr<TransactionLogIterator>* iter);
|
2013-08-22 21:32:53 +00:00
|
|
|
virtual Status DeleteFile(std::string name);
|
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
2013-12-03 19:14:09 +00:00
|
|
|
virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
|
2012-11-26 21:56:45 +00:00
|
|
|
|
2013-12-03 14:39:07 +00:00
|
|
|
virtual Status GetDbIdentity(std::string& identity);
|
|
|
|
|
2014-01-22 20:46:24 +00:00
|
|
|
Status RunManualCompaction(int input_level,
|
|
|
|
int output_level,
|
|
|
|
const Slice* begin,
|
|
|
|
const Slice* end);
|
2014-01-15 00:19:09 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Extra methods (for testing) that are not in the public DB interface
|
|
|
|
|
2013-06-05 18:22:38 +00:00
|
|
|
// Compact any files in the named level that overlap [*begin, *end]
|
2014-01-22 20:46:24 +00:00
|
|
|
Status TEST_CompactRange(int level,
|
|
|
|
const Slice* begin,
|
|
|
|
const Slice* end);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2013-10-14 22:12:15 +00:00
|
|
|
// Force current memtable contents to be flushed.
|
|
|
|
Status TEST_FlushMemTable();
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2012-06-23 02:30:03 +00:00
|
|
|
// Wait for memtable compaction
|
2013-10-14 22:12:15 +00:00
|
|
|
Status TEST_WaitForFlushMemTable();
|
2012-06-23 02:30:03 +00:00
|
|
|
|
|
|
|
// Wait for any compaction
|
|
|
|
Status TEST_WaitForCompact();
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Return an internal iterator over the current state of the database.
|
|
|
|
// The keys of this iterator are internal keys (see format.h).
|
|
|
|
// The returned iterator should be deleted when no longer needed.
|
|
|
|
Iterator* TEST_NewInternalIterator();
|
|
|
|
|
2011-03-22 18:32:49 +00:00
|
|
|
// Return the maximum overlapping data (in bytes) at next level for any
|
|
|
|
// file at a level >= 1.
|
2011-03-22 23:24:02 +00:00
|
|
|
int64_t TEST_MaxNextLevelOverlappingBytes();
|
2011-03-22 18:32:49 +00:00
|
|
|
|
2012-11-16 23:28:14 +00:00
|
|
|
// Simulate a db crash, no elegant closing of database.
|
|
|
|
void TEST_Destroy_DBImpl();
|
|
|
|
|
2013-01-11 01:18:50 +00:00
|
|
|
// Return the current manifest file no.
|
|
|
|
uint64_t TEST_Current_Manifest_FileNo();
|
2013-05-06 18:41:01 +00:00
|
|
|
|
|
|
|
// Triggers a background call for testing.
|
|
|
|
void TEST_PurgeObsoleteteWAL();
|
|
|
|
|
2013-10-17 20:33:39 +00:00
|
|
|
// get total level0 file size. Only for testing.
|
2014-01-16 00:18:04 +00:00
|
|
|
uint64_t TEST_GetLevel0TotalSize();
|
2013-10-17 20:33:39 +00:00
|
|
|
|
2013-11-07 02:46:28 +00:00
|
|
|
void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
|
|
|
|
{
|
|
|
|
default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
|
|
|
|
}
|
|
|
|
|
2013-12-20 17:57:58 +00:00
|
|
|
// needed for CleanupIteratorState
|
2013-11-15 02:03:57 +00:00
|
|
|
struct DeletionState {
|
|
|
|
inline bool HaveSomethingToDelete() const {
|
2013-12-20 17:57:58 +00:00
|
|
|
return all_files.size() ||
|
2013-11-15 02:03:57 +00:00
|
|
|
sst_delete_files.size() ||
|
|
|
|
log_delete_files.size();
|
|
|
|
}
|
2013-11-27 22:56:20 +00:00
|
|
|
|
2013-11-15 02:03:57 +00:00
|
|
|
// a list of all files that we'll consider deleting
|
|
|
|
// (every once in a while this is filled up with all files
|
|
|
|
// in the DB directory)
|
|
|
|
std::vector<std::string> all_files;
|
|
|
|
|
|
|
|
// the list of all live sst files that cannot be deleted
|
|
|
|
std::vector<uint64_t> sst_live;
|
|
|
|
|
|
|
|
// a list of sst files that we need to delete
|
|
|
|
std::vector<FileMetaData*> sst_delete_files;
|
|
|
|
|
|
|
|
// a list of log files that we need to delete
|
|
|
|
std::vector<uint64_t> log_delete_files;
|
|
|
|
|
2013-11-27 22:56:20 +00:00
|
|
|
// a list of memtables to be free
|
|
|
|
std::vector<MemTable *> memtables_to_free;
|
|
|
|
|
2013-12-20 17:57:58 +00:00
|
|
|
SuperVersion* superversion_to_free; // if nullptr nothing to free
|
|
|
|
|
|
|
|
SuperVersion* new_superversion; // if nullptr no new superversion
|
|
|
|
|
2013-11-15 02:03:57 +00:00
|
|
|
// the current manifest_file_number, log_number and prev_log_number
|
|
|
|
// that corresponds to the set of files in 'live'.
|
|
|
|
uint64_t manifest_file_number, log_number, prev_log_number;
|
|
|
|
|
2013-12-20 17:57:58 +00:00
|
|
|
explicit DeletionState(const int num_memtables = 0,
|
|
|
|
bool create_superversion = false) {
|
2013-11-15 02:03:57 +00:00
|
|
|
manifest_file_number = 0;
|
|
|
|
log_number = 0;
|
|
|
|
prev_log_number = 0;
|
2013-11-27 22:56:20 +00:00
|
|
|
memtables_to_free.reserve(num_memtables);
|
2013-12-20 17:57:58 +00:00
|
|
|
superversion_to_free = nullptr;
|
|
|
|
new_superversion =
|
|
|
|
create_superversion ? new SuperVersion(num_memtables) : nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
~DeletionState() {
|
|
|
|
// free pending memtables
|
|
|
|
for (auto m : memtables_to_free) {
|
|
|
|
delete m;
|
|
|
|
}
|
|
|
|
// free superversion. if nullptr, this will be noop
|
|
|
|
delete superversion_to_free;
|
|
|
|
// if new_superversion was not used, it will be non-nullptr and needs
|
|
|
|
// to be freed here
|
|
|
|
delete new_superversion;
|
2013-11-15 02:03:57 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Returns the list of live files in 'live' and the list
|
|
|
|
// of all files in the filesystem in 'all_files'.
|
|
|
|
// If force == false and the last call was less than
|
|
|
|
// options_.delete_obsolete_files_period_micros microseconds ago,
|
|
|
|
// it will not fill up the deletion_state
|
|
|
|
void FindObsoleteFiles(DeletionState& deletion_state,
|
|
|
|
bool force,
|
|
|
|
bool no_full_scan = false);
|
|
|
|
|
|
|
|
// Diffs the files listed in filenames and those that do not
|
|
|
|
// belong to live files are possibly removed. Also, removes all the
|
|
|
|
// files in sst_delete_files and log_delete_files.
|
|
|
|
// It is not necessary to hold the mutex when invoking this method.
|
|
|
|
void PurgeObsoleteFiles(DeletionState& deletion_state);
|
|
|
|
|
2012-12-18 21:05:39 +00:00
|
|
|
protected:
|
2012-11-06 03:18:49 +00:00
|
|
|
Env* const env_;
|
|
|
|
const std::string dbname_;
|
2013-01-20 10:07:13 +00:00
|
|
|
unique_ptr<VersionSet> versions_;
|
2012-11-06 03:18:49 +00:00
|
|
|
const InternalKeyComparator internal_comparator_;
|
|
|
|
const Options options_; // options_.comparator == &internal_comparator_
|
|
|
|
|
|
|
|
// Convenience accessor: forwards to internal_comparator_.user_comparator()
// and hands back whatever comparator it reports.
const Comparator* user_comparator() const {
  const Comparator* const wrapped = internal_comparator_.user_comparator();
  return wrapped;
}
|
2013-02-15 23:28:24 +00:00
|
|
|
|
2014-01-27 22:33:50 +00:00
|
|
|
// Returns the stored default_cfd_ pointer as-is (no ownership transfer is
// visible here).
ColumnFamilyData* GetDefaultColumnFamily() {
  return default_cfd_;
}
|
2012-11-06 03:18:49 +00:00
|
|
|
|
2013-02-15 23:28:24 +00:00
|
|
|
Iterator* NewInternalIterator(const ReadOptions&,
|
|
|
|
SequenceNumber* latest_snapshot);
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
private:
|
|
|
|
friend class DB;
|
2014-01-17 05:56:26 +00:00
|
|
|
friend class TailingIterator;
|
2012-03-09 00:23:21 +00:00
|
|
|
struct CompactionState;
|
|
|
|
struct Writer;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
Status NewDB();
|
|
|
|
|
|
|
|
// Recover the descriptor from persistent storage. May do a significant
|
|
|
|
// amount of work to recover recently logged updates. Any changes to
|
|
|
|
// be made to the descriptor are added to *edit.
|
2014-01-22 18:59:07 +00:00
|
|
|
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
|
|
bool read_only = false, bool error_if_log_file_exist = false);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
void MaybeIgnoreError(Status* s) const;
|
|
|
|
|
2012-11-26 21:56:45 +00:00
|
|
|
const Status CreateArchivalDirectory();
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Delete any unneeded files and stale in-memory entries.
|
|
|
|
void DeleteObsoleteFiles();
|
|
|
|
|
2013-10-14 22:12:15 +00:00
|
|
|
// Flush the in-memory write buffer to storage. Switches to a new
|
2011-03-18 22:37:00 +00:00
|
|
|
// log-file/memtable and writes a new descriptor iff successful.
|
2013-11-08 23:23:46 +00:00
|
|
|
Status FlushMemTableToOutputFile(bool* madeProgress,
|
|
|
|
DeletionState& deletion_state);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Refactor Recover() code
Summary:
This diff does two things:
* Rethinks how we call Recover() with read_only option. Before, we call it with pointer to memtable where we'd like to apply those changes to. This memtable is set in db_impl_readonly.cc and it's actually DBImpl::mem_. Why don't we just apply updates to mem_ right away? It seems more intuitive.
* Changes when we apply updates to manifest. Before, the process is to recover all the logs, flush it to sst files and then do one giant commit that atomically adds all recovered sst files and sets the next log number. This works good enough, but causes some small troubles for my column family approach, since I can't have one VersionEdit apply to more than single column family[1]. The change here is to commit the files recovered from logs right away. Here is the state of the world before the change:
1. Recover log 5, add new sst files to edit
2. Recover log 7, add new sst files to edit
3. Recover log 8, add new sst files to edit
4. Commit all added sst files to manifest and mark log files 5, 7 and 8 as recovered (via SetLogNumber(9) function)
After the change, we'll do:
1. Recover log 5, commit the new sst files and set log 5 as recovered
2. Recover log 7, commit the new sst files and set log 7 as recovered
3. Recover log 8, commit the new sst files and set log 8 as recovered
The added (small) benefit is that if we fail after (2), the new recovery will only have to recover log 8. In previous case, we'll have to restart the recovery from the beginning. The bigger benefit will be to enable easier integration of multiple column families in Recovery code path.
[1] I'm happy to discuss this decision, but I believe this is the cleanest way to go. It also makes backward compatibility much easier. We don't have a requirement of adding multiple column families atomically.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15237
2014-01-22 18:45:26 +00:00
|
|
|
Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
|
|
|
|
bool read_only);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2012-10-19 21:00:53 +00:00
|
|
|
// The following two methods are used to flush a memtable to
|
|
|
|
// storage. The first one is used at database recovery time (when the
|
|
|
|
// database is opened) and is heavyweight because it holds the mutex
|
|
|
|
// for the entire period. The second method WriteLevel0Table supports
|
|
|
|
// concurrent flushing of memtables to storage.
|
|
|
|
Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit);
|
2013-06-11 21:23:58 +00:00
|
|
|
Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
|
2012-10-19 21:00:53 +00:00
|
|
|
uint64_t* filenumber);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Fix SlowdownAmount
Summary:
This had a few bugs.
1) bottom and top were reversed. top is for the max value but the callers were passing the max
value to bottom. The result is that the max sleep is used when n >= bottom.
2) one of the callers passed values with type double and these values are frequently between
1.0 and 2.0 so rounding will do some bad things
3) sometimes the function returned 0 when there should be a stall
With this change and one other diff (out for review soon) there are slightly fewer stalls on one workload.
With the fix.
Stalls(secs): 160.166 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 58.495 leveln_slowdown
Stalls(count): 910261 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 54526 leveln_slowdown
Without the fix.
Stalls(secs): 172.227 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 56.538 leveln_slowdown
Stalls(count): 160831 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 52845 leveln_slowdown
Task ID: #
Blame Rev:
Test Plan:
run db_bench for --benchmarks=overwrite with IO-bound database
Revert Plan:
Database Impact:
Memcache Impact:
Other Notes:
EImportant:
- begin *PUBLIC* platform impact section -
Bugzilla: #
- end platform impact -
Reviewers: haobo
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15243
2014-01-17 02:44:23 +00:00
|
|
|
uint64_t SlowdownAmount(int n, double bottom, double top);
|
2013-12-20 17:57:58 +00:00
|
|
|
// MakeRoomForWrite will return superversion_to_free through an argument,
|
|
|
|
// which the caller needs to delete. We do it because caller can delete
|
|
|
|
// the superversion outside of mutex
|
|
|
|
Status MakeRoomForWrite(bool force /* compact even if there is room? */,
|
|
|
|
SuperVersion** superversion_to_free);
|
2014-01-14 22:49:31 +00:00
|
|
|
void BuildBatchGroup(Writer** last_writer,
|
|
|
|
autovector<WriteBatch*>* write_batch_group);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2012-07-06 18:42:09 +00:00
|
|
|
// Force current memtable contents to be flushed.
|
|
|
|
Status FlushMemTable(const FlushOptions& options);
|
|
|
|
|
2013-10-14 22:12:15 +00:00
|
|
|
// Wait for the memtable flush to complete
|
|
|
|
Status WaitForFlushMemTable();
|
2012-07-06 18:42:09 +00:00
|
|
|
|
2012-08-14 22:20:36 +00:00
|
|
|
void MaybeScheduleLogDBDeployStats();
|
2012-08-27 07:50:26 +00:00
|
|
|
static void BGLogDBDeployStats(void* db);
|
|
|
|
void LogDBDeployStats();
|
2012-08-14 22:20:36 +00:00
|
|
|
|
2013-10-14 22:12:15 +00:00
|
|
|
void MaybeScheduleFlushOrCompaction();
|
2013-09-13 21:38:37 +00:00
|
|
|
static void BGWorkCompaction(void* db);
|
|
|
|
static void BGWorkFlush(void* db);
|
|
|
|
void BackgroundCallCompaction();
|
|
|
|
void BackgroundCallFlush();
|
2013-08-06 19:54:37 +00:00
|
|
|
Status BackgroundCompaction(bool* madeProgress,DeletionState& deletion_state);
|
2013-11-08 23:23:46 +00:00
|
|
|
Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state);
|
2013-09-02 06:23:40 +00:00
|
|
|
void CleanupCompaction(CompactionState* compact, Status status);
|
2013-11-08 23:23:46 +00:00
|
|
|
Status DoCompactionWork(CompactionState* compact,
|
|
|
|
DeletionState& deletion_state);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
Status OpenCompactionOutputFile(CompactionState* compact);
|
|
|
|
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
|
|
|
|
Status InstallCompactionResults(CompactionState* compact);
|
2012-10-19 21:00:53 +00:00
|
|
|
void AllocateCompactionOutputFileNumbers(CompactionState* compact);
|
|
|
|
void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
|
2012-11-29 00:42:36 +00:00
|
|
|
|
2012-11-26 21:56:45 +00:00
|
|
|
void PurgeObsoleteWALFiles();
|
2012-11-30 01:28:37 +00:00
|
|
|
|
2013-08-06 19:54:37 +00:00
|
|
|
Status AppendSortedWalsOfType(const std::string& path,
|
|
|
|
VectorLogPtr& log_files,
|
|
|
|
WalFileType type);
|
2012-11-30 01:28:37 +00:00
|
|
|
|
2013-08-06 19:54:37 +00:00
|
|
|
// Requires: all_logs should be sorted with earliest log file first
|
|
|
|
// Retains all log files in all_logs which contain updates with seq no.
|
|
|
|
// Greater Than or Equal to the requested SequenceNumber.
|
|
|
|
Status RetainProbableWalFiles(VectorLogPtr& all_logs,
|
|
|
|
const SequenceNumber target);
|
2013-03-18 21:50:59 +00:00
|
|
|
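// Returns true if the WAL file identified by (type, number) exists and is
// empty (inferred from the function name; confirm against the definition).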
|
2013-08-06 19:54:37 +00:00
|
|
|
bool CheckWalFileExistsAndEmpty(const WalFileType type,
|
|
|
|
const uint64_t number);
|
2012-11-30 01:28:37 +00:00
|
|
|
|
2013-08-06 19:54:37 +00:00
|
|
|
Status ReadFirstRecord(const WalFileType type, const uint64_t number,
|
|
|
|
WriteBatch* const result);
|
2012-11-30 01:28:37 +00:00
|
|
|
|
|
|
|
Status ReadFirstLine(const std::string& fname, WriteBatch* const batch);
|
2013-05-10 22:21:04 +00:00
|
|
|
|
2013-05-28 19:35:43 +00:00
|
|
|
void PrintStatistics();
|
|
|
|
|
2013-10-05 05:32:05 +00:00
|
|
|
// dump rocksdb.stats to LOG
|
2013-05-10 22:21:04 +00:00
|
|
|
void MaybeDumpStats();
|
|
|
|
|
2013-06-30 06:21:36 +00:00
|
|
|
// Return the minimum empty level that could hold the total data in the
|
|
|
|
// input level. Return the input level, if such level could not be found.
|
|
|
|
int FindMinimumEmptyLevelFitting(int level);
|
|
|
|
|
2013-09-04 20:13:08 +00:00
|
|
|
// Move the files in the input level to the target level.
|
|
|
|
// If target_level < 0, automatically calculate the minimum level that could
|
|
|
|
// hold the data set.
|
2014-01-22 20:46:24 +00:00
|
|
|
Status ReFitLevel(int level, int target_level = -1);
|
2013-06-30 06:21:36 +00:00
|
|
|
|
2014-01-17 05:56:26 +00:00
|
|
|
// Returns the current SuperVersion number.
|
|
|
|
uint64_t CurrentVersionNumber() const;
|
|
|
|
|
|
|
|
// Returns a pair of iterators (mutable-only and immutable-only) used
|
|
|
|
// internally by TailingIterator and stores CurrentVersionNumber() in
|
|
|
|
// *superversion_number. These iterators are always up-to-date, i.e. can
|
|
|
|
// be used to read new data.
|
|
|
|
std::pair<Iterator*, Iterator*> GetTailingIteratorPair(
|
|
|
|
const ReadOptions& options,
|
|
|
|
uint64_t* superversion_number);
|
2013-06-30 06:21:36 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Constant after construction
|
2012-04-17 15:36:46 +00:00
|
|
|
const InternalFilterPolicy internal_filter_policy_;
|
2011-03-18 22:37:00 +00:00
|
|
|
bool owns_info_log_;
|
|
|
|
|
|
|
|
// table_cache_ provides its own synchronization
|
2013-01-20 10:07:13 +00:00
|
|
|
unique_ptr<TableCache> table_cache_;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2013-02-15 23:28:24 +00:00
|
|
|
// Lock over the persistent DB state. Non-nullptr iff successfully acquired.
|
2011-03-18 22:37:00 +00:00
|
|
|
FileLock* db_lock_;
|
|
|
|
|
|
|
|
// State below is protected by mutex_
|
|
|
|
port::Mutex mutex_;
|
|
|
|
port::AtomicPointer shutting_down_;
|
2011-06-07 14:40:26 +00:00
|
|
|
port::CondVar bg_cv_; // Signalled when background work finishes
|
2011-06-22 02:36:45 +00:00
|
|
|
uint64_t logfile_number_;
|
2013-01-20 10:07:13 +00:00
|
|
|
unique_ptr<log::Writer> log_;
|
2014-01-24 22:30:28 +00:00
|
|
|
ColumnFamilyData* default_cfd_;
|
2014-01-28 19:05:04 +00:00
|
|
|
unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
|
2013-12-20 17:57:58 +00:00
|
|
|
|
2012-08-14 22:20:36 +00:00
|
|
|
std::string host_name_;
|
|
|
|
|
2014-01-27 19:02:21 +00:00
|
|
|
std::unique_ptr<Directory> db_directory_;
|
|
|
|
|
2012-03-09 00:23:21 +00:00
|
|
|
// Queue of writers.
|
|
|
|
std::deque<Writer*> writers_;
|
2013-03-28 22:19:28 +00:00
|
|
|
WriteBatch tmp_batch_;
|
2012-03-09 00:23:21 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
SnapshotList snapshots_;
|
|
|
|
|
|
|
|
// Set of table files to protect from deletion because they are
|
|
|
|
// part of ongoing compactions.
|
|
|
|
std::set<uint64_t> pending_outputs_;
|
|
|
|
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-21 23:10:39 +00:00
|
|
|
// count how many background compactions are running or have been scheduled
|
2012-10-19 21:00:53 +00:00
|
|
|
int bg_compaction_scheduled_;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
2013-12-21 23:10:39 +00:00
|
|
|
// If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
|
|
|
|
// compactions (if manual_compaction_ is not null). This mechanism enables
|
|
|
|
// manual compactions to wait until all other compactions are finished.
|
|
|
|
int bg_manual_only_;
|
|
|
|
|
2013-09-13 21:38:37 +00:00
|
|
|
// number of background memtable flush jobs, submitted to the HIGH pool
|
|
|
|
int bg_flush_scheduled_;
|
|
|
|
|
2012-08-27 07:50:26 +00:00
|
|
|
// Has a background stats log thread been scheduled?
|
|
|
|
bool bg_logstats_scheduled_;
|
|
|
|
|
2011-06-07 14:40:26 +00:00
|
|
|
// Information for a manual compaction
|
|
|
|
// Describes a single manual compaction request. DBImpl refers to one of
// these through the manual_compaction_ pointer.
struct ManualCompaction {
|
2014-01-15 00:19:09 +00:00
|
|
|
int input_level; // presumably the level being compacted from -- TODO confirm
|
|
|
|
int output_level; // presumably the level the output is written to -- TODO confirm
|
2011-10-05 23:30:28 +00:00
|
|
|
bool done; // presumably set once the request has finished -- TODO confirm
|
2014-01-22 20:46:24 +00:00
|
|
|
Status status; // NOTE(review): looks like the result of the request -- confirm
|
2012-10-19 21:00:53 +00:00
|
|
|
bool in_progress; // compaction request being processed?
|
2013-02-15 23:28:24 +00:00
|
|
|
const InternalKey* begin; // nullptr means beginning of key range
|
|
|
|
const InternalKey* end; // nullptr means end of key range
|
2011-10-05 23:30:28 +00:00
|
|
|
InternalKey tmp_storage; // Used to keep track of compaction progress
|
2011-06-07 14:40:26 +00:00
|
|
|
};
|
|
|
|
ManualCompaction* manual_compaction_;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Have we encountered a background error in paranoid mode?
|
|
|
|
Status bg_error_;
|
|
|
|
|
2013-03-28 22:19:28 +00:00
|
|
|
std::unique_ptr<StatsLogger> logger_;
|
2012-08-14 22:20:36 +00:00
|
|
|
|
2012-08-22 00:02:21 +00:00
|
|
|
// NOTE(review): `volatile` gives no atomicity or ordering guarantees between
// threads. Confirm every access happens under mutex_; otherwise consider
// std::atomic<int64_t>, as last_stats_dump_time_microsec_ already does.
int64_t volatile last_log_ts;
|
2012-08-14 22:20:36 +00:00
|
|
|
|
2012-09-15 00:11:35 +00:00
|
|
|
// shall we disable deletion of obsolete files
|
2014-01-02 11:33:42 +00:00
|
|
|
// if 0 the deletion is enabled.
|
|
|
|
// if non-zero, files will not be getting deleted
|
|
|
|
// This enables two different threads to call
|
|
|
|
// EnableFileDeletions() and DisableFileDeletions()
|
|
|
|
// without any synchronization
|
|
|
|
int disable_delete_obsolete_files_;
|
2012-09-15 00:11:35 +00:00
|
|
|
|
2012-10-16 15:53:46 +00:00
|
|
|
// last time when DeleteObsoleteFiles was invoked
|
|
|
|
uint64_t delete_obsolete_files_last_run_;
|
|
|
|
|
2013-05-06 18:41:01 +00:00
|
|
|
// last time when PurgeObsoleteWALFiles ran.
|
|
|
|
uint64_t purge_wal_files_last_run_;
|
|
|
|
|
2013-05-10 22:21:04 +00:00
|
|
|
// last time stats were dumped to LOG
|
2013-05-24 19:52:45 +00:00
|
|
|
std::atomic<uint64_t> last_stats_dump_time_microsec_;
|
2013-05-10 22:21:04 +00:00
|
|
|
|
2013-11-07 02:46:28 +00:00
|
|
|
// obsolete files will be deleted every this many seconds if ttl deletion is
|
|
|
|
// enabled and archive size_limit is disabled.
|
|
|
|
uint64_t default_interval_to_delete_obsolete_WAL_;
|
|
|
|
|
2012-11-06 19:21:57 +00:00
|
|
|
bool flush_on_destroy_; // Used when disableWAL is true.
|
|
|
|
|
2014-01-30 04:40:41 +00:00
|
|
|
InternalStats internal_stats_;
|
Improve output for GetProperty('leveldb.stats')
Summary:
Display separate values for read, write & total compaction IO.
Display compaction amplification and write amplification.
Add similar values for the period since the last call to GetProperty. Results since the server started
are reported as "cumulative" stats. Results since the last call to GetProperty are reported as
"interval" stats.
Level Files Size(MB) Time(sec) Read(MB) Write(MB) Rn(MB) Rnp1(MB) Wnew(MB) Amplify Read(MB/s) Write(MB/s) Rn Rnp1 Wnp1 NewW Count Ln-stall
----------------------------------------------------------------------------------------------------------------------------------------------------------------------
0 7 13 21 0 211 0 0 211 0.0 0.0 10.1 0 0 0 0 113 0.0
1 79 157 88 993 989 198 795 194 9.0 11.3 11.2 106 405 502 97 14 0.0
2 19 36 5 63 63 37 27 36 2.4 12.3 12.2 19 14 32 18 12 0.0
>>>>>>>>>>>>>>>>>>>>>>>>> text below has been is new and/or reformatted
Uptime(secs): 122.2 total, 0.9 interval
Compaction IO cumulative (GB): 0.21 new, 1.03 read, 1.23 write, 2.26 read+write
Compaction IO cumulative (MB/sec): 1.7 new, 8.6 read, 10.3 write, 19.0 read+write
Amplification cumulative: 6.0 write, 11.0 compaction
Compaction IO interval (MB): 5.59 new, 0.00 read, 5.59 write, 5.59 read+write
Compaction IO interval (MB/sec): 6.5 new, 0.0 read, 6.5 write, 6.5 read+write
Amplification interval: 1.0 write, 1.0 compaction
>>>>>>>>>>>>>>>>>>>>>>>> text above is new and/or reformatted
Stalls(secs): 90.574 level0_slowdown, 0.000 level0_numfiles, 10.165 memtable_compaction, 0.000 leveln_slowdown
Task ID: #
Blame Rev:
Test Plan:
make check, run db_bench
Revert Plan:
Database Impact:
Memcache Impact:
Other Notes:
EImportant:
- begin *PUBLIC* platform impact section -
Bugzilla: #
- end platform impact -
Reviewers: haobo
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11049
2013-06-03 15:16:16 +00:00
|
|
|
|
2012-08-17 23:06:05 +00:00
|
|
|
static const int KEEP_LOG_FILE_NUM = 1000;
|
2012-09-06 00:44:13 +00:00
|
|
|
std::string db_absolute_path_;
|
2012-08-17 23:06:05 +00:00
|
|
|
|
2012-10-19 21:00:53 +00:00
|
|
|
// count of the number of contiguous delaying writes
|
|
|
|
int delayed_writes_;
|
|
|
|
|
2013-03-15 00:00:04 +00:00
|
|
|
// The options to access storage files
|
2013-06-07 22:35:17 +00:00
|
|
|
const EnvOptions storage_options_;
|
2013-03-15 00:00:04 +00:00
|
|
|
|
2013-06-30 06:21:36 +00:00
|
|
|
// A value of true temporarily disables scheduling of background work
|
|
|
|
bool bg_work_gate_closed_;
|
|
|
|
|
|
|
|
// Guard against multiple concurrent refitting
|
|
|
|
bool refitting_level_;
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// No copying allowed
|
|
|
|
DBImpl(const DBImpl&);
|
|
|
|
void operator=(const DBImpl&);
|
|
|
|
|
2012-10-19 21:00:53 +00:00
|
|
|
// dump the delayed_writes_ to the log file and reset counter.
|
|
|
|
void DelayLoggingAndReset();
|
2012-11-27 05:16:21 +00:00
|
|
|
|
2013-03-21 22:59:47 +00:00
|
|
|
// Return the earliest snapshot where seqno is visible.
|
|
|
|
// Store the snapshot right before that, if any, in prev_snapshot
|
|
|
|
inline SequenceNumber findEarliestVisibleSnapshot(
|
|
|
|
SequenceNumber in,
|
|
|
|
std::vector<SequenceNumber>& snapshots,
|
|
|
|
SequenceNumber* prev_snapshot);
|
2013-07-06 01:49:18 +00:00
|
|
|
|
2013-12-20 17:57:58 +00:00
|
|
|
// Background threads call this function, which is just a wrapper around
|
2014-01-29 21:28:50 +00:00
|
|
|
// the cfd->InstallSuperVersion() function. Background threads carry
|
2013-12-20 17:57:58 +00:00
|
|
|
// deletion_state which can have new_superversion already allocated.
|
2014-01-24 22:30:28 +00:00
|
|
|
void InstallSuperVersion(ColumnFamilyData* cfd,
|
|
|
|
DeletionState& deletion_state);
|
2013-12-20 17:57:58 +00:00
|
|
|
|
2013-07-26 19:57:01 +00:00
|
|
|
// Function that Get and KeyMayExist call with no_io true or false
|
|
|
|
// Note: 'value_found' from KeyMayExist propagates here
|
2013-07-06 01:49:18 +00:00
|
|
|
Status GetImpl(const ReadOptions& options,
|
2014-01-28 19:05:04 +00:00
|
|
|
const ColumnFamilyHandle& column_family, const Slice& key,
|
|
|
|
std::string* value, bool* value_found = nullptr);
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// Sanitize db options. The caller should delete result.info_log if
|
|
|
|
// it is not equal to src.info_log.
|
|
|
|
extern Options SanitizeOptions(const std::string& db,
|
|
|
|
const InternalKeyComparator* icmp,
|
2012-04-17 15:36:46 +00:00
|
|
|
const InternalFilterPolicy* ipolicy,
|
2011-03-18 22:37:00 +00:00
|
|
|
const Options& src);
|
|
|
|
|
2013-10-30 17:52:33 +00:00
|
|
|
|
|
|
|
// Determine compression type, based on user options, level of the output
|
|
|
|
// file and whether compression is disabled.
|
|
|
|
// If enable_compression is false, then compression is always disabled no
|
|
|
|
// matter what the values of the other two parameters are.
|
|
|
|
// Otherwise, the compression type is determined based on options and level.
|
|
|
|
CompressionType GetCompressionType(const Options& options, int level,
|
|
|
|
const bool enable_compression);
|
|
|
|
|
2013-12-19 18:02:53 +00:00
|
|
|
// Determine compression type for L0 file written by memtable flush.
|
|
|
|
CompressionType GetCompressionFlush(const Options& options);
|
|
|
|
|
2013-10-04 04:49:15 +00:00
|
|
|
} // namespace rocksdb
|