2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
2013-10-05 05:32:05 +00:00
|
|
|
#pragma once
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 17:04:30 +00:00
|
|
|
#include <string>
|
Add more table properties to EventLogger
Summary:
Example output:
{"time_micros": 1431463794310521, "job": 353, "event": "table_file_creation", "file_number": 387, "file_size": 86937, "table_info": {"data_size": "81801", "index_size": "9751", "filter_size": "0", "raw_key_size": "23448", "raw_average_key_size": "24.000000", "raw_value_size": "990571", "raw_average_value_size": "1013.890481", "num_data_blocks": "245", "num_entries": "977", "filter_policy_name": "", "kDeletedKeys": "0"}}
Also fixed a bug where BuildTable() in recovery was passing Env::IOHigh argument into paranoid_checks_file parameter.
Test Plan: make check + check out the output in the log
Reviewers: sdong, rven, yhchiang
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D38343
2015-05-12 22:53:55 +00:00
|
|
|
#include <utility>
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 17:04:30 +00:00
|
|
|
#include <vector>
|
2018-12-17 21:12:22 +00:00
|
|
|
#include "db/range_tombstone_fragmenter.h"
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 17:04:30 +00:00
|
|
|
#include "db/table_properties_collector.h"
|
2019-06-01 00:19:43 +00:00
|
|
|
#include "logging/event_logger.h"
|
2017-04-06 02:02:00 +00:00
|
|
|
#include "options/cf_options.h"
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/comparator.h"
|
2014-07-08 19:31:49 +00:00
|
|
|
#include "rocksdb/env.h"
|
Added EventListener::OnTableFileCreationStarted() callback
Summary: Added EventListener::OnTableFileCreationStarted. EventListener::OnTableFileCreated will be called on failure case. User can check creation status via TableFileCreationInfo::status.
Test Plan: unit test.
Reviewers: dhruba, yhchiang, ott, sdong
Reviewed By: sdong
Subscribers: sdong, kradhakrishnan, IslamAbdelRahman, andrewkr, yhchiang, leveldb, ott, dhruba
Differential Revision: https://reviews.facebook.net/D56337
2016-04-29 18:35:00 +00:00
|
|
|
#include "rocksdb/listener.h"
|
|
|
|
#include "rocksdb/options.h"
|
|
|
|
#include "rocksdb/status.h"
|
Add more table properties to EventLogger
Summary:
Example output:
{"time_micros": 1431463794310521, "job": 353, "event": "table_file_creation", "file_number": 387, "file_size": 86937, "table_info": {"data_size": "81801", "index_size": "9751", "filter_size": "0", "raw_key_size": "23448", "raw_average_key_size": "24.000000", "raw_value_size": "990571", "raw_average_value_size": "1013.890481", "num_data_blocks": "245", "num_entries": "977", "filter_policy_name": "", "kDeletedKeys": "0"}}
Also fixed a bug where BuildTable() in recovery was passing Env::IOHigh argument into paranoid_checks_file parameter.
Test Plan: make check + check out the output in the log
Reviewers: sdong, rven, yhchiang
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D38343
2015-05-12 22:53:55 +00:00
|
|
|
#include "rocksdb/table_properties.h"
|
Added EventListener::OnTableFileCreationStarted() callback
Summary: Added EventListener::OnTableFileCreationStarted. EventListener::OnTableFileCreated will be called on failure case. User can check creation status via TableFileCreationInfo::status.
Test Plan: unit test.
Reviewers: dhruba, yhchiang, ott, sdong
Reviewed By: sdong
Subscribers: sdong, kradhakrishnan, IslamAbdelRahman, andrewkr, yhchiang, leveldb, ott, dhruba
Differential Revision: https://reviews.facebook.net/D56337
2016-04-29 18:35:00 +00:00
|
|
|
#include "rocksdb/types.h"
|
2016-11-01 03:35:54 +00:00
|
|
|
#include "table/scoped_arena_iterator.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
struct Options;
|
|
|
|
struct FileMetaData;
|
|
|
|
|
|
|
|
class Env;
|
2013-11-13 04:05:28 +00:00
|
|
|
struct EnvOptions;
|
2011-03-18 22:37:00 +00:00
|
|
|
class Iterator;
|
2017-10-06 17:26:38 +00:00
|
|
|
class SnapshotChecker;
|
2011-03-18 22:37:00 +00:00
|
|
|
class TableCache;
|
|
|
|
class VersionEdit;
|
2013-10-29 00:54:09 +00:00
|
|
|
class TableBuilder;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-17 23:16:11 +00:00
|
|
|
class WritableFileWriter;
|
Measure file read latency histogram per level
Summary: In internal stats, remember read latency histogram, if statistics is enabled. It can be retrieved from DB::GetProperty() with "rocksdb.dbstats" property, if it is enabled.
Test Plan: Manually run db_bench and prints out "rocksdb.dbstats" by hand and make sure it prints out as expected
Reviewers: igor, IslamAbdelRahman, rven, kradhakrishnan, anthony, yhchiang
Reviewed By: yhchiang
Subscribers: MarkCallaghan, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44193
2015-08-13 21:35:54 +00:00
|
|
|
class InternalStats;
|
2013-10-29 00:54:09 +00:00
|
|
|
|
2016-04-07 06:10:32 +00:00
|
|
|
// @param column_family_name Name of the column family that is also identified
|
|
|
|
// by column_family_id, or empty string if unknown. It must outlive the
|
|
|
|
// TableBuilder returned by this function.
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 17:04:30 +00:00
|
|
|
TableBuilder* NewTableBuilder(
|
2018-05-21 21:33:55 +00:00
|
|
|
const ImmutableCFOptions& options, const MutableCFOptions& moptions,
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 17:04:30 +00:00
|
|
|
const InternalKeyComparator& internal_comparator,
|
|
|
|
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
|
|
|
|
int_tbl_prop_collector_factories,
|
2016-04-07 06:10:32 +00:00
|
|
|
uint32_t column_family_id, const std::string& column_family_name,
|
|
|
|
WritableFileWriter* file, const CompressionType compression_type,
|
2019-03-18 19:07:35 +00:00
|
|
|
const uint64_t sample_for_compression,
|
2017-06-28 00:02:20 +00:00
|
|
|
const CompressionOptions& compression_opts, int level,
|
2017-10-23 22:22:05 +00:00
|
|
|
const bool skip_filters = false, const uint64_t creation_time = 0,
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
2019-04-11 02:24:25 +00:00
|
|
|
const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0,
|
|
|
|
const uint64_t file_creation_time = 0);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Build a Table file from the contents of *iter. The generated file
|
2014-06-13 22:54:19 +00:00
|
|
|
// will be named according to number specified in meta. On success, the rest of
|
2011-06-22 02:36:45 +00:00
|
|
|
// *meta will be filled with metadata about the generated table.
|
|
|
|
// If no data is present in *iter, meta->file_size will be set to
|
|
|
|
// zero, and no Table file will be produced.
|
2016-04-07 06:10:32 +00:00
|
|
|
//
|
|
|
|
// @param column_family_name Name of the column family that is also identified
|
|
|
|
// by column_family_id, or empty string if unknown.
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 17:04:30 +00:00
|
|
|
extern Status BuildTable(
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
const std::string& dbname, Env* env, FileSystem* fs,
|
|
|
|
const ImmutableCFOptions& options,
|
|
|
|
const MutableCFOptions& mutable_cf_options, const FileOptions& file_options,
|
2016-11-01 03:35:54 +00:00
|
|
|
TableCache* table_cache, InternalIterator* iter,
|
2018-12-17 21:12:22 +00:00
|
|
|
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
|
|
|
|
range_del_iters,
|
|
|
|
FileMetaData* meta, const InternalKeyComparator& internal_comparator,
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 17:04:30 +00:00
|
|
|
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
|
|
|
|
int_tbl_prop_collector_factories,
|
2016-04-07 06:10:32 +00:00
|
|
|
uint32_t column_family_id, const std::string& column_family_name,
|
|
|
|
std::vector<SequenceNumber> snapshots,
|
Use SST files for Transaction conflict detection
Summary:
Currently, transactions can fail even if there is no actual write conflict. This is due to relying on only the memtables to check for write-conflicts. Users have to tune memtable settings to try to avoid this, but it's hard to figure out exactly how to tune these settings.
With this diff, TransactionDB will use both memtables and SST files to determine if there are any write conflicts. This relies on the fact that BlockBasedTable stores sequence numbers for all writes that happen after any open snapshot. Also, D50295 is needed to prevent SingleDelete from disappearing writes (the TODOs in this test code will be fixed once the other diff is approved and merged).
Note that Optimistic transactions will still rely on tuning memtable settings as we do not want to read from SST while on the write thread. Also, memtable settings can still be used to reduce how often TransactionDB needs to read SST files.
Test Plan: unit tests, db bench
Reviewers: rven, yhchiang, kradhakrishnan, IslamAbdelRahman, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb, yoshinorim
Differential Revision: https://reviews.facebook.net/D50475
2015-10-15 23:37:15 +00:00
|
|
|
SequenceNumber earliest_write_conflict_snapshot,
|
2017-10-06 17:26:38 +00:00
|
|
|
SnapshotChecker* snapshot_checker, const CompressionType compression,
|
2019-03-18 19:07:35 +00:00
|
|
|
const uint64_t sample_for_compression,
|
2015-04-17 22:26:50 +00:00
|
|
|
const CompressionOptions& compression_opts, bool paranoid_file_checks,
|
Added EventListener::OnTableFileCreationStarted() callback
Summary: Added EventListener::OnTableFileCreationStarted. EventListener::OnTableFileCreated will be called on failure case. User can check creation status via TableFileCreationInfo::status.
Test Plan: unit test.
Reviewers: dhruba, yhchiang, ott, sdong
Reviewed By: sdong
Subscribers: sdong, kradhakrishnan, IslamAbdelRahman, andrewkr, yhchiang, leveldb, ott, dhruba
Differential Revision: https://reviews.facebook.net/D56337
2016-04-29 18:35:00 +00:00
|
|
|
InternalStats* internal_stats, TableFileCreationReason reason,
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
IOStatus* io_status, EventLogger* event_logger = nullptr, int job_id = 0,
|
Add more table properties to EventLogger
Summary:
Example output:
{"time_micros": 1431463794310521, "job": 353, "event": "table_file_creation", "file_number": 387, "file_size": 86937, "table_info": {"data_size": "81801", "index_size": "9751", "filter_size": "0", "raw_key_size": "23448", "raw_average_key_size": "24.000000", "raw_value_size": "990571", "raw_average_value_size": "1013.890481", "num_data_blocks": "245", "num_entries": "977", "filter_policy_name": "", "kDeletedKeys": "0"}}
Also fixed a bug where BuildTable() in recovery was passing Env::IOHigh argument into paranoid_checks_file parameter.
Test Plan: make check + check out the output in the log
Reviewers: sdong, rven, yhchiang
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D38343
2015-05-12 22:53:55 +00:00
|
|
|
const Env::IOPriority io_priority = Env::IO_HIGH,
|
2017-06-28 00:02:20 +00:00
|
|
|
TableProperties* table_properties = nullptr, int level = -1,
|
2017-11-10 17:25:26 +00:00
|
|
|
const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0,
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
2019-04-11 02:24:25 +00:00
|
|
|
Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
|
|
|
|
const uint64_t file_creation_time = 0);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|