2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-29 03:34:02 +00:00
|
|
|
|
|
|
|
#pragma once
|
2019-05-30 21:47:29 +00:00
|
|
|
|
2013-10-29 03:34:02 +00:00
|
|
|
#include <stdint.h>
|
2022-10-25 18:50:38 +00:00
|
|
|
|
2016-04-07 06:10:32 +00:00
|
|
|
#include <string>
|
TablePropertiesCollectorFactory
Summary:
This diff addresses task #4296714 and rethinks how users provide us with TablePropertiesCollectors as part of Options.
Here's description of task #4296714:
I'm debugging #4295529 and noticed that our count of user properties kDeletedKeys is wrong. We're sharing one single InternalKeyPropertiesCollector with all Table Builders. In LOG Files, we're outputting number of kDeletedKeys as connected with a single table, while it's actually the total count of deleted keys since creation of the DB.
For example, this table has 3155 entries and 1391828 deleted keys.
The problem with the current approach is that we call methods on a single TablePropertiesCollector for all the tables we create. Even worse, we could do it from multiple threads at the same time and TablePropertiesCollector has no way of knowing which table we're calling it for.
Good part: Looks like nobody inside Facebook is using Options::table_properties_collectors. This means we should be able to painfully change the API.
In this change, I introduce TablePropertiesCollectorFactory. For every table we create, we call `CreateTablePropertiesCollector`, which creates a TablePropertiesCollector for a single table. We then use it sequentially from a single thread, which means it doesn't have to be thread-safe.
Test Plan:
Added a test in table_properties_collector_test that fails on master (build two tables, assert that kDeletedKeys count is correct for the second one).
Also, all other tests
Reviewers: sdong, dhruba, haobo, kailiu
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18579
2014-05-13 19:30:55 +00:00
|
|
|
#include <vector>
|
2022-10-25 18:50:38 +00:00
|
|
|
|
2020-02-10 23:42:46 +00:00
|
|
|
#include "db/version_edit.h"
|
2013-10-29 03:34:02 +00:00
|
|
|
#include "rocksdb/options.h"
|
|
|
|
#include "rocksdb/status.h"
|
2014-06-18 23:36:48 +00:00
|
|
|
#include "rocksdb/table.h"
|
2013-10-29 03:34:02 +00:00
|
|
|
#include "rocksdb/table_properties.h"
|
2019-09-05 17:03:42 +00:00
|
|
|
#include "table/plain/plain_table_bloom.h"
|
2019-05-30 21:47:29 +00:00
|
|
|
#include "table/plain/plain_table_index.h"
|
|
|
|
#include "table/plain/plain_table_key_coding.h"
|
2016-04-07 06:10:32 +00:00
|
|
|
#include "table/table_builder.h"
|
2013-10-29 03:34:02 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2013-10-29 03:34:02 +00:00
|
|
|
|
|
|
|
// Forward declarations.
// NOTE(review): BlockBuilder, BlockHandle and WritableFile are not otherwise
// named in the visible part of this header, and TableBuilder is used as a
// base class of PlainTableBuilder, which needs the complete type (presumably
// supplied by "table/table_builder.h" -- confirm). They are kept here because
// files including this header may rely on them.
class BlockBuilder;
class BlockHandle;
class WritableFile;
class TableBuilder;
|
|
|
|
|
2019-05-23 23:22:13 +00:00
|
|
|
// The builder class of PlainTable. For a description of the PlainTable
// format, see the comments of class PlainTableFactory, where instances of
// PlainTableReader are created.
|
2022-10-25 18:50:38 +00:00
|
|
|
class PlainTableBuilder : public TableBuilder {
|
2014-06-18 23:36:48 +00:00
|
|
|
public:
|
2013-10-29 03:34:02 +00:00
|
|
|
// Create a builder that will store the contents of the table it is
|
|
|
|
// building in *file. Does not close the file. It is up to the
|
|
|
|
// caller to close the file after calling Finish(). The output file
|
|
|
|
// will be part of level specified by 'level'. A value of -1 means
|
|
|
|
// that the caller does not know which level the output file will reside.
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 17:04:30 +00:00
|
|
|
PlainTableBuilder(
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
|
2024-02-02 22:14:43 +00:00
|
|
|
const InternalTblPropCollFactories* internal_tbl_prop_coll_factories,
|
2021-09-28 19:33:03 +00:00
|
|
|
uint32_t column_family_id, int level_at_creation,
|
|
|
|
WritableFileWriter* file, uint32_t user_key_size,
|
|
|
|
EncodingType encoding_type, size_t index_sparseness,
|
|
|
|
uint32_t bloom_bits_per_key, const std::string& column_family_name,
|
|
|
|
uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
|
|
|
|
double hash_table_ratio = 0, bool store_index_in_file = false,
|
|
|
|
const std::string& db_id = "", const std::string& db_session_id = "",
|
|
|
|
uint64_t file_number = 0);
|
2019-09-12 01:07:12 +00:00
|
|
|
// No copying allowed
|
|
|
|
PlainTableBuilder(const PlainTableBuilder&) = delete;
|
|
|
|
void operator=(const PlainTableBuilder&) = delete;
|
2013-10-29 03:34:02 +00:00
|
|
|
|
|
|
|
// REQUIRES: Either Finish() or Abandon() has been called.
|
|
|
|
~PlainTableBuilder();
|
|
|
|
|
|
|
|
// Add key,value to the table being constructed.
|
|
|
|
// REQUIRES: key is after any previously added key according to comparator.
|
|
|
|
// REQUIRES: Finish(), Abandon() have not been called
|
|
|
|
void Add(const Slice& key, const Slice& value) override;
|
|
|
|
|
|
|
|
// Return non-ok iff some error has been detected.
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
Status status() const override { return status_; }
|
|
|
|
|
|
|
|
// Return non-ok iff some error happens during IO.
|
|
|
|
IOStatus io_status() const override { return io_status_; }
|
2013-10-29 03:34:02 +00:00
|
|
|
|
|
|
|
// Finish building the table. Stops using the file passed to the
|
|
|
|
// constructor after this function returns.
|
|
|
|
// REQUIRES: Finish(), Abandon() have not been called
|
|
|
|
Status Finish() override;
|
|
|
|
|
|
|
|
// Indicate that the contents of this builder should be abandoned. Stops
|
|
|
|
// using the file passed to the constructor after this function returns.
|
|
|
|
// If the caller is not going to call Finish(), it must call Abandon()
|
|
|
|
// before destroying this builder.
|
|
|
|
// REQUIRES: Finish(), Abandon() have not been called
|
|
|
|
void Abandon() override;
|
|
|
|
|
|
|
|
// Number of calls to Add() so far.
|
|
|
|
uint64_t NumEntries() const override;
|
|
|
|
|
|
|
|
// Size of the file generated so far. If invoked after a successful
|
|
|
|
// Finish() call, returns the size of the final generated file.
|
|
|
|
uint64_t FileSize() const override;
|
|
|
|
|
2023-11-08 22:00:36 +00:00
|
|
|
TableProperties GetTableProperties() const override { return properties_; }
|
Add more table properties to EventLogger
Summary:
Example output:
{"time_micros": 1431463794310521, "job": 353, "event": "table_file_creation", "file_number": 387, "file_size": 86937, "table_info": {"data_size": "81801", "index_size": "9751", "filter_size": "0", "raw_key_size": "23448", "raw_average_key_size": "24.000000", "raw_value_size": "990571", "raw_average_value_size": "1013.890481", "num_data_blocks": "245", "num_entries": "977", "filter_policy_name": "", "kDeletedKeys": "0"}}
Also fixed a bug where BuildTable() in recovery was passing Env::IOHigh argument into paranoid_checks_file parameter.
Test Plan: make check + check out the output in the log
Reviewers: sdong, rven, yhchiang
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D38343
2015-05-12 22:53:55 +00:00
|
|
|
|
2019-03-01 23:41:55 +00:00
|
|
|
bool SaveIndexInFile() const { return store_index_in_file_; }
|
|
|
|
|
2020-02-10 23:42:46 +00:00
|
|
|
// Get file checksum
|
2020-03-29 22:57:02 +00:00
|
|
|
std::string GetFileChecksum() const override;
|
2020-02-10 23:42:46 +00:00
|
|
|
|
|
|
|
// Get file checksum function name
|
|
|
|
const char* GetFileChecksumFuncName() const override;
|
|
|
|
|
Fix/cleanup SeqnoToTimeMapping (#12253)
Summary:
The SeqnoToTimeMapping class (RocksDB internal) used by the preserve_internal_time_seconds / preclude_last_level_data_seconds options was essentially in a prototype state with some significant flaws that would risk biting us some day. This is a big, complicated change because both the implementation and the behavioral requirements of the class needed to be upgraded together. In short, this makes SeqnoToTimeMapping more internally responsible for maintaining good invariants, so that callers don't easily encounter dangerous scenarios.
* Some API functions were confusingly named and structured, so I fully refactored the APIs to use clear naming (e.g. `DecodeFrom` and `CopyFromSeqnoRange`), object states, function preconditions, etc.
* Previously the object could informally be sorted / compacted or not, and there was limited checking or enforcement on these states. Now there's a well-defined "enforced" state that is consistently checked in debug mode for applicable operations. (I attempted to create a separate "builder" class for unenforced states, but IIRC found that more cumbersome for existing uses than it was worth.)
* Previously operations would coalesce data in a way that was better for `GetProximalTimeBeforeSeqno` than for `GetProximalSeqnoBeforeTime` which is odd because the latter is the only one used by DB code currently (what is the seqno cut-off for data definitely older than this given time?). This is now reversed to consistently favor `GetProximalSeqnoBeforeTime`, with that logic concentrated in one place: `SeqnoToTimeMapping::SeqnoTimePair::Merge()`. Unfortunately, a lot of unit test logic was specifically testing the old, suboptimal behavior.
* Previously, the natural behavior of SeqnoToTimeMapping was to THROW AWAY data needed to get reasonable answers to the important `GetProximalSeqnoBeforeTime` queries. This is because SeqnoToTimeMapping only had a FIFO policy for staying within the entry capacity (except in aggregate+sort+serialize mode). If the DB wasn't extremely careful to avoid gathering too many time mappings, it could lose track of where the seqno cutoff was for cold data (`GetProximalSeqnoBeforeTime()` returning 0) and preventing all further data migration to the cold tier--until time passes etc. for mappings to catch up with FIFO purging of them. (The problem is not so acute because SST files contain relevant snapshots of the mappings, but the problem would apply to long-lived memtables.)
* Now the SeqnoToTimeMapping class has fully-integrated smarts for keeping a sufficiently complete history, within capacity limits, to give good answers to `GetProximalSeqnoBeforeTime` queries.
* Fixes old `// FIXME: be smarter about how we erase to avoid data falling off the front prematurely.`
* Fix an apparent bug in how entries are selected for storing into SST files. Previously, it only selected entries within the seqno range of the file, but that would easily leave a gap at the beginning of the timeline for data in the file for the purposes of answering GetProximalXXX queries with reasonable accuracy. This could probably lead to the same problem discussed above in naively throwing away entries in FIFO order in the old SeqnoToTimeMapping. The updated testing of GetProximalSeqnoBeforeTime in BasicSeqnoToTimeMapping relies on the fixed behavior.
* Fix a potential compaction CPU efficiency/scaling issue in which each compaction output file would iterate over and sort all seqno-to-time mappings from all compaction input files. Now we distill the input file entries to a constant size before processing each compaction output file.
Intended follow-up (me or others):
* Expand some direct testing of SeqnoToTimeMapping APIs. Here I've focused on updating existing tests to make sense.
* There are likely more gaps in availability of needed SeqnoToTimeMapping data when the DB shuts down and is restarted, at least with WAL.
* The data tracked in the DB could be kept more accurate and limited if it used the oldest seqno of unflushed data. This might require some more API refactoring.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12253
Test Plan: unit tests updated
Reviewed By: jowlyzhang
Differential Revision: D52913733
Pulled By: pdillinger
fbshipit-source-id: 020737fcbbe6212f6701191a6ab86565054c9593
2024-01-20 05:50:38 +00:00
|
|
|
void SetSeqnoTimeTableProperties(const SeqnoToTimeMapping& relevant_mapping,
|
2022-07-15 04:49:34 +00:00
|
|
|
uint64_t uint_64) override;
|
|
|
|
|
2014-06-18 23:36:48 +00:00
|
|
|
private:
|
2014-07-18 23:58:13 +00:00
|
|
|
Arena arena_;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions& ioptions_;
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions& moptions_;
|
2024-02-02 22:14:43 +00:00
|
|
|
std::vector<std::unique_ptr<InternalTblPropColl>>
|
TablePropertiesCollectorFactory
Summary:
This diff addresses task #4296714 and rethinks how users provide us with TablePropertiesCollectors as part of Options.
Here's description of task #4296714:
I'm debugging #4295529 and noticed that our count of user properties kDeletedKeys is wrong. We're sharing one single InternalKeyPropertiesCollector with all Table Builders. In LOG Files, we're outputting number of kDeletedKeys as connected with a single table, while it's actually the total count of deleted keys since creation of the DB.
For example, this table has 3155 entries and 1391828 deleted keys.
The problem with current approach that we call methods on a single TablePropertiesCollector for all the tables we create. Even worse, we could do it from multiple threads at the same time and TablePropertiesCollector has no way of knowing which table we're calling it for.
Good part: Looks like nobody inside Facebook is using Options::table_properties_collectors. This means we should be able to painfully change the API.
In this change, I introduce TablePropertiesCollectorFactory. For every table we create, we call `CreateTablePropertiesCollector`, which creates a TablePropertiesCollector for a single table. We then use it sequentially from a single thread, which means it doesn't have to be thread-safe.
Test Plan:
Added a test in table_properties_collector_test that fails on master (build two tables, assert that kDeletedKeys count is correct for the second one).
Also, all other tests
Reviewers: sdong, dhruba, haobo, kailiu
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18579
2014-05-13 19:30:55 +00:00
|
|
|
table_properties_collectors_;
|
2014-07-18 23:58:13 +00:00
|
|
|
|
2019-03-01 23:41:55 +00:00
|
|
|
BloomBlockBuilder bloom_block_;
|
|
|
|
std::unique_ptr<PlainTableIndexBuilder> index_builder_;
|
|
|
|
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-17 23:16:11 +00:00
|
|
|
WritableFileWriter* file_;
|
2013-10-29 03:34:02 +00:00
|
|
|
uint64_t offset_ = 0;
|
2019-03-01 23:41:55 +00:00
|
|
|
uint32_t bloom_bits_per_key_;
|
|
|
|
size_t huge_page_tlb_size_;
|
2013-10-29 03:34:02 +00:00
|
|
|
Status status_;
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 23:03:05 +00:00
|
|
|
IOStatus io_status_;
|
2013-12-06 00:51:26 +00:00
|
|
|
TableProperties properties_;
|
2014-06-18 23:36:48 +00:00
|
|
|
PlainTableKeyEncoder encoder_;
|
2013-10-29 03:34:02 +00:00
|
|
|
|
2019-03-01 23:41:55 +00:00
|
|
|
bool store_index_in_file_;
|
|
|
|
|
2014-07-18 23:58:13 +00:00
|
|
|
std::vector<uint32_t> keys_or_prefixes_hashes_;
|
2013-10-29 03:34:02 +00:00
|
|
|
bool closed_ = false; // Either Finish() or Abandon() has been called.
|
|
|
|
|
2014-07-18 23:58:13 +00:00
|
|
|
const SliceTransform* prefix_extractor_;
|
|
|
|
|
|
|
|
Slice GetPrefix(const Slice& target) const {
|
|
|
|
assert(target.size() >= 8); // target is internal key
|
2022-03-15 17:02:33 +00:00
|
|
|
return GetPrefixFromUserKey(ExtractUserKey(target));
|
2014-07-18 23:58:13 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Slice GetPrefix(const ParsedInternalKey& target) const {
|
|
|
|
return GetPrefixFromUserKey(target.user_key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice GetPrefixFromUserKey(const Slice& user_key) const {
|
|
|
|
if (!IsTotalOrderMode()) {
|
|
|
|
return prefix_extractor_->Transform(user_key);
|
|
|
|
} else {
|
|
|
|
// Use empty slice as prefix if prefix_extractor is not set.
|
|
|
|
// In that case,
|
|
|
|
// it falls back to pure binary search and
|
|
|
|
// total iterator seek is supported.
|
|
|
|
return Slice();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
|
2013-10-29 03:34:02 +00:00
|
|
|
};
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|