// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "rocksdb/status.h"

namespace rocksdb {

/**
 * Keep adding tickers here.
 *  1. Any ticker should be added before TICKER_ENUM_MAX.
 *  2. Add a readable string in TickersNameMap below for the newly added
 *     ticker.
 *  3. Add a corresponding enum value to TickerType.java in the java API
 *  4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType
 *     and toCppTickers
 */
enum Tickers : uint32_t {
  // total block cache misses
  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
  //                               BLOCK_CACHE_FILTER_MISS +
  //                               BLOCK_CACHE_DATA_MISS;
  BLOCK_CACHE_MISS = 0,
  // total block cache hit
  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
  //                              BLOCK_CACHE_FILTER_HIT +
  //                              BLOCK_CACHE_DATA_HIT;
  BLOCK_CACHE_HIT,
  // # of blocks added to block cache.
  BLOCK_CACHE_ADD,
  // # of failures when adding blocks to block cache.
  BLOCK_CACHE_ADD_FAILURES,
  // # of times cache miss when accessing index block from block cache.
  BLOCK_CACHE_INDEX_MISS,
  // # of times cache hit when accessing index block from block cache.
  BLOCK_CACHE_INDEX_HIT,
  // # of index blocks added to block cache.
  BLOCK_CACHE_INDEX_ADD,
  // # of bytes of index blocks inserted into cache
  BLOCK_CACHE_INDEX_BYTES_INSERT,
  // # of bytes of index block erased from cache
  BLOCK_CACHE_INDEX_BYTES_EVICT,
  // # of times cache miss when accessing filter block from block cache.
  BLOCK_CACHE_FILTER_MISS,
  // # of times cache hit when accessing filter block from block cache.
  BLOCK_CACHE_FILTER_HIT,
  // # of filter blocks added to block cache.
  BLOCK_CACHE_FILTER_ADD,
  // # of bytes of bloom filter blocks inserted into cache
  BLOCK_CACHE_FILTER_BYTES_INSERT,
  // # of bytes of bloom filter block erased from cache
  BLOCK_CACHE_FILTER_BYTES_EVICT,
  // # of times cache miss when accessing data block from block cache.
  BLOCK_CACHE_DATA_MISS,
  // # of times cache hit when accessing data block from block cache.
  BLOCK_CACHE_DATA_HIT,
  // # of data blocks added to block cache.
  BLOCK_CACHE_DATA_ADD,
  // # of bytes of data blocks inserted into cache
  BLOCK_CACHE_DATA_BYTES_INSERT,
  // # of bytes read from cache.
  BLOCK_CACHE_BYTES_READ,
  // # of bytes written into cache.
  BLOCK_CACHE_BYTES_WRITE,

  // # of times bloom filter has avoided file reads, i.e., negatives.
  BLOOM_FILTER_USEFUL,
  // # of times bloom FullFilter has not avoided the reads.
  BLOOM_FILTER_FULL_POSITIVE,
  // # of times bloom FullFilter has not avoided the reads and data actually
  // exist.
  BLOOM_FILTER_FULL_TRUE_POSITIVE,
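
  // A hedged, illustrative note (not from the original comments): given the
  // semantics above, the full-filter false-positive rate can be roughly
  // estimated from these tickers as
  //
  //   (BLOOM_FILTER_FULL_POSITIVE - BLOOM_FILTER_FULL_TRUE_POSITIVE) /
  //       (BLOOM_FILTER_USEFUL + BLOOM_FILTER_FULL_POSITIVE)
  //
  // i.e. positives that did not find data, over all filter checks.
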
  BLOOM_FILTER_MICROS,

  // # persistent cache hit
  PERSISTENT_CACHE_HIT,
  // # persistent cache miss
  PERSISTENT_CACHE_MISS,

  // # total simulation block cache hits
  SIM_BLOCK_CACHE_HIT,
  // # total simulation block cache misses
  SIM_BLOCK_CACHE_MISS,

  // # of memtable hits.
  MEMTABLE_HIT,
  // # of memtable misses.
  MEMTABLE_MISS,

  // # of Get() queries served by L0
  GET_HIT_L0,
  // # of Get() queries served by L1
  GET_HIT_L1,
  // # of Get() queries served by L2 and up
  GET_HIT_L2_AND_UP,

  /**
   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
   * There are 4 reasons currently.
   */
  COMPACTION_KEY_DROP_NEWER_ENTRY,  // key was written with a newer value.
                                    // Also includes keys dropped for range del.
  COMPACTION_KEY_DROP_OBSOLETE,     // The key is obsolete.
  COMPACTION_KEY_DROP_RANGE_DEL,    // key was covered by a range tombstone.
  COMPACTION_KEY_DROP_USER,  // user compaction function has dropped the key.
  COMPACTION_RANGE_DEL_DROP_OBSOLETE,  // all keys in range were deleted.
  // Deletions obsoleted before bottom level due to file gap optimization.
  COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
  // If a compaction was cancelled by the SstFileManager to prevent ENOSPC
  COMPACTION_CANCELLED,

  // Number of keys written to the database via the Put and Write calls
  NUMBER_KEYS_WRITTEN,
  // Number of keys read
  NUMBER_KEYS_READ,
  // Number of keys updated, if inplace update is enabled
  NUMBER_KEYS_UPDATED,
  // The number of uncompressed bytes issued by DB::Put(), DB::Delete(),
  // DB::Merge(), and DB::Write().
  BYTES_WRITTEN,
  // The number of uncompressed bytes read from DB::Get(). It could be
  // either from memtables, cache, or table files.
  // For the number of logical bytes read from DB::MultiGet(),
  // please use NUMBER_MULTIGET_BYTES_READ.
  BYTES_READ,
  // The number of calls to seek/next/prev
  NUMBER_DB_SEEK,
  NUMBER_DB_NEXT,
  NUMBER_DB_PREV,
  // The number of calls to seek/next/prev that returned data
  NUMBER_DB_SEEK_FOUND,
  NUMBER_DB_NEXT_FOUND,
  NUMBER_DB_PREV_FOUND,
  // The number of uncompressed bytes read from an iterator.
  // Includes size of key and value.
  ITER_BYTES_READ,
  NO_FILE_CLOSES,
  NO_FILE_OPENS,
  NO_FILE_ERRORS,
  // DEPRECATED Time system had to wait to do L0-L1 compactions
  STALL_L0_SLOWDOWN_MICROS,
  // DEPRECATED Time system had to wait to move memtable to L1.
  STALL_MEMTABLE_COMPACTION_MICROS,
  // DEPRECATED write throttle because of too many files in L0
  STALL_L0_NUM_FILES_MICROS,
  // Writer has to wait for compaction or flush to finish.
  STALL_MICROS,
  // The wait time for db mutex.
  // Disabled by default. To enable it set stats level to kAll
  DB_MUTEX_WAIT_MICROS,
  RATE_LIMIT_DELAY_MILLIS,
  // DEPRECATED number of iterators currently open
  NO_ITERATORS,

  // Number of MultiGet calls, keys read, and bytes read
  NUMBER_MULTIGET_CALLS,
  NUMBER_MULTIGET_KEYS_READ,
  NUMBER_MULTIGET_BYTES_READ,

  // Number of delete records that were not required to be
  // written to storage because key does not exist
  NUMBER_FILTERED_DELETES,
  NUMBER_MERGE_FAILURES,

  // number of times bloom was checked before creating iterator on a
  // file, and the number of times the check was useful in avoiding
  // iterator creation (and thus likely IOPs).
  BLOOM_FILTER_PREFIX_CHECKED,
  BLOOM_FILTER_PREFIX_USEFUL,

  // Number of times we had to reseek inside an iteration to skip
  // over large number of keys with same userkey.
  NUMBER_OF_RESEEKS_IN_ITERATION,

  // Record the number of calls to GetUpdatesSince. Useful to keep track of
  // transaction log iterator refreshes
  GET_UPDATES_SINCE_CALLS,
  BLOCK_CACHE_COMPRESSED_MISS,  // miss in the compressed block cache
  BLOCK_CACHE_COMPRESSED_HIT,   // hit in the compressed block cache
  // Number of blocks added to compressed block cache
  BLOCK_CACHE_COMPRESSED_ADD,
  // Number of failures when adding blocks to compressed block cache
  BLOCK_CACHE_COMPRESSED_ADD_FAILURES,
  WAL_FILE_SYNCED,  // Number of times WAL sync is done
  WAL_FILE_BYTES,   // Number of bytes written to WAL

  // Writes can be processed by requesting thread or by the thread at the
  // head of the writers queue.
  WRITE_DONE_BY_SELF,
  WRITE_DONE_BY_OTHER,  // Equivalent to writes done for others
  WRITE_TIMEDOUT,       // Number of writes that timed out.
  WRITE_WITH_WAL,       // Number of Write calls that request WAL
  COMPACT_READ_BYTES,   // Bytes read during compaction
  COMPACT_WRITE_BYTES,  // Bytes written during compaction
  FLUSH_WRITE_BYTES,    // Bytes written during flush

  // Number of table properties loaded directly from file, without creating
  // a table reader object.
  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
  NUMBER_SUPERVERSION_ACQUIRES,
  NUMBER_SUPERVERSION_RELEASES,
  NUMBER_SUPERVERSION_CLEANUPS,

  // # of compressions/decompressions executed
  NUMBER_BLOCK_COMPRESSED,
  NUMBER_BLOCK_DECOMPRESSED,

  NUMBER_BLOCK_NOT_COMPRESSED,
  MERGE_OPERATION_TOTAL_TIME,
  FILTER_OPERATION_TOTAL_TIME,

  // Row cache.
  ROW_CACHE_HIT,
  ROW_CACHE_MISS,

  // Read amplification statistics.
  // Read amplification can be calculated using this formula
  // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
  //
  // REQUIRES: BlockBasedTableOptions::read_amp_bytes_per_bit to be enabled
  READ_AMP_ESTIMATE_USEFUL_BYTES,  // Estimate of total bytes actually used.
  READ_AMP_TOTAL_READ_BYTES,       // Total size of loaded data blocks.
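
  // A hedged, illustrative sketch (not part of this header's API surface):
  // assuming a std::shared_ptr<Statistics> named `stats`, the two tickers
  // above combine into the read-amplification ratio roughly like this:
  //
  //   uint64_t useful = stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
  //   uint64_t total = stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
  //   double read_amp =
  //       useful > 0 ? static_cast<double>(total) / useful : 0.0;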

  // Number of refill intervals where rate limiter's bytes are fully consumed.
  NUMBER_RATE_LIMITER_DRAINS,

  // Number of internal keys skipped by Iterator
  NUMBER_ITER_SKIP,

  // BlobDB specific stats
  // # of Put/PutTTL/PutUntil to BlobDB.
  BLOB_DB_NUM_PUT,
  // # of Write to BlobDB.
  BLOB_DB_NUM_WRITE,
  // # of Get to BlobDB.
  BLOB_DB_NUM_GET,
  // # of MultiGet to BlobDB.
  BLOB_DB_NUM_MULTIGET,
  // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator.
  BLOB_DB_NUM_SEEK,
  // # of Next to BlobDB iterator.
  BLOB_DB_NUM_NEXT,
  // # of Prev to BlobDB iterator.
  BLOB_DB_NUM_PREV,
  // # of keys written to BlobDB.
  BLOB_DB_NUM_KEYS_WRITTEN,
  // # of keys read from BlobDB.
  BLOB_DB_NUM_KEYS_READ,
  // # of bytes (key + value) written to BlobDB.
  BLOB_DB_BYTES_WRITTEN,
  // # of bytes (key + value) read from BlobDB.
  BLOB_DB_BYTES_READ,
  // # of keys written by BlobDB as non-TTL inlined value.
  BLOB_DB_WRITE_INLINED,
  // # of keys written by BlobDB as TTL inlined value.
  BLOB_DB_WRITE_INLINED_TTL,
  // # of keys written by BlobDB as non-TTL blob value.
  BLOB_DB_WRITE_BLOB,
  // # of keys written by BlobDB as TTL blob value.
  BLOB_DB_WRITE_BLOB_TTL,
  // # of bytes written to blob file.
  BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
  // # of bytes read from blob file.
  BLOB_DB_BLOB_FILE_BYTES_READ,
  // # of times a blob file is synced.
  BLOB_DB_BLOB_FILE_SYNCED,
  // # of blob index evicted from base DB by BlobDB compaction filter because
  // of expiration.
  BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
  // size of blob index evicted from base DB by BlobDB compaction filter
  // because of expiration.
  BLOB_DB_BLOB_INDEX_EXPIRED_SIZE,
  // # of blob index evicted from base DB by BlobDB compaction filter because
  // of corresponding file deleted.
  BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
  // size of blob index evicted from base DB by BlobDB compaction filter
  // because of corresponding file deleted.
  BLOB_DB_BLOB_INDEX_EVICTED_SIZE,
  // # of blob files being garbage collected.
  BLOB_DB_GC_NUM_FILES,
  // # of blob files generated by garbage collection.
  BLOB_DB_GC_NUM_NEW_FILES,
  // # of BlobDB garbage collection failures.
  BLOB_DB_GC_FAILURES,
  // # of keys dropped by BlobDB garbage collection because they had been
  // overwritten.
  BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
  // # of keys dropped by BlobDB garbage collection because of expiration.
  BLOB_DB_GC_NUM_KEYS_EXPIRED,
  // # of keys relocated to new blob file by garbage collection.
  BLOB_DB_GC_NUM_KEYS_RELOCATED,
  // # of bytes dropped by BlobDB garbage collection because they had been
  // overwritten.
  BLOB_DB_GC_BYTES_OVERWRITTEN,
  // # of bytes dropped by BlobDB garbage collection because of expiration.
  BLOB_DB_GC_BYTES_EXPIRED,
  // # of bytes relocated to new blob file by garbage collection.
  BLOB_DB_GC_BYTES_RELOCATED,
  // # of blob files evicted because BlobDB is full.
  BLOB_DB_FIFO_NUM_FILES_EVICTED,
  // # of keys in the blob files evicted because BlobDB is full.
  BLOB_DB_FIFO_NUM_KEYS_EVICTED,
  // # of bytes in the blob files evicted because BlobDB is full.
  BLOB_DB_FIFO_BYTES_EVICTED,

  // These counters indicate a performance issue in WritePrepared transactions.
  // We should not see them ticking much.
  // # of times prepare_mutex_ is acquired in the fast path.
  TXN_PREPARE_MUTEX_OVERHEAD,
  // # of times old_commit_map_mutex_ is acquired in the fast path.
  TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
  // # of times we checked a batch for duplicate keys.
  TXN_DUPLICATE_KEY_OVERHEAD,
  // # of times snapshot_mutex_ is acquired in the fast path.
  TXN_SNAPSHOT_MUTEX_OVERHEAD,
  // # of times ::Get returned TryAgain due to expired snapshot seq
  TXN_GET_TRY_AGAIN,

  // Number of keys actually found in MultiGet calls (vs number requested by
  // caller)
  // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller
  NUMBER_MULTIGET_KEYS_FOUND,

  NO_ITERATOR_CREATED,  // number of iterators created
  NO_ITERATOR_DELETED,  // number of iterators deleted

  BLOCK_CACHE_COMPRESSION_DICT_MISS,
  BLOCK_CACHE_COMPRESSION_DICT_HIT,
  BLOCK_CACHE_COMPRESSION_DICT_ADD,
  BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
  BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT,
  TICKER_ENUM_MAX
};

// The order of items listed in Tickers should be the same as
// the order listed in TickersNameMap
extern const std::vector<std::pair<Tickers, std::string>> TickersNameMap;
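
// A hedged usage sketch (illustrative only): because this map is kept in the
// same order as the Tickers enum, it can be paired with a Statistics object to
// dump every ticker by name. Assumes a std::shared_ptr<Statistics> named
// `stats` and that <cinttypes> is included for PRIu64:
//
//   for (const auto& name_entry : TickersNameMap) {
//     printf("%s: %" PRIu64 "\n", name_entry.second.c_str(),
//            stats->getTickerCount(name_entry.first));
//   }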

/**
 * Keep adding histograms here.
 * Any histogram should have value less than HISTOGRAM_ENUM_MAX
 * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX
 * Add a string representation in HistogramsNameMap below
 * And increment HISTOGRAM_ENUM_MAX
 * Add a corresponding enum value to HistogramType.java in the java API
 */
enum Histograms : uint32_t {
  DB_GET = 0,
  DB_WRITE,
  COMPACTION_TIME,
  COMPACTION_CPU_TIME,
  SUBCOMPACTION_SETUP_TIME,
  TABLE_SYNC_MICROS,
  COMPACTION_OUTFILE_SYNC_MICROS,
  WAL_FILE_SYNC_MICROS,
  MANIFEST_FILE_SYNC_MICROS,
  // TIME SPENT IN IO DURING TABLE OPEN
  TABLE_OPEN_IO_MICROS,
  DB_MULTIGET,
  READ_BLOCK_COMPACTION_MICROS,
  READ_BLOCK_GET_MICROS,
  WRITE_RAW_BLOCK_MICROS,
  STALL_L0_SLOWDOWN_COUNT,
  STALL_MEMTABLE_COMPACTION_COUNT,
  STALL_L0_NUM_FILES_COUNT,
  HARD_RATE_LIMIT_DELAY_COUNT,
  SOFT_RATE_LIMIT_DELAY_COUNT,
  NUM_FILES_IN_SINGLE_COMPACTION,
  DB_SEEK,
  WRITE_STALL,
  SST_READ_MICROS,
  // The number of subcompactions actually scheduled during a compaction
  NUM_SUBCOMPACTIONS_SCHEDULED,
  // Value size distribution in each operation
  BYTES_PER_READ,
  BYTES_PER_WRITE,
  BYTES_PER_MULTIGET,

  // number of bytes compressed/decompressed
  // (the counts are of uncompressed bytes, i.e. before compression and after
  // decompression respectively)
  BYTES_COMPRESSED,
  BYTES_DECOMPRESSED,
  COMPRESSION_TIMES_NANOS,
  DECOMPRESSION_TIMES_NANOS,
  // Number of merge operands passed to the merge operator in user read
  // requests.
  READ_NUM_MERGE_OPERANDS,

  // BlobDB specific stats
  // Size of keys written to BlobDB.
  BLOB_DB_KEY_SIZE,
  // Size of values written to BlobDB.
  BLOB_DB_VALUE_SIZE,
  // BlobDB Put/PutWithTTL/PutUntil/Write latency.
  BLOB_DB_WRITE_MICROS,
  // BlobDB Get latency.
  BLOB_DB_GET_MICROS,
  // BlobDB MultiGet latency.
  BLOB_DB_MULTIGET_MICROS,
  // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency.
  BLOB_DB_SEEK_MICROS,
  // BlobDB Next latency.
  BLOB_DB_NEXT_MICROS,
  // BlobDB Prev latency.
  BLOB_DB_PREV_MICROS,
  // Blob file write latency.
  BLOB_DB_BLOB_FILE_WRITE_MICROS,
  // Blob file read latency.
  BLOB_DB_BLOB_FILE_READ_MICROS,
  // Blob file sync latency.
  BLOB_DB_BLOB_FILE_SYNC_MICROS,
  // BlobDB garbage collection time.
  BLOB_DB_GC_MICROS,
  // BlobDB compression time.
  BLOB_DB_COMPRESSION_MICROS,
  // BlobDB decompression time.
  BLOB_DB_DECOMPRESSION_MICROS,
  // Time spent flushing memtable to disk
  FLUSH_TIME,
  SST_BATCH_SIZE,

  HISTOGRAM_ENUM_MAX,
};

extern const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap;

struct HistogramData {
  double median;
  double percentile95;
  double percentile99;
  double average;
  double standard_deviation;
  // zero-initialize new members since old Statistics::histogramData()
  // implementations won't write them.
  double max = 0.0;
  uint64_t count = 0;
  uint64_t sum = 0;
  double min = 0.0;
};
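
// A hedged, illustrative sketch (not part of the API): after filling a
// HistogramData via Statistics::histogramData(), the fields can be read
// directly, e.g. to log flush latency percentiles (assumes a
// std::shared_ptr<Statistics> named `stats`):
//
//   HistogramData flush_hist;
//   stats->histogramData(FLUSH_TIME, &flush_hist);
//   printf("flush time: p50=%.1f p95=%.1f p99=%.1f max=%.1f count=%llu\n",
//          flush_hist.median, flush_hist.percentile95,
//          flush_hist.percentile99, flush_hist.max,
//          static_cast<unsigned long long>(flush_hist.count));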

// StatsLevel can be used to reduce statistics overhead by skipping certain
// types of stats in the stats collection process.
// Usage:
//   options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
enum StatsLevel : uint8_t {
  // Disable timer stats, and skip histogram stats
  kExceptHistogramOrTimers,
  // Skip timer stats
  kExceptTimers,
  // Collect all stats except time inside mutex lock AND time spent on
  // compression.
  kExceptDetailedTimers,
  // Collect all stats except the counters requiring to get time inside the
  // mutex lock.
  kExceptTimeForMutex,
  // Collect all stats, including measuring duration of mutex operations.
  // If getting time is expensive on the platform to run, it can
  // reduce scalability to more threads, especially for writes.
  kAll,
};

// Analyze the performance of a db by providing cumulative stats over time.
// Usage:
//   Options options;
//   options.statistics = rocksdb::CreateDBStatistics();
//   Status s = DB::Open(options, kDBPath, &db);
//   ...
//   options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
//   HistogramData hist;
//   options.statistics->histogramData(FLUSH_TIME, &hist);
class Statistics {
 public:
  virtual ~Statistics() {}
  static const char* Type() { return "Statistics"; }
  virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
  virtual void histogramData(uint32_t type,
                             HistogramData* const data) const = 0;
  virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; }
  virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
  virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
  virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0;
  virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) {
    if (get_stats_level() <= StatsLevel::kExceptTimers) {
      return;
    }
    recordInHistogram(histogramType, time);
  }
  // The function is here only for backward compatibility reasons.
  // Users implementing their own Statistics class should override
  // recordInHistogram() instead and leave measureTime() as it is.
  virtual void measureTime(uint32_t /*histogramType*/, uint64_t /*time*/) {
    // This is not supposed to be called.
    assert(false);
  }
  virtual void recordInHistogram(uint32_t histogramType, uint64_t time) {
    // measureTime() is the old and inaccurate function name. To stay backward
    // compatible with user statistics that override measureTime() but not
    // this function, we forward to measureTime().
    measureTime(histogramType, time);
  }

  // Resets all ticker and histogram stats
  virtual Status Reset() { return Status::NotSupported("Not implemented"); }

  // String representation of the statistic object.
  virtual std::string ToString() const {
    // Do nothing by default
    return std::string("ToString(): not implemented");
  }

  virtual bool getTickerMap(std::map<std::string, uint64_t>*) const {
    // Do nothing by default
    return false;
  }

  // Override this function to disable particular histogram collection
  virtual bool HistEnabledForType(uint32_t type) const {
    return type < HISTOGRAM_ENUM_MAX;
  }
  void set_stats_level(StatsLevel sl) {
    stats_level_.store(sl, std::memory_order_relaxed);
  }
  StatsLevel get_stats_level() const {
    return stats_level_.load(std::memory_order_relaxed);
  }

 private:
  std::atomic<StatsLevel> stats_level_{kExceptDetailedTimers};
};
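
// A hedged sketch (illustrative only, not part of this header) of how a
// user-provided Statistics implementation might look. Per the comments above,
// it overrides recordInHistogram() rather than the legacy measureTime();
// CountingStats is a hypothetical name and the sketch is not thread-safe:
//
//   class CountingStats : public rocksdb::Statistics {
//    public:
//     uint64_t getTickerCount(uint32_t t) const override {
//       auto it = ticks_.find(t);
//       return it == ticks_.end() ? 0 : it->second;
//     }
//     void recordTick(uint32_t t, uint64_t c = 0) override { ticks_[t] += c; }
//     void setTickerCount(uint32_t t, uint64_t c) override { ticks_[t] = c; }
//     uint64_t getAndResetTickerCount(uint32_t t) override {
//       uint64_t v = getTickerCount(t);
//       ticks_[t] = 0;
//       return v;
//     }
//     void recordInHistogram(uint32_t /*type*/, uint64_t /*value*/) override {
//       // Ignore histograms in this sketch.
//     }
//     void histogramData(uint32_t /*type*/,
//                        rocksdb::HistogramData* const /*data*/) const override {
//     }
//
//    private:
//     std::map<uint32_t, uint64_t> ticks_;
//   };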

// Create a concrete DBStatistics object
std::shared_ptr<Statistics> CreateDBStatistics();

}  // namespace rocksdb