2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 21:59:46 +00:00
|
|
|
//
|
2023-05-17 18:27:09 +00:00
|
|
|
#include "rocksdb/statistics.h"
|
2014-08-01 03:52:13 +00:00
|
|
|
|
2014-01-22 01:51:36 +00:00
|
|
|
#include <algorithm>
|
2019-09-20 19:00:55 +00:00
|
|
|
#include <cinttypes>
|
2013-06-19 03:28:41 +00:00
|
|
|
#include <cstdio>
|
2021-09-10 16:46:47 +00:00
|
|
|
|
2023-05-17 18:27:09 +00:00
|
|
|
#include "monitoring/statistics_impl.h"
|
2021-09-10 16:46:47 +00:00
|
|
|
#include "rocksdb/convenience.h"
|
|
|
|
#include "rocksdb/utilities/customizable_util.h"
|
|
|
|
#include "rocksdb/utilities/options_type.h"
|
|
|
|
#include "util/string_util.h"
|
2013-06-19 03:28:41 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2013-06-19 03:28:41 +00:00
|
|
|
|
2018-11-27 05:30:12 +00:00
|
|
|
// The order of items listed in Tickers should be the same as
|
|
|
|
// the order listed in TickersNameMap
|
|
|
|
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
|
|
|
|
{BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"},
|
|
|
|
{BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"},
|
|
|
|
{BLOCK_CACHE_ADD, "rocksdb.block.cache.add"},
|
|
|
|
{BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"},
|
|
|
|
{BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"},
|
|
|
|
{BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"},
|
|
|
|
{BLOCK_CACHE_INDEX_ADD, "rocksdb.block.cache.index.add"},
|
|
|
|
{BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"},
|
|
|
|
{BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"},
|
|
|
|
{BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
|
|
|
|
{BLOCK_CACHE_FILTER_ADD, "rocksdb.block.cache.filter.add"},
|
|
|
|
{BLOCK_CACHE_FILTER_BYTES_INSERT,
|
|
|
|
"rocksdb.block.cache.filter.bytes.insert"},
|
|
|
|
{BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
|
|
|
|
{BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
|
|
|
|
{BLOCK_CACHE_DATA_ADD, "rocksdb.block.cache.data.add"},
|
|
|
|
{BLOCK_CACHE_DATA_BYTES_INSERT, "rocksdb.block.cache.data.bytes.insert"},
|
|
|
|
{BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"},
|
|
|
|
{BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"},
|
2024-02-16 01:22:03 +00:00
|
|
|
{BLOCK_CACHE_COMPRESSION_DICT_MISS,
|
|
|
|
"rocksdb.block.cache.compression.dict.miss"},
|
|
|
|
{BLOCK_CACHE_COMPRESSION_DICT_HIT,
|
|
|
|
"rocksdb.block.cache.compression.dict.hit"},
|
|
|
|
{BLOCK_CACHE_COMPRESSION_DICT_ADD,
|
|
|
|
"rocksdb.block.cache.compression.dict.add"},
|
|
|
|
{BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
|
|
|
|
"rocksdb.block.cache.compression.dict.bytes.insert"},
|
|
|
|
{BLOCK_CACHE_ADD_REDUNDANT, "rocksdb.block.cache.add.redundant"},
|
|
|
|
{BLOCK_CACHE_INDEX_ADD_REDUNDANT,
|
|
|
|
"rocksdb.block.cache.index.add.redundant"},
|
|
|
|
{BLOCK_CACHE_FILTER_ADD_REDUNDANT,
|
|
|
|
"rocksdb.block.cache.filter.add.redundant"},
|
|
|
|
{BLOCK_CACHE_DATA_ADD_REDUNDANT, "rocksdb.block.cache.data.add.redundant"},
|
|
|
|
{BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT,
|
|
|
|
"rocksdb.block.cache.compression.dict.add.redundant"},
|
|
|
|
{SECONDARY_CACHE_HITS, "rocksdb.secondary.cache.hits"},
|
|
|
|
{SECONDARY_CACHE_FILTER_HITS, "rocksdb.secondary.cache.filter.hits"},
|
|
|
|
{SECONDARY_CACHE_INDEX_HITS, "rocksdb.secondary.cache.index.hits"},
|
|
|
|
{SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"},
|
|
|
|
{COMPRESSED_SECONDARY_CACHE_DUMMY_HITS,
|
|
|
|
"rocksdb.compressed.secondary.cache.dummy.hits"},
|
|
|
|
{COMPRESSED_SECONDARY_CACHE_HITS,
|
|
|
|
"rocksdb.compressed.secondary.cache.hits"},
|
|
|
|
{COMPRESSED_SECONDARY_CACHE_PROMOTIONS,
|
|
|
|
"rocksdb.compressed.secondary.cache.promotions"},
|
|
|
|
{COMPRESSED_SECONDARY_CACHE_PROMOTION_SKIPS,
|
|
|
|
"rocksdb.compressed.secondary.cache.promotion.skips"},
|
2018-11-27 05:30:12 +00:00
|
|
|
{BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
|
|
|
|
{BLOOM_FILTER_FULL_POSITIVE, "rocksdb.bloom.filter.full.positive"},
|
|
|
|
{BLOOM_FILTER_FULL_TRUE_POSITIVE,
|
|
|
|
"rocksdb.bloom.filter.full.true.positive"},
|
2024-02-16 01:22:03 +00:00
|
|
|
{BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
|
|
|
|
{BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
|
|
|
|
{BLOOM_FILTER_PREFIX_TRUE_POSITIVE,
|
|
|
|
"rocksdb.bloom.filter.prefix.true.positive"},
|
2018-11-27 05:30:12 +00:00
|
|
|
{PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"},
|
|
|
|
{PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"},
|
|
|
|
{SIM_BLOCK_CACHE_HIT, "rocksdb.sim.block.cache.hit"},
|
|
|
|
{SIM_BLOCK_CACHE_MISS, "rocksdb.sim.block.cache.miss"},
|
|
|
|
{MEMTABLE_HIT, "rocksdb.memtable.hit"},
|
|
|
|
{MEMTABLE_MISS, "rocksdb.memtable.miss"},
|
|
|
|
{GET_HIT_L0, "rocksdb.l0.hit"},
|
|
|
|
{GET_HIT_L1, "rocksdb.l1.hit"},
|
|
|
|
{GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"},
|
|
|
|
{COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"},
|
|
|
|
{COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"},
|
|
|
|
{COMPACTION_KEY_DROP_RANGE_DEL, "rocksdb.compaction.key.drop.range_del"},
|
|
|
|
{COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"},
|
|
|
|
{COMPACTION_RANGE_DEL_DROP_OBSOLETE,
|
|
|
|
"rocksdb.compaction.range_del.drop.obsolete"},
|
|
|
|
{COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
|
|
|
|
"rocksdb.compaction.optimized.del.drop.obsolete"},
|
|
|
|
{COMPACTION_CANCELLED, "rocksdb.compaction.cancelled"},
|
|
|
|
{NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
|
|
|
|
{NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
|
|
|
|
{NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
|
|
|
|
{BYTES_WRITTEN, "rocksdb.bytes.written"},
|
|
|
|
{BYTES_READ, "rocksdb.bytes.read"},
|
|
|
|
{NUMBER_DB_SEEK, "rocksdb.number.db.seek"},
|
|
|
|
{NUMBER_DB_NEXT, "rocksdb.number.db.next"},
|
|
|
|
{NUMBER_DB_PREV, "rocksdb.number.db.prev"},
|
|
|
|
{NUMBER_DB_SEEK_FOUND, "rocksdb.number.db.seek.found"},
|
|
|
|
{NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"},
|
|
|
|
{NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"},
|
|
|
|
{ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"},
|
2024-02-16 01:22:03 +00:00
|
|
|
{NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"},
|
|
|
|
{NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
|
|
|
|
{NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"},
|
|
|
|
{NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"},
|
2018-11-27 05:30:12 +00:00
|
|
|
{NO_FILE_OPENS, "rocksdb.no.file.opens"},
|
|
|
|
{NO_FILE_ERRORS, "rocksdb.no.file.errors"},
|
|
|
|
{STALL_MICROS, "rocksdb.stall.micros"},
|
|
|
|
{DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"},
|
|
|
|
{NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"},
|
|
|
|
{NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"},
|
|
|
|
{NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
|
2024-02-16 01:22:03 +00:00
|
|
|
{NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"},
|
2018-11-27 05:30:12 +00:00
|
|
|
{NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
|
|
|
|
{GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"},
|
|
|
|
{WAL_FILE_SYNCED, "rocksdb.wal.synced"},
|
|
|
|
{WAL_FILE_BYTES, "rocksdb.wal.bytes"},
|
|
|
|
{WRITE_DONE_BY_SELF, "rocksdb.write.self"},
|
|
|
|
{WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
|
|
|
|
{WRITE_WITH_WAL, "rocksdb.write.wal"},
|
|
|
|
{COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
|
|
|
|
{COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
|
|
|
|
{FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"},
|
2020-07-29 20:38:07 +00:00
|
|
|
{COMPACT_READ_BYTES_MARKED, "rocksdb.compact.read.marked.bytes"},
|
|
|
|
{COMPACT_READ_BYTES_PERIODIC, "rocksdb.compact.read.periodic.bytes"},
|
|
|
|
{COMPACT_READ_BYTES_TTL, "rocksdb.compact.read.ttl.bytes"},
|
|
|
|
{COMPACT_WRITE_BYTES_MARKED, "rocksdb.compact.write.marked.bytes"},
|
|
|
|
{COMPACT_WRITE_BYTES_PERIODIC, "rocksdb.compact.write.periodic.bytes"},
|
|
|
|
{COMPACT_WRITE_BYTES_TTL, "rocksdb.compact.write.ttl.bytes"},
|
2018-11-27 05:30:12 +00:00
|
|
|
{NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
|
|
|
|
"rocksdb.number.direct.load.table.properties"},
|
|
|
|
{NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"},
|
|
|
|
{NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"},
|
|
|
|
{NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"},
|
|
|
|
{NUMBER_BLOCK_COMPRESSED, "rocksdb.number.block.compressed"},
|
|
|
|
{NUMBER_BLOCK_DECOMPRESSED, "rocksdb.number.block.decompressed"},
|
2024-02-16 01:22:03 +00:00
|
|
|
{BYTES_COMPRESSED_FROM, "rocksdb.bytes.compressed.from"},
|
|
|
|
{BYTES_COMPRESSED_TO, "rocksdb.bytes.compressed.to"},
|
|
|
|
{BYTES_COMPRESSION_BYPASSED, "rocksdb.bytes.compression_bypassed"},
|
|
|
|
{BYTES_COMPRESSION_REJECTED, "rocksdb.bytes.compression.rejected"},
|
|
|
|
{NUMBER_BLOCK_COMPRESSION_BYPASSED,
|
|
|
|
"rocksdb.number.block_compression_bypassed"},
|
|
|
|
{NUMBER_BLOCK_COMPRESSION_REJECTED,
|
|
|
|
"rocksdb.number.block_compression_rejected"},
|
|
|
|
{BYTES_DECOMPRESSED_FROM, "rocksdb.bytes.decompressed.from"},
|
|
|
|
{BYTES_DECOMPRESSED_TO, "rocksdb.bytes.decompressed.to"},
|
2018-11-27 05:30:12 +00:00
|
|
|
{MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"},
|
|
|
|
{FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"},
|
2023-09-12 22:48:36 +00:00
|
|
|
{COMPACTION_CPU_TOTAL_TIME, "rocksdb.compaction.total.time.cpu_micros"},
|
2018-11-27 05:30:12 +00:00
|
|
|
{ROW_CACHE_HIT, "rocksdb.row.cache.hit"},
|
|
|
|
{ROW_CACHE_MISS, "rocksdb.row.cache.miss"},
|
|
|
|
{READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"},
|
|
|
|
{READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"},
|
|
|
|
{NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"},
|
|
|
|
{BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"},
|
|
|
|
{BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"},
|
|
|
|
{BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"},
|
|
|
|
{BLOB_DB_NUM_MULTIGET, "rocksdb.blobdb.num.multiget"},
|
|
|
|
{BLOB_DB_NUM_SEEK, "rocksdb.blobdb.num.seek"},
|
|
|
|
{BLOB_DB_NUM_NEXT, "rocksdb.blobdb.num.next"},
|
|
|
|
{BLOB_DB_NUM_PREV, "rocksdb.blobdb.num.prev"},
|
|
|
|
{BLOB_DB_NUM_KEYS_WRITTEN, "rocksdb.blobdb.num.keys.written"},
|
|
|
|
{BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"},
|
|
|
|
{BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"},
|
|
|
|
{BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"},
|
|
|
|
{BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"},
|
|
|
|
{BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"},
|
|
|
|
{BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"},
|
|
|
|
{BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"},
|
|
|
|
{BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"},
|
|
|
|
{BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file.bytes.read"},
|
|
|
|
{BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"},
|
|
|
|
{BLOB_DB_BLOB_INDEX_EXPIRED_COUNT,
|
|
|
|
"rocksdb.blobdb.blob.index.expired.count"},
|
|
|
|
{BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, "rocksdb.blobdb.blob.index.expired.size"},
|
|
|
|
{BLOB_DB_BLOB_INDEX_EVICTED_COUNT,
|
|
|
|
"rocksdb.blobdb.blob.index.evicted.count"},
|
|
|
|
{BLOB_DB_BLOB_INDEX_EVICTED_SIZE, "rocksdb.blobdb.blob.index.evicted.size"},
|
|
|
|
{BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"},
|
|
|
|
{BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"},
|
|
|
|
{BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"},
|
|
|
|
{BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"},
|
|
|
|
{BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"},
|
|
|
|
{BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"},
|
|
|
|
{BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"},
|
|
|
|
{BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"},
|
2024-02-16 01:22:03 +00:00
|
|
|
{BLOB_DB_CACHE_MISS, "rocksdb.blobdb.cache.miss"},
|
|
|
|
{BLOB_DB_CACHE_HIT, "rocksdb.blobdb.cache.hit"},
|
|
|
|
{BLOB_DB_CACHE_ADD, "rocksdb.blobdb.cache.add"},
|
|
|
|
{BLOB_DB_CACHE_ADD_FAILURES, "rocksdb.blobdb.cache.add.failures"},
|
|
|
|
{BLOB_DB_CACHE_BYTES_READ, "rocksdb.blobdb.cache.bytes.read"},
|
|
|
|
{BLOB_DB_CACHE_BYTES_WRITE, "rocksdb.blobdb.cache.bytes.write"},
|
2018-11-27 05:30:12 +00:00
|
|
|
{TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"},
|
|
|
|
{TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
|
|
|
|
"rocksdb.txn.overhead.mutex.old.commit.map"},
|
|
|
|
{TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"},
|
|
|
|
{TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"},
|
2019-08-05 20:30:56 +00:00
|
|
|
{TXN_GET_TRY_AGAIN, "rocksdb.txn.get.tryagain"},
|
2020-06-05 16:41:03 +00:00
|
|
|
{FILES_MARKED_TRASH, "rocksdb.files.marked.trash"},
|
2023-06-16 17:05:25 +00:00
|
|
|
{FILES_DELETED_FROM_TRASH_QUEUE, "rocksdb.files.marked.trash.deleted"},
|
2020-06-05 16:41:03 +00:00
|
|
|
{FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"},
|
2023-06-09 20:25:57 +00:00
|
|
|
{ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.error.count"},
|
2021-03-18 05:36:42 +00:00
|
|
|
{ERROR_HANDLER_BG_IO_ERROR_COUNT,
|
2023-06-09 20:25:57 +00:00
|
|
|
"rocksdb.error.handler.bg.io.error.count"},
|
2021-03-18 05:36:42 +00:00
|
|
|
{ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
|
2023-06-09 20:25:57 +00:00
|
|
|
"rocksdb.error.handler.bg.retryable.io.error.count"},
|
2021-03-18 05:36:42 +00:00
|
|
|
{ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"},
|
|
|
|
{ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT,
|
|
|
|
"rocksdb.error.handler.autoresume.retry.total.count"},
|
|
|
|
{ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT,
|
|
|
|
"rocksdb.error.handler.autoresume.success.count"},
|
2021-06-18 11:56:43 +00:00
|
|
|
{MEMTABLE_PAYLOAD_BYTES_AT_FLUSH,
|
|
|
|
"rocksdb.memtable.payload.bytes.at.flush"},
|
|
|
|
{MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
|
|
|
|
"rocksdb.memtable.garbage.bytes.at.flush"},
|
2021-09-07 20:25:24 +00:00
|
|
|
{VERIFY_CHECKSUM_READ_BYTES, "rocksdb.verify_checksum.read.bytes"},
|
2021-09-08 01:23:58 +00:00
|
|
|
{BACKUP_READ_BYTES, "rocksdb.backup.read.bytes"},
|
|
|
|
{BACKUP_WRITE_BYTES, "rocksdb.backup.write.bytes"},
|
2021-09-28 20:59:15 +00:00
|
|
|
{REMOTE_COMPACT_READ_BYTES, "rocksdb.remote.compact.read.bytes"},
|
|
|
|
{REMOTE_COMPACT_WRITE_BYTES, "rocksdb.remote.compact.write.bytes"},
|
2021-11-16 23:15:48 +00:00
|
|
|
{HOT_FILE_READ_BYTES, "rocksdb.hot.file.read.bytes"},
|
|
|
|
{WARM_FILE_READ_BYTES, "rocksdb.warm.file.read.bytes"},
|
|
|
|
{COLD_FILE_READ_BYTES, "rocksdb.cold.file.read.bytes"},
|
|
|
|
{HOT_FILE_READ_COUNT, "rocksdb.hot.file.read.count"},
|
|
|
|
{WARM_FILE_READ_COUNT, "rocksdb.warm.file.read.count"},
|
|
|
|
{COLD_FILE_READ_COUNT, "rocksdb.cold.file.read.count"},
|
2022-02-18 21:35:36 +00:00
|
|
|
{LAST_LEVEL_READ_BYTES, "rocksdb.last.level.read.bytes"},
|
|
|
|
{LAST_LEVEL_READ_COUNT, "rocksdb.last.level.read.count"},
|
|
|
|
{NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"},
|
|
|
|
{NON_LAST_LEVEL_READ_COUNT, "rocksdb.non.last.level.read.count"},
|
Much better stats for seeks and prefix filtering (#11460)
Summary:
We want to know more about opportunities for better range filters, and the effectiveness of our own range filters. Currently the stats are very limited, essentially logging just hits and misses against prefix filters for range scans in BLOOM_FILTER_PREFIX_* without tracking the false positive rate. Perhaps confusingly, when prefix filters are used for point queries, the stats are currently going into the non-PREFIX tickers.
This change does several things:
* Introduce new stat tickers for seeks and related filtering, \*LEVEL_SEEK\*
* Most importantly, allows us to see opportunities for range filtering. Specifically, we can count how many times a seek in an SST file accesses at least one data block, and how many times at least one value() is then accessed. If a data block was accessed but no value(), we can generally assume that the key(s) seen was(were) not of interest so could have been filtered with the right kind of filter, avoiding the data block access.
* We can get the same level of detail when a filter (for now, prefix Bloom/ribbon) is used, or not. Specifically, we can infer a false positive rate for prefix filters (not available before) from the seek "false positive" rate: when a data block is accessed but no value() is called. (There can be other explanations for a seek false positive, but in typical iterator usage it would indicate a filter false positive.)
* For efficiency, I wanted to avoid making additional calls to the prefix extractor (or key comparisons, etc.), which would be required if we wanted to more precisely detect filter false positives. I believe that instrumenting value() is the best balance of efficiency vs. accurately measuring what we are often interested in.
* The stats are divided between last level and non-last levels, to help understand potential tiered storage use cases.
* The old BLOOM_FILTER_PREFIX_* stats have a different meaning: no longer referring to iterators but to point queries using prefix filters. BLOOM_FILTER_PREFIX_TRUE_POSITIVE is added for computing the prefix false positive rate on point queries, which can be due to filter false positives as well as different keys with the same prefix.
* Similarly, the non-PREFIX BLOOM_FILTER stats are now for whole key filtering only.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11460
Test Plan:
unit tests updated, including updating many to pop the stat value since last read to improve test
readability and maintainability.
Performance test shows a consistent small improvement with these changes, both with clang and with gcc. CPU profile indicates that RecordTick is using less CPU, and this makes sense at least for a high filter miss rate. Before, we were recording two ticks per filter miss in iterators (CHECKED & USEFUL) and now recording just one (FILTERED).
Create DB with
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
```
And run simultaneous before&after with
```
TEST_TMPDIR=/dev/shm ./db_bench -readonly -benchmarks=seekrandom[-X1000] -num=10000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -seek_nexts=1 -duration=20 -seed=43 -threads=8 -cache_size=1000000000 -statistics
```
Before: seekrandom [AVG 275 runs] : 189680 (± 222) ops/sec; 18.4 (± 0.0) MB/sec
After: seekrandom [AVG 275 runs] : 197110 (± 208) ops/sec; 19.1 (± 0.0) MB/sec
Reviewed By: ajkr
Differential Revision: D46029177
Pulled By: pdillinger
fbshipit-source-id: cdace79a2ea548d46c5900b068c5b7c3a02e5822
2023-05-19 22:25:49 +00:00
|
|
|
{LAST_LEVEL_SEEK_FILTERED, "rocksdb.last.level.seek.filtered"},
|
|
|
|
{LAST_LEVEL_SEEK_FILTER_MATCH, "rocksdb.last.level.seek.filter.match"},
|
|
|
|
{LAST_LEVEL_SEEK_DATA, "rocksdb.last.level.seek.data"},
|
|
|
|
{LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER,
|
|
|
|
"rocksdb.last.level.seek.data.useful.no.filter"},
|
|
|
|
{LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH,
|
|
|
|
"rocksdb.last.level.seek.data.useful.filter.match"},
|
|
|
|
{NON_LAST_LEVEL_SEEK_FILTERED, "rocksdb.non.last.level.seek.filtered"},
|
|
|
|
{NON_LAST_LEVEL_SEEK_FILTER_MATCH,
|
|
|
|
"rocksdb.non.last.level.seek.filter.match"},
|
|
|
|
{NON_LAST_LEVEL_SEEK_DATA, "rocksdb.non.last.level.seek.data"},
|
|
|
|
{NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER,
|
|
|
|
"rocksdb.non.last.level.seek.data.useful.no.filter"},
|
|
|
|
{NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH,
|
|
|
|
"rocksdb.non.last.level.seek.data.useful.filter.match"},
|
2022-06-16 19:12:43 +00:00
|
|
|
{BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"},
|
2023-05-13 01:16:11 +00:00
|
|
|
{BLOCK_CHECKSUM_MISMATCH_COUNT, "rocksdb.block.checksum.mismatch.count"},
|
2022-06-28 20:52:35 +00:00
|
|
|
{MULTIGET_COROUTINE_COUNT, "rocksdb.multiget.coroutine.count"},
|
2022-11-14 05:38:35 +00:00
|
|
|
{READ_ASYNC_MICROS, "rocksdb.read.async.micros"},
|
2023-02-28 18:36:56 +00:00
|
|
|
{ASYNC_READ_ERROR_COUNT, "rocksdb.async.read.error.count"},
|
Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265)
Summary:
**Context/Summary:**
We are adding new stats to measure behavior of prefetched tail size and look up into this buffer
The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265
Test Plan:
**- Piggyback on existing test**
**- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.**
```
./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true
```
```
rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600
rocksdb.table.open.prefetch.tail.miss COUNT : 91
rocksdb.table.open.prefetch.tail.hit COUNT : 1034
```
**- No perf regression observed in db_bench**
SETUP command: create same db with ~900 files for pre-change/post-change.
```
./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none
```
TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing.
```
./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true
```
async io = 0, direct io read = true
| seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs
-- | -- | -- | --
pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec
post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec
async io = 1, direct io read = true
| seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs
-- | -- | -- | --
pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec
post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec
Reviewed By: ajkr
Differential Revision: D43781467
Pulled By: hx235
fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
2023-03-15 21:02:43 +00:00
|
|
|
{TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"},
|
|
|
|
{TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"},
|
2023-04-06 22:39:38 +00:00
|
|
|
{TIMESTAMP_FILTER_TABLE_CHECKED, "rocksdb.timestamp.filter.table.checked"},
|
|
|
|
{TIMESTAMP_FILTER_TABLE_FILTERED,
|
|
|
|
"rocksdb.timestamp.filter.table.filtered"},
|
2023-08-18 22:52:04 +00:00
|
|
|
{READAHEAD_TRIMMED, "rocksdb.readahead.trimmed"},
|
2023-10-19 01:00:07 +00:00
|
|
|
{FIFO_MAX_SIZE_COMPACTIONS, "rocksdb.fifo.max.size.compactions"},
|
|
|
|
{FIFO_TTL_COMPACTIONS, "rocksdb.fifo.ttl.compactions"},
|
2023-10-23 21:42:44 +00:00
|
|
|
{PREFETCH_BYTES, "rocksdb.prefetch.bytes"},
|
|
|
|
{PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"},
|
|
|
|
{PREFETCH_HITS, "rocksdb.prefetch.hits"},
|
2024-03-18 23:16:05 +00:00
|
|
|
{SST_FOOTER_CORRUPTION_COUNT, "rocksdb.footer.corruption.count"},
|
2024-08-12 22:32:07 +00:00
|
|
|
{FILE_READ_CORRUPTION_RETRY_COUNT,
|
|
|
|
"rocksdb.file.read.corruption.retry.count"},
|
|
|
|
{FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
|
|
|
|
"rocksdb.file.read.corruption.retry.success.count"},
|
Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265)
Summary:
**Context/Summary:**
We are adding new stats to measure behavior of prefetched tail size and look up into this buffer
The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265
Test Plan:
**- Piggyback on existing test**
**- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.**
```
./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true
```
```
rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600
rocksdb.table.open.prefetch.tail.miss COUNT : 91
rocksdb.table.open.prefetch.tail.hit COUNT : 1034
```
**- No perf regression observed in db_bench**
SETUP command: create same db with ~900 files for pre-change/post-change.
```
./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none
```
TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing.
```
./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true
```
async io = 0, direct io read = true
| seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs
-- | -- | -- | --
pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec
post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec
async io = 1, direct io read = true
| seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs
-- | -- | -- | --
pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec
post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec
Reviewed By: ajkr
Differential Revision: D43781467
Pulled By: hx235
fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
2023-03-15 21:02:43 +00:00
|
|
|
};
|
2018-11-27 05:30:12 +00:00
|
|
|
|
|
|
|
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
|
|
|
|
{DB_GET, "rocksdb.db.get.micros"},
|
|
|
|
{DB_WRITE, "rocksdb.db.write.micros"},
|
|
|
|
{COMPACTION_TIME, "rocksdb.compaction.times.micros"},
|
2019-01-30 00:23:21 +00:00
|
|
|
{COMPACTION_CPU_TIME, "rocksdb.compaction.times.cpu_micros"},
|
2018-11-27 05:30:12 +00:00
|
|
|
{SUBCOMPACTION_SETUP_TIME, "rocksdb.subcompaction.setup.times.micros"},
|
|
|
|
{TABLE_SYNC_MICROS, "rocksdb.table.sync.micros"},
|
|
|
|
{COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros"},
|
|
|
|
{WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros"},
|
|
|
|
{MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros"},
|
|
|
|
{TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros"},
|
|
|
|
{DB_MULTIGET, "rocksdb.db.multiget.micros"},
|
|
|
|
{READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"},
|
|
|
|
{READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"},
|
|
|
|
{WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"},
|
|
|
|
{NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"},
|
|
|
|
{DB_SEEK, "rocksdb.db.seek.micros"},
|
|
|
|
{WRITE_STALL, "rocksdb.db.write.stall"},
|
|
|
|
{SST_READ_MICROS, "rocksdb.sst.read.micros"},
|
2023-04-21 16:07:18 +00:00
|
|
|
{FILE_READ_FLUSH_MICROS, "rocksdb.file.read.flush.micros"},
|
|
|
|
{FILE_READ_COMPACTION_MICROS, "rocksdb.file.read.compaction.micros"},
|
2023-05-18 16:26:29 +00:00
|
|
|
{FILE_READ_DB_OPEN_MICROS, "rocksdb.file.read.db.open.micros"},
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
{FILE_READ_GET_MICROS, "rocksdb.file.read.get.micros"},
|
|
|
|
{FILE_READ_MULTIGET_MICROS, "rocksdb.file.read.multiget.micros"},
|
|
|
|
{FILE_READ_DB_ITERATOR_MICROS, "rocksdb.file.read.db.iterator.micros"},
|
|
|
|
{FILE_READ_VERIFY_DB_CHECKSUM_MICROS,
|
|
|
|
"rocksdb.file.read.verify.db.checksum.micros"},
|
|
|
|
{FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS,
|
|
|
|
"rocksdb.file.read.verify.file.checksums.micros"},
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
{SST_WRITE_MICROS, "rocksdb.sst.write.micros"},
|
|
|
|
{FILE_WRITE_FLUSH_MICROS, "rocksdb.file.write.flush.micros"},
|
|
|
|
{FILE_WRITE_COMPACTION_MICROS, "rocksdb.file.write.compaction.micros"},
|
|
|
|
{FILE_WRITE_DB_OPEN_MICROS, "rocksdb.file.write.db.open.micros"},
|
2018-11-27 05:30:12 +00:00
|
|
|
{NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"},
|
|
|
|
{BYTES_PER_READ, "rocksdb.bytes.per.read"},
|
|
|
|
{BYTES_PER_WRITE, "rocksdb.bytes.per.write"},
|
|
|
|
{BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"},
|
|
|
|
{COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"},
|
|
|
|
{DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"},
|
|
|
|
{READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"},
|
|
|
|
{BLOB_DB_KEY_SIZE, "rocksdb.blobdb.key.size"},
|
|
|
|
{BLOB_DB_VALUE_SIZE, "rocksdb.blobdb.value.size"},
|
|
|
|
{BLOB_DB_WRITE_MICROS, "rocksdb.blobdb.write.micros"},
|
|
|
|
{BLOB_DB_GET_MICROS, "rocksdb.blobdb.get.micros"},
|
|
|
|
{BLOB_DB_MULTIGET_MICROS, "rocksdb.blobdb.multiget.micros"},
|
|
|
|
{BLOB_DB_SEEK_MICROS, "rocksdb.blobdb.seek.micros"},
|
|
|
|
{BLOB_DB_NEXT_MICROS, "rocksdb.blobdb.next.micros"},
|
|
|
|
{BLOB_DB_PREV_MICROS, "rocksdb.blobdb.prev.micros"},
|
|
|
|
{BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"},
|
|
|
|
{BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"},
|
|
|
|
{BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"},
|
|
|
|
{BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"},
|
|
|
|
{BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"},
|
|
|
|
{FLUSH_TIME, "rocksdb.db.flush.micros"},
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
{SST_BATCH_SIZE, "rocksdb.sst.batch.size"},
|
2024-02-16 01:22:03 +00:00
|
|
|
{MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"},
|
2020-10-07 20:27:03 +00:00
|
|
|
{NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
|
|
|
|
"rocksdb.num.index.and.filter.blocks.read.per.level"},
|
|
|
|
{NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"},
|
2024-02-16 01:22:03 +00:00
|
|
|
{NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"},
|
2021-03-18 05:36:42 +00:00
|
|
|
{ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
|
|
|
|
"rocksdb.error.handler.autoresume.retry.count"},
|
2022-04-06 21:26:53 +00:00
|
|
|
{ASYNC_READ_BYTES, "rocksdb.async.read.bytes"},
|
2022-04-26 04:58:22 +00:00
|
|
|
{POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"},
|
|
|
|
{PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"},
|
2022-08-29 21:37:44 +00:00
|
|
|
{ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"},
|
Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265)
Summary:
**Context/Summary:**
We are adding new stats to measure behavior of prefetched tail size and look up into this buffer
The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265
Test Plan:
**- Piggyback on existing test**
**- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.**
```
./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true
```
```
rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600
rocksdb.table.open.prefetch.tail.miss COUNT : 91
rocksdb.table.open.prefetch.tail.hit COUNT : 1034
```
**- No perf regression observed in db_bench**
SETUP command: create same db with ~900 files for pre-change/post-change.
```
./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none
```
TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing.
```
./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true
```
async io = 0, direct io read = true
| seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs
-- | -- | -- | --
pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec
post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec
async io = 1, direct io read = true
| seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs
-- | -- | -- | --
pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec
post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec
Reviewed By: ajkr
Differential Revision: D43781467
Pulled By: hx235
fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
2023-03-15 21:02:43 +00:00
|
|
|
{TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
|
|
|
|
"rocksdb.table.open.prefetch.tail.read.bytes"},
|
2018-11-27 05:30:12 +00:00
|
|
|
};
|
|
|
|
|
2014-01-17 20:46:06 +00:00
|
|
|
std::shared_ptr<Statistics> CreateDBStatistics() {
|
2018-11-27 20:56:40 +00:00
|
|
|
return std::make_shared<StatisticsImpl>(nullptr);
|
2014-01-17 20:46:06 +00:00
|
|
|
}
|
|
|
|
|
2021-09-10 16:46:47 +00:00
|
|
|
static int RegisterBuiltinStatistics(ObjectLibrary& library,
|
|
|
|
const std::string& /*arg*/) {
|
2022-01-11 14:32:42 +00:00
|
|
|
library.AddFactory<Statistics>(
|
2021-09-10 16:46:47 +00:00
|
|
|
StatisticsImpl::kClassName(),
|
|
|
|
[](const std::string& /*uri*/, std::unique_ptr<Statistics>* guard,
|
|
|
|
std::string* /* errmsg */) {
|
|
|
|
guard->reset(new StatisticsImpl(nullptr));
|
|
|
|
return guard->get();
|
|
|
|
});
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status Statistics::CreateFromString(const ConfigOptions& config_options,
|
|
|
|
const std::string& id,
|
|
|
|
std::shared_ptr<Statistics>* result) {
|
|
|
|
static std::once_flag once;
|
|
|
|
std::call_once(once, [&]() {
|
|
|
|
RegisterBuiltinStatistics(*(ObjectLibrary::Default().get()), "");
|
|
|
|
});
|
|
|
|
Status s;
|
|
|
|
if (id == "" || id == StatisticsImpl::kClassName()) {
|
|
|
|
result->reset(new StatisticsImpl(nullptr));
|
|
|
|
} else if (id == kNullptrString) {
|
|
|
|
result->reset();
|
|
|
|
} else {
|
2023-02-17 20:54:07 +00:00
|
|
|
s = LoadSharedObject<Statistics>(config_options, id, result);
|
2021-09-10 16:46:47 +00:00
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
static std::unordered_map<std::string, OptionTypeInfo> stats_type_info = {
|
|
|
|
{"inner", OptionTypeInfo::AsCustomSharedPtr<Statistics>(
|
|
|
|
0, OptionVerificationType::kByNameAllowFromNull,
|
|
|
|
OptionTypeFlags::kCompareNever)},
|
|
|
|
};
|
|
|
|
|
2018-11-27 20:56:40 +00:00
|
|
|
StatisticsImpl::StatisticsImpl(std::shared_ptr<Statistics> stats)
|
2021-09-10 16:46:47 +00:00
|
|
|
: stats_(std::move(stats)) {
|
|
|
|
RegisterOptions("StatisticsOptions", &stats_, &stats_type_info);
|
|
|
|
}
|
2014-01-17 20:46:06 +00:00
|
|
|
|
2023-12-01 19:10:30 +00:00
|
|
|
StatisticsImpl::~StatisticsImpl() = default;
|
2014-01-17 20:46:06 +00:00
|
|
|
|
2014-07-28 19:05:36 +00:00
|
|
|
uint64_t StatisticsImpl::getTickerCount(uint32_t tickerType) const {
|
2016-08-24 22:42:31 +00:00
|
|
|
MutexLock lock(&aggregate_lock_);
|
2017-04-26 22:19:50 +00:00
|
|
|
return getTickerCountLocked(tickerType);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t StatisticsImpl::getTickerCountLocked(uint32_t tickerType) const {
|
2018-11-27 20:56:40 +00:00
|
|
|
assert(tickerType < TICKER_ENUM_MAX);
|
2017-05-23 17:29:14 +00:00
|
|
|
uint64_t res = 0;
|
|
|
|
for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
|
|
|
|
res += per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType];
|
2016-11-21 02:14:33 +00:00
|
|
|
}
|
2017-05-23 17:29:14 +00:00
|
|
|
return res;
|
Thread-specific histogram statistics
Summary:
To reduce contention for atomics when HistogramStats are shared across
threads, this diff makes them thread-specific so updates are faster. This comes
at the expense of slower reads (much less frequent), which now require merging
all histograms. In this diff,
- Thread-specific HistogramImpl is created upon the thread's first measureTime()
- Thread-specific HistogramImpl are merged and deleted upon thread termination or ThreadLocalPtr destruction, whichever comes first
- getHistogramString() and histogramData() merge all histograms, both thread-specific and previously merged ones
Test Plan:
unit tests, ran db_bench and verified histograms look similar
before:
$ TEST_TMPDIR=/dev/shm/ perf record -g ./db_bench --benchmarks=readwhilewriting --statistics --num=1000000 --use_existing_db --threads=64 --cache_size=250000000 --compression_type=lz4
...
+ 7.63% db_bench db_bench [.] rocksdb::HistogramStat::Add
after:
$ TEST_TMPDIR=/dev/shm/ perf record -g ./db_bench --benchmarks=readwhilewriting --statistics --num=1000000 --use_existing_db --threads=64 --cache_size=250000000 --compression_type=lz4
...
+ 0.98% db_bench db_bench [.] rocksdb::HistogramStat::Add
Reviewers: sdong, MarkCallaghan, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: andrewkr, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D62649
2016-08-31 21:02:09 +00:00
|
|
|
}
|
|
|
|
|
2014-07-28 19:05:36 +00:00
|
|
|
void StatisticsImpl::histogramData(uint32_t histogramType,
|
|
|
|
HistogramData* const data) const {
|
2017-04-26 22:19:50 +00:00
|
|
|
MutexLock lock(&aggregate_lock_);
|
2017-05-23 17:29:14 +00:00
|
|
|
getHistogramImplLocked(histogramType)->Data(data);
|
2017-04-26 22:19:50 +00:00
|
|
|
}
|
|
|
|
|
2017-05-23 17:29:14 +00:00
|
|
|
std::unique_ptr<HistogramImpl> StatisticsImpl::getHistogramImplLocked(
|
|
|
|
uint32_t histogramType) const {
|
2018-11-27 20:56:40 +00:00
|
|
|
assert(histogramType < HISTOGRAM_ENUM_MAX);
|
2017-05-23 17:29:14 +00:00
|
|
|
std::unique_ptr<HistogramImpl> res_hist(new HistogramImpl());
|
|
|
|
for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
|
|
|
|
res_hist->Merge(
|
|
|
|
per_core_stats_.AccessAtCore(core_idx)->histograms_[histogramType]);
|
|
|
|
}
|
|
|
|
return res_hist;
|
2014-01-17 20:46:06 +00:00
|
|
|
}
|
|
|
|
|
Add Statistics.getHistogramString() to print more detailed outputs of a histogram
Summary:
Provide a way for users to know more detailed ditribution of a histogram metrics. Example outputs:
Manually add statement
fprintf(stdout, "%s\n", dbstats->getHistogramString(SST_READ_MICROS).c_str());
Will print out something like:
Count: 989151 Average: 1.7659 StdDev: 1.52
Min: 0.0000 Median: 1.2071 Max: 860.0000
Percentiles: P50: 1.21 P75: 1.70 P99: 5.12 P99.9: 13.67 P99.99: 21.70
------------------------------------------------------
[ 0, 1 ) 390839 39.513% 39.513% ########
[ 1, 2 ) 500918 50.641% 90.154% ##########
[ 2, 3 ) 79358 8.023% 98.177% ##
[ 3, 4 ) 6297 0.637% 98.813%
[ 4, 5 ) 1712 0.173% 98.986%
[ 5, 6 ) 1134 0.115% 99.101%
[ 6, 7 ) 1222 0.124% 99.224%
[ 7, 8 ) 1529 0.155% 99.379%
[ 8, 9 ) 1264 0.128% 99.507%
[ 9, 10 ) 988 0.100% 99.607%
[ 10, 12 ) 1378 0.139% 99.746%
[ 12, 14 ) 1828 0.185% 99.931%
[ 14, 16 ) 410 0.041% 99.972%
[ 16, 18 ) 72 0.007% 99.980%
[ 18, 20 ) 67 0.007% 99.986%
[ 20, 25 ) 106 0.011% 99.997%
[ 25, 30 ) 24 0.002% 99.999%
[ 30, 35 ) 1 0.000% 100.000%
[ 250, 300 ) 2 0.000% 100.000%
[ 300, 350 ) 1 0.000% 100.000%
[ 800, 900 ) 1 0.000% 100.000%
Test Plan: Manually add a print in db_bench and make sure it prints out as expected. Will add some codes to cover the function
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D43611
2015-08-05 20:14:28 +00:00
|
|
|
std::string StatisticsImpl::getHistogramString(uint32_t histogramType) const {
|
2017-04-26 22:19:50 +00:00
|
|
|
MutexLock lock(&aggregate_lock_);
|
2017-05-23 17:29:14 +00:00
|
|
|
return getHistogramImplLocked(histogramType)->ToString();
|
Thread-specific histogram statistics
Summary:
To reduce contention for atomics when HistogramStats are shared across
threads, this diff makes them thread-specific so updates are faster. This comes
at the expense of slower reads (much less frequent), which now require merging
all histograms. In this diff,
- Thread-specific HistogramImpl is created upon the thread's first measureTime()
- Thread-specific HistogramImpl are merged and deleted upon thread termination or ThreadLocalPtr destruction, whichever comes first
- getHistogramString() and histogramData() merge all histograms, both thread-specific and previously merged ones
Test Plan:
unit tests, ran db_bench and verified histograms look similar
before:
$ TEST_TMPDIR=/dev/shm/ perf record -g ./db_bench --benchmarks=readwhilewriting --statistics --num=1000000 --use_existing_db --threads=64 --cache_size=250000000 --compression_type=lz4
...
+ 7.63% db_bench db_bench [.] rocksdb::HistogramStat::Add
after:
$ TEST_TMPDIR=/dev/shm/ perf record -g ./db_bench --benchmarks=readwhilewriting --statistics --num=1000000 --use_existing_db --threads=64 --cache_size=250000000 --compression_type=lz4
...
+ 0.98% db_bench db_bench [.] rocksdb::HistogramStat::Add
Reviewers: sdong, MarkCallaghan, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: andrewkr, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D62649
2016-08-31 21:02:09 +00:00
|
|
|
}
|
|
|
|
|
2014-07-28 19:05:36 +00:00
|
|
|
void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) {
|
2016-08-24 22:42:31 +00:00
|
|
|
{
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
2017-04-26 22:19:50 +00:00
|
|
|
setTickerCountLocked(tickerType, count);
|
2014-07-28 19:05:36 +00:00
|
|
|
}
|
|
|
|
if (stats_ && tickerType < TICKER_ENUM_MAX) {
|
|
|
|
stats_->setTickerCount(tickerType, count);
|
|
|
|
}
|
2014-01-17 20:46:06 +00:00
|
|
|
}
|
|
|
|
|
2017-04-26 22:19:50 +00:00
|
|
|
void StatisticsImpl::setTickerCountLocked(uint32_t tickerType, uint64_t count) {
|
2018-11-27 20:56:40 +00:00
|
|
|
assert(tickerType < TICKER_ENUM_MAX);
|
2017-05-23 17:29:14 +00:00
|
|
|
for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
|
|
|
|
if (core_idx == 0) {
|
|
|
|
per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = count;
|
|
|
|
} else {
|
|
|
|
per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = 0;
|
|
|
|
}
|
2017-04-26 22:19:50 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-10-11 17:54:11 +00:00
|
|
|
uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) {
|
|
|
|
uint64_t sum = 0;
|
|
|
|
{
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
2018-11-27 20:56:40 +00:00
|
|
|
assert(tickerType < TICKER_ENUM_MAX);
|
2017-05-23 17:29:14 +00:00
|
|
|
for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
|
|
|
|
sum +=
|
|
|
|
per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType].exchange(
|
|
|
|
0, std::memory_order_relaxed);
|
2016-10-11 17:54:11 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (stats_ && tickerType < TICKER_ENUM_MAX) {
|
|
|
|
stats_->setTickerCount(tickerType, 0);
|
|
|
|
}
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
2014-07-28 19:05:36 +00:00
|
|
|
void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) {
|
2020-09-05 06:23:40 +00:00
|
|
|
if (get_stats_level() <= StatsLevel::kExceptTickers) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (tickerType < TICKER_ENUM_MAX) {
|
|
|
|
per_core_stats_.Access()->tickers_[tickerType].fetch_add(
|
|
|
|
count, std::memory_order_relaxed);
|
|
|
|
if (stats_) {
|
|
|
|
stats_->recordTick(tickerType, count);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
assert(false);
|
2014-07-28 19:05:36 +00:00
|
|
|
}
|
2014-01-17 20:46:06 +00:00
|
|
|
}
|
|
|
|
|
2019-02-28 18:14:19 +00:00
|
|
|
void StatisticsImpl::recordInHistogram(uint32_t histogramType, uint64_t value) {
|
2018-11-27 20:56:40 +00:00
|
|
|
assert(histogramType < HISTOGRAM_ENUM_MAX);
|
2019-03-01 18:39:00 +00:00
|
|
|
if (get_stats_level() <= StatsLevel::kExceptHistogramOrTimers) {
|
2019-02-28 18:14:19 +00:00
|
|
|
return;
|
|
|
|
}
|
2017-05-23 17:29:14 +00:00
|
|
|
per_core_stats_.Access()->histograms_[histogramType].Add(value);
|
2014-07-28 19:05:36 +00:00
|
|
|
if (stats_ && histogramType < HISTOGRAM_ENUM_MAX) {
|
2019-02-28 18:14:19 +00:00
|
|
|
stats_->recordInHistogram(histogramType, value);
|
2014-07-28 19:05:36 +00:00
|
|
|
}
|
2014-01-17 20:46:06 +00:00
|
|
|
}
|
|
|
|
|
2017-04-26 22:19:50 +00:00
|
|
|
Status StatisticsImpl::Reset() {
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
|
|
|
for (uint32_t i = 0; i < TICKER_ENUM_MAX; ++i) {
|
|
|
|
setTickerCountLocked(i, 0);
|
|
|
|
}
|
|
|
|
for (uint32_t i = 0; i < HISTOGRAM_ENUM_MAX; ++i) {
|
2017-05-23 17:29:14 +00:00
|
|
|
for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) {
|
|
|
|
per_core_stats_.AccessAtCore(core_idx)->histograms_[i].Clear();
|
|
|
|
}
|
2017-04-26 22:19:50 +00:00
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2013-06-19 03:28:41 +00:00
|
|
|
namespace {
|
2014-01-17 20:46:06 +00:00
|
|
|
|
2013-06-19 03:28:41 +00:00
|
|
|
// a buffer size used for temp string buffers
|
2017-04-06 02:02:00 +00:00
|
|
|
const int kTmpStrBufferSize = 200;
|
2013-06-19 03:28:41 +00:00
|
|
|
|
2022-10-25 03:45:54 +00:00
|
|
|
} // namespace
|
2013-06-19 03:28:41 +00:00
|
|
|
|
2014-07-28 19:05:36 +00:00
|
|
|
std::string StatisticsImpl::ToString() const {
|
2017-04-26 22:19:50 +00:00
|
|
|
MutexLock lock(&aggregate_lock_);
|
2013-06-19 03:28:41 +00:00
|
|
|
std::string res;
|
|
|
|
res.reserve(20000);
|
|
|
|
for (const auto& t : TickersNameMap) {
|
2018-11-27 20:56:40 +00:00
|
|
|
assert(t.first < TICKER_ENUM_MAX);
|
|
|
|
char buffer[kTmpStrBufferSize];
|
|
|
|
snprintf(buffer, kTmpStrBufferSize, "%s COUNT : %" PRIu64 "\n",
|
|
|
|
t.second.c_str(), getTickerCountLocked(t.first));
|
|
|
|
res.append(buffer);
|
2013-06-19 03:28:41 +00:00
|
|
|
}
|
|
|
|
for (const auto& h : HistogramsNameMap) {
|
2018-11-27 20:56:40 +00:00
|
|
|
assert(h.first < HISTOGRAM_ENUM_MAX);
|
|
|
|
char buffer[kTmpStrBufferSize];
|
|
|
|
HistogramData hData;
|
|
|
|
getHistogramImplLocked(h.first)->Data(&hData);
|
|
|
|
// don't handle failures - buffer should always be big enough and arguments
|
|
|
|
// should be provided correctly
|
2018-12-13 22:12:02 +00:00
|
|
|
int ret =
|
|
|
|
snprintf(buffer, kTmpStrBufferSize,
|
|
|
|
"%s P50 : %f P95 : %f P99 : %f P100 : %f COUNT : %" PRIu64
|
|
|
|
" SUM : %" PRIu64 "\n",
|
|
|
|
h.second.c_str(), hData.median, hData.percentile95,
|
|
|
|
hData.percentile99, hData.max, hData.count, hData.sum);
|
2018-11-27 20:56:40 +00:00
|
|
|
if (ret < 0 || ret >= kTmpStrBufferSize) {
|
|
|
|
assert(false);
|
|
|
|
continue;
|
2014-07-28 19:05:36 +00:00
|
|
|
}
|
2018-11-27 20:56:40 +00:00
|
|
|
res.append(buffer);
|
2013-06-19 03:28:41 +00:00
|
|
|
}
|
|
|
|
res.shrink_to_fit();
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2019-02-20 23:46:59 +00:00
|
|
|
bool StatisticsImpl::getTickerMap(
|
|
|
|
std::map<std::string, uint64_t>* stats_map) const {
|
|
|
|
assert(stats_map);
|
2023-12-01 19:10:30 +00:00
|
|
|
if (!stats_map) {
|
|
|
|
return false;
|
|
|
|
}
|
2019-02-20 23:46:59 +00:00
|
|
|
stats_map->clear();
|
|
|
|
MutexLock lock(&aggregate_lock_);
|
|
|
|
for (const auto& t : TickersNameMap) {
|
|
|
|
assert(t.first < TICKER_ENUM_MAX);
|
|
|
|
(*stats_map)[t.second.c_str()] = getTickerCountLocked(t.first);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-07-28 19:05:36 +00:00
|
|
|
bool StatisticsImpl::HistEnabledForType(uint32_t type) const {
|
2018-11-27 20:56:40 +00:00
|
|
|
return type < HISTOGRAM_ENUM_MAX;
|
2014-07-28 19:05:36 +00:00
|
|
|
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|