rocksdb/monitoring/thread_status_updater.h

227 lines
8.5 KiB
C
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// The implementation of ThreadStatus.
//
// Note that we make get and set access to ThreadStatusData lockless.
// As a result, ThreadStatusData as a whole is not atomic. However,
// we guarantee consistent ThreadStatusData all the time whenever
// user call GetThreadList(). This consistency guarantee is done
// by having the following constraint in the internal implementation
// of set and get order:
//
// 1. When reset any information in ThreadStatusData, always start from
// clearing up the lower-level information first.
// 2. When setting any information in ThreadStatusData, always start from
// setting the higher-level information.
// 3. When returning ThreadStatusData to the user, fields are fetched from
// higher-level to lower-level. In addition, where there's a nullptr
// in one field, then all fields that has lower-level than that field
// should be ignored.
//
// The high to low level information would be:
// thread_id > thread_type > db > cf > operation > state
//
// This means user might not always get full information, but whenever
// returned by the GetThreadList() is guaranteed to be consistent.
#pragma once
#include <atomic>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "port/port.h"
#include "rocksdb/status.h"
#include "rocksdb/thread_status.h"
#include "util/thread_operation.h"
namespace ROCKSDB_NAMESPACE {
class ColumnFamilyHandle;
// The structure that keeps constant information about a column family.
struct ConstantColumnFamilyInfo {
#ifdef ROCKSDB_USING_THREAD_STATUS
public:
ConstantColumnFamilyInfo(const void* _db_key, const std::string& _db_name,
const std::string& _cf_name)
: db_key(_db_key), db_name(_db_name), cf_name(_cf_name) {}
const void* db_key;
const std::string db_name;
const std::string cf_name;
#endif // ROCKSDB_USING_THREAD_STATUS
};
// the internal data-structure that is used to reflect the current
// status of a thread using a set of atomic pointers.
struct ThreadStatusData {
#ifdef ROCKSDB_USING_THREAD_STATUS
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
explicit ThreadStatusData() {
enable_tracking.store(false);
thread_id.store(0);
thread_type.store(ThreadStatus::USER);
cf_key.store(nullptr);
operation_type.store(ThreadStatus::OP_UNKNOWN);
op_start_time.store(0);
state_type.store(ThreadStatus::STATE_UNKNOWN);
}
// A flag to indicate whether the thread tracking is enabled
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
// in the current thread.
// If set to false, then SetThreadOperation and SetThreadState
// will be no-op.
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
std::atomic<bool> enable_tracking;
std::atomic<uint64_t> thread_id;
std::atomic<ThreadStatus::ThreadType> thread_type;
std::atomic<void*> cf_key;
std::atomic<ThreadStatus::OperationType> operation_type;
Report elapsed time in micros in ThreadStatus instead of start time. Summary: Report elapsed time of a thread operation in micros in ThreadStatus instead of start time of a thread operation in seconds since the Epoch, 1970-01-01 00:00:00 (UTC). Test Plan: ./db_bench --benchmarks=fillrandom --num=100000 --threads=40 \ --max_background_compactions=10 --max_background_flushes=3 \ --thread_status_per_interval=1000 --key_size=16 --value_size=1000 \ --num_column_families=10 Sample Output: ThreadID ThreadType cfName Operation ElapsedTime Stage State 140667724562496 High Pri column_family_name_000002 Flush 772.419 ms FlushJob::WriteLevel0Table 140667728756800 High Pri default Flush 617.845 ms FlushJob::WriteLevel0Table 140667732951104 High Pri column_family_name_000005 Flush 772.078 ms FlushJob::WriteLevel0Table 140667875557440 Low Pri column_family_name_000008 Compaction 1409.216 ms CompactionJob::Install 140667737145408 Low Pri 140667749728320 Low Pri 140667816837184 Low Pri column_family_name_000007 Compaction 1071.815 ms CompactionJob::ProcessKeyValueCompaction 140667787477056 Low Pri column_family_name_000009 Compaction 772.516 ms CompactionJob::ProcessKeyValueCompaction 140667741339712 Low Pri 140667758116928 Low Pri column_family_name_000004 Compaction 620.739 ms CompactionJob::ProcessKeyValueCompaction 140667753922624 Low Pri 140667842003008 Low Pri column_family_name_000006 Compaction 1260.079 ms CompactionJob::ProcessKeyValueCompaction 140667745534016 Low Pri Reviewers: sdong, igor, rven Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D35769
2015-03-23 23:35:04 +00:00
std::atomic<uint64_t> op_start_time;
Allow GetThreadList() to report operation stage. Summary: Allow GetThreadList() to report operation stage. Test Plan: ./thread_list_test ./db_bench --benchmarks=fillrandom --num=100000 --threads=40 \ --max_background_compactions=10 --max_background_flushes=3 \ --thread_status_per_interval=1000 --key_size=16 --value_size=1000 \ --num_column_families=10 export ROCKSDB_TESTS=ThreadStatus ./db_test Sample output ThreadID ThreadType cfName Operation OP_StartTime ElapsedTime Stage State 140116265861184 Low Pri 140116270055488 Low Pri 140116274249792 High Pri column_family_name_000005 Flush 2015/03/10-14:58:11 0 us FlushJob::WriteLevel0Table 140116400078912 Low Pri column_family_name_000004 Compaction 2015/03/10-14:58:11 0 us CompactionJob::FinishCompactionOutputFile 140116358135872 Low Pri column_family_name_000006 Compaction 2015/03/10-14:58:10 1 us CompactionJob::FinishCompactionOutputFile 140116341358656 Low Pri 140116295221312 High Pri default Flush 2015/03/10-14:58:11 0 us FlushJob::WriteLevel0Table 140116324581440 Low Pri column_family_name_000009 Compaction 2015/03/10-14:58:11 0 us CompactionJob::ProcessKeyValueCompaction 140116278444096 Low Pri 140116299415616 Low Pri column_family_name_000008 Compaction 2015/03/10-14:58:11 0 us CompactionJob::FinishCompactionOutputFile 140116291027008 High Pri column_family_name_000001 Flush 2015/03/10-14:58:11 0 us FlushJob::WriteLevel0Table 140116286832704 Low Pri column_family_name_000002 Compaction 2015/03/10-14:58:11 0 us CompactionJob::FinishCompactionOutputFile 140116282638400 Low Pri Reviewers: rven, igor, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D34683
2015-03-13 17:45:40 +00:00
std::atomic<ThreadStatus::OperationStage> operation_stage;
Allow GetThreadList() to report basic compaction operation properties. Summary: Now we're able to show more details about a compaction in GetThreadList() :) This patch allows GetThreadList() to report basic compaction operation properties. Basic compaction properties include: 1. job id 2. compaction input / output level 3. compaction property flags (is_manual, is_deletion, .. etc) 4. total input bytes 5. the number of bytes has been read currently. 6. the number of bytes has been written currently. Flush operation properties will be done in a seperate diff. Test Plan: /db_bench --threads=30 --num=1000000 --benchmarks=fillrandom --thread_status_per_interval=1 Sample output of tracking same job: ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 31.357 ms CompactionJob::FinishCompactionOutputFile BaseInputLevel 1 | BytesRead 2264663 | BytesWritten 1934241 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 59.440 ms CompactionJob::FinishCompactionOutputFile BaseInputLevel 1 | BytesRead 2264663 | BytesWritten 1934241 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 226.375 ms CompactionJob::Install BaseInputLevel 1 | BytesRead 3958013 | BytesWritten 3621940 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | Reviewers: sdong, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D37653
2015-05-07 05:50:35 +00:00
std::atomic<uint64_t> op_properties[ThreadStatus::kNumOperationProperties];
std::atomic<ThreadStatus::StateType> state_type;
#endif // ROCKSDB_USING_THREAD_STATUS
};
// The class that stores and updates the status of the current thread
// using a thread-local ThreadStatusData.
//
// In most of the case, you should use ThreadStatusUtil to update
// the status of the current thread instead of using ThreadSatusUpdater
// directly.
//
// @see ThreadStatusUtil
class ThreadStatusUpdater {
public:
ThreadStatusUpdater() {}
// Releases all ThreadStatusData of all active threads.
virtual ~ThreadStatusUpdater() {}
// Unregister the current thread.
void UnregisterThread();
// Reset the status of the current thread. This includes resetting
// ColumnFamilyInfoKey, ThreadOperation, and ThreadState.
void ResetThreadStatus();
// Set the id of the current thread.
void SetThreadID(uint64_t thread_id);
// Register the current thread for tracking.
void RegisterThread(ThreadStatus::ThreadType ttype, uint64_t thread_id);
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
void SetEnableTracking(bool enable_tracking);
// Update the column-family info of the current thread by setting
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
// its thread-local pointer of ThreadStatusData to the correct entry.
void SetColumnFamilyInfoKey(const void* cf_key);
// returns the column family info key.
const void* GetColumnFamilyInfoKey();
// Update the thread operation of the current thread.
void SetThreadOperation(const ThreadStatus::OperationType type);
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
// Return the thread operation of the current thread.
ThreadStatus::OperationType GetThreadOperation();
Report elapsed time in micros in ThreadStatus instead of start time. Summary: Report elapsed time of a thread operation in micros in ThreadStatus instead of start time of a thread operation in seconds since the Epoch, 1970-01-01 00:00:00 (UTC). Test Plan: ./db_bench --benchmarks=fillrandom --num=100000 --threads=40 \ --max_background_compactions=10 --max_background_flushes=3 \ --thread_status_per_interval=1000 --key_size=16 --value_size=1000 \ --num_column_families=10 Sample Output: ThreadID ThreadType cfName Operation ElapsedTime Stage State 140667724562496 High Pri column_family_name_000002 Flush 772.419 ms FlushJob::WriteLevel0Table 140667728756800 High Pri default Flush 617.845 ms FlushJob::WriteLevel0Table 140667732951104 High Pri column_family_name_000005 Flush 772.078 ms FlushJob::WriteLevel0Table 140667875557440 Low Pri column_family_name_000008 Compaction 1409.216 ms CompactionJob::Install 140667737145408 Low Pri 140667749728320 Low Pri 140667816837184 Low Pri column_family_name_000007 Compaction 1071.815 ms CompactionJob::ProcessKeyValueCompaction 140667787477056 Low Pri column_family_name_000009 Compaction 772.516 ms CompactionJob::ProcessKeyValueCompaction 140667741339712 Low Pri 140667758116928 Low Pri column_family_name_000004 Compaction 620.739 ms CompactionJob::ProcessKeyValueCompaction 140667753922624 Low Pri 140667842003008 Low Pri column_family_name_000006 Compaction 1260.079 ms CompactionJob::ProcessKeyValueCompaction 140667745534016 Low Pri Reviewers: sdong, igor, rven Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D35769
2015-03-23 23:35:04 +00:00
// The start time of the current thread operation. It is in the format
// of micro-seconds since some fixed point in time.
void SetOperationStartTime(const uint64_t start_time);
Allow GetThreadList() to report basic compaction operation properties. Summary: Now we're able to show more details about a compaction in GetThreadList() :) This patch allows GetThreadList() to report basic compaction operation properties. Basic compaction properties include: 1. job id 2. compaction input / output level 3. compaction property flags (is_manual, is_deletion, .. etc) 4. total input bytes 5. the number of bytes has been read currently. 6. the number of bytes has been written currently. Flush operation properties will be done in a seperate diff. Test Plan: /db_bench --threads=30 --num=1000000 --benchmarks=fillrandom --thread_status_per_interval=1 Sample output of tracking same job: ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 31.357 ms CompactionJob::FinishCompactionOutputFile BaseInputLevel 1 | BytesRead 2264663 | BytesWritten 1934241 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 59.440 ms CompactionJob::FinishCompactionOutputFile BaseInputLevel 1 | BytesRead 2264663 | BytesWritten 1934241 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 226.375 ms CompactionJob::Install BaseInputLevel 1 | BytesRead 3958013 | BytesWritten 3621940 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | Reviewers: sdong, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D37653
2015-05-07 05:50:35 +00:00
// Set the "i"th property of the current operation.
//
// NOTE: Our practice here is to set all the thread operation properties
// and stage before we set thread operation, and thread operation
// will be set in std::memory_order_release. This is to ensure
// whenever a thread operation is not OP_UNKNOWN, we will always
// have a consistent information on its properties.
void SetThreadOperationProperty(int i, uint64_t value);
Allow GetThreadList() to report basic compaction operation properties. Summary: Now we're able to show more details about a compaction in GetThreadList() :) This patch allows GetThreadList() to report basic compaction operation properties. Basic compaction properties include: 1. job id 2. compaction input / output level 3. compaction property flags (is_manual, is_deletion, .. etc) 4. total input bytes 5. the number of bytes has been read currently. 6. the number of bytes has been written currently. Flush operation properties will be done in a seperate diff. Test Plan: /db_bench --threads=30 --num=1000000 --benchmarks=fillrandom --thread_status_per_interval=1 Sample output of tracking same job: ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 31.357 ms CompactionJob::FinishCompactionOutputFile BaseInputLevel 1 | BytesRead 2264663 | BytesWritten 1934241 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 59.440 ms CompactionJob::FinishCompactionOutputFile BaseInputLevel 1 | BytesRead 2264663 | BytesWritten 1934241 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 226.375 ms CompactionJob::Install BaseInputLevel 1 | BytesRead 3958013 | BytesWritten 3621940 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | Reviewers: sdong, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D37653
2015-05-07 05:50:35 +00:00
// Increase the "i"th property of the current operation with
// the specified delta.
void IncreaseThreadOperationProperty(int i, uint64_t delta);
Allow GetThreadList() to report basic compaction operation properties. Summary: Now we're able to show more details about a compaction in GetThreadList() :) This patch allows GetThreadList() to report basic compaction operation properties. Basic compaction properties include: 1. job id 2. compaction input / output level 3. compaction property flags (is_manual, is_deletion, .. etc) 4. total input bytes 5. the number of bytes has been read currently. 6. the number of bytes has been written currently. Flush operation properties will be done in a seperate diff. Test Plan: /db_bench --threads=30 --num=1000000 --benchmarks=fillrandom --thread_status_per_interval=1 Sample output of tracking same job: ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 31.357 ms CompactionJob::FinishCompactionOutputFile BaseInputLevel 1 | BytesRead 2264663 | BytesWritten 1934241 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 59.440 ms CompactionJob::FinishCompactionOutputFile BaseInputLevel 1 | BytesRead 2264663 | BytesWritten 1934241 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 226.375 ms CompactionJob::Install BaseInputLevel 1 | BytesRead 3958013 | BytesWritten 3621940 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | Reviewers: sdong, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D37653
2015-05-07 05:50:35 +00:00
Allow GetThreadList() to report operation stage. Summary: Allow GetThreadList() to report operation stage. Test Plan: ./thread_list_test ./db_bench --benchmarks=fillrandom --num=100000 --threads=40 \ --max_background_compactions=10 --max_background_flushes=3 \ --thread_status_per_interval=1000 --key_size=16 --value_size=1000 \ --num_column_families=10 export ROCKSDB_TESTS=ThreadStatus ./db_test Sample output ThreadID ThreadType cfName Operation OP_StartTime ElapsedTime Stage State 140116265861184 Low Pri 140116270055488 Low Pri 140116274249792 High Pri column_family_name_000005 Flush 2015/03/10-14:58:11 0 us FlushJob::WriteLevel0Table 140116400078912 Low Pri column_family_name_000004 Compaction 2015/03/10-14:58:11 0 us CompactionJob::FinishCompactionOutputFile 140116358135872 Low Pri column_family_name_000006 Compaction 2015/03/10-14:58:10 1 us CompactionJob::FinishCompactionOutputFile 140116341358656 Low Pri 140116295221312 High Pri default Flush 2015/03/10-14:58:11 0 us FlushJob::WriteLevel0Table 140116324581440 Low Pri column_family_name_000009 Compaction 2015/03/10-14:58:11 0 us CompactionJob::ProcessKeyValueCompaction 140116278444096 Low Pri 140116299415616 Low Pri column_family_name_000008 Compaction 2015/03/10-14:58:11 0 us CompactionJob::FinishCompactionOutputFile 140116291027008 High Pri column_family_name_000001 Flush 2015/03/10-14:58:11 0 us FlushJob::WriteLevel0Table 140116286832704 Low Pri column_family_name_000002 Compaction 2015/03/10-14:58:11 0 us CompactionJob::FinishCompactionOutputFile 140116282638400 Low Pri Reviewers: rven, igor, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D34683
2015-03-13 17:45:40 +00:00
// Update the thread operation stage of the current thread.
ThreadStatus::OperationStage SetThreadOperationStage(
const ThreadStatus::OperationStage stage);
// Clear thread operation of the current thread.
void ClearThreadOperation();
Allow GetThreadList() to report basic compaction operation properties. Summary: Now we're able to show more details about a compaction in GetThreadList() :) This patch allows GetThreadList() to report basic compaction operation properties. Basic compaction properties include: 1. job id 2. compaction input / output level 3. compaction property flags (is_manual, is_deletion, .. etc) 4. total input bytes 5. the number of bytes has been read currently. 6. the number of bytes has been written currently. Flush operation properties will be done in a seperate diff. Test Plan: /db_bench --threads=30 --num=1000000 --benchmarks=fillrandom --thread_status_per_interval=1 Sample output of tracking same job: ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 31.357 ms CompactionJob::FinishCompactionOutputFile BaseInputLevel 1 | BytesRead 2264663 | BytesWritten 1934241 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 59.440 ms CompactionJob::FinishCompactionOutputFile BaseInputLevel 1 | BytesRead 2264663 | BytesWritten 1934241 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | ThreadID ThreadType cfName Operation ElapsedTime Stage State OperationProperties 140664171987072 Low Pri default Compaction 226.375 ms CompactionJob::Install BaseInputLevel 1 | BytesRead 3958013 | BytesWritten 3621940 | IsDeletion 0 | IsManual 0 | IsTrivialMove 0 | JobID 277 | OutputLevel 2 | TotalInputBytes 3964158 | Reviewers: sdong, rven, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D37653
2015-05-07 05:50:35 +00:00
// Reset all thread-operation-properties to 0.
void ClearThreadOperationProperties();
// Update the thread state of the current thread.
void SetThreadState(const ThreadStatus::StateType type);
// Clear the thread state of the current thread.
void ClearThreadState();
// Obtain the status of all active registered threads.
Status GetThreadList(std::vector<ThreadStatus>* thread_list);
// Create an entry in the global ColumnFamilyInfo table for the
// specified column family. This function should be called only
// when the current thread does not hold db_mutex.
void NewColumnFamilyInfo(const void* db_key, const std::string& db_name,
const void* cf_key, const std::string& cf_name);
// Erase all ConstantColumnFamilyInfo that is associated with the
// specified db instance. This function should be called only when
// the current thread does not hold db_mutex.
void EraseDatabaseInfo(const void* db_key);
// Erase the ConstantColumnFamilyInfo that is associated with the
// specified ColumnFamilyData. This function should be called only
// when the current thread does not hold db_mutex.
void EraseColumnFamilyInfo(const void* cf_key);
// Verifies whether the input ColumnFamilyHandles matches
// the information stored in the current cf_info_map.
void TEST_VerifyColumnFamilyInfoMap(
const std::vector<ColumnFamilyHandle*>& handles, bool check_exist);
protected:
#ifdef ROCKSDB_USING_THREAD_STATUS
// The thread-local variable for storing thread status.
static thread_local ThreadStatusData* thread_status_data_;
// Returns the pointer to the thread status data only when the
// thread status data is non-null and has enable_tracking == true.
ThreadStatusData* GetLocalThreadStatus();
// Directly returns the pointer to thread_status_data_ without
// checking whether enabling_tracking is true of not.
ThreadStatusData* Get() { return thread_status_data_; }
// The mutex that protects cf_info_map and db_key_map.
std::mutex thread_list_mutex_;
// The current status data of all active threads.
std::unordered_set<ThreadStatusData*> thread_data_set_;
// A global map that keeps the column family information. It is stored
// globally instead of inside DB is to avoid the situation where DB is
// closing while GetThreadList function already get the pointer to its
// CopnstantColumnFamilyInfo.
std::unordered_map<const void*, ConstantColumnFamilyInfo> cf_info_map_;
// A db_key to cf_key map that allows erasing elements in cf_info_map
// associated to the same db_key faster.
std::unordered_map<const void*, std::unordered_set<const void*>> db_key_map_;
#else
static ThreadStatusData* thread_status_data_;
#endif // ROCKSDB_USING_THREAD_STATUS
};
} // namespace ROCKSDB_NAMESPACE