mirror of https://github.com/facebook/rocksdb.git
regression_test.sh: kill very old db_bench (and more) (#10441)
Summary: If a db_bench process hangs or runs away on a machine, that could prevent regression_test.sh from ever making progress. To fix that, regression_test.sh will now kill any db_bench process that is more than 12 hours old. This is also made more reliable by not using string matching (grep) to get db_bench process IDs.

I also had to make some other updates to get local runs working reliably:
* Fix some quoting hell and other dubious complexity with db_bench_cmd
* Only save a DB for re-use when building it succeeds
* Report the failed command in more cases
* Add safeguards against "rm -rf ."

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10441
Test Plan: manual (local and remote), with temporary changes, e.g. to have a manageable age threshold.
Reviewed By: riversand963
Differential Revision: D38285537
Pulled By: pdillinger
fbshipit-source-id: 4d598876aedc38ac4bd9d8ddf32c5995d8e44db8
This commit is contained in:
parent
cc8ded6152
commit
9da97a3726
|
@ -46,6 +46,7 @@
|
|||
# Default: 1
|
||||
# TEST_PATH: the root directory of the regression test.
|
||||
# Default: "/tmp/rocksdb/regression_test"
|
||||
# !!! NOTE !!! - a DB will also be saved in $TEST_PATH/../db
|
||||
# RESULT_PATH: the directory where the regression results will be generated.
|
||||
# Default: "$TEST_PATH/current_time"
|
||||
# REMOTE_USER_AT_HOST: If set, then test will run on the specified host under
|
||||
|
@ -125,15 +126,14 @@ function main {
|
|||
|
||||
setup_test_directory
|
||||
if [ $TEST_MODE -le 1 ]; then
|
||||
tmp=$DB_PATH
|
||||
DB_PATH=$ORIGIN_PATH
|
||||
test_remote "test -d $DB_PATH"
|
||||
test_remote "test -d $ORIGIN_PATH"
|
||||
if [[ $? -ne 0 ]]; then
|
||||
echo "Building DB..."
|
||||
# compactall alone will not print ops or threads, which would cause update_report to fail
|
||||
run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0
|
||||
# only save for future use on success
|
||||
test_remote "mv $DB_PATH $ORIGIN_PATH"
|
||||
fi
|
||||
DB_PATH=$tmp
|
||||
fi
|
||||
if [ $TEST_MODE -ge 1 ]; then
|
||||
build_checkpoint
|
||||
|
@ -204,9 +204,32 @@ function init_arguments {
|
|||
# $4 --- use_existing_db. Default: 1
|
||||
# $5 --- update_report. Default: 1
|
||||
function run_db_bench {
|
||||
# this will terminate all currently-running db_bench
|
||||
find_db_bench_cmd="ps aux | grep db_bench | grep -v grep | grep -v aux | awk '{print \$2}'"
|
||||
# Make sure no other db_bench is running. (Make sure command succeeds if pidof
|
||||
# command exists but finds nothing.)
|
||||
pids_cmd='pidof db_bench || pidof --version > /dev/null'
|
||||
# But first, make best effort to kill any db_bench that have run for more
|
||||
# than 12 hours, as that indicates a hung or runaway process.
|
||||
kill_old_cmd='for PID in $(pidof db_bench); do [ "$(($(stat -c %Y /proc/$PID) + 43200))" -lt "$(date +%s)" ] && echo "Killing old db_bench $PID" && kill $PID && sleep 5 && kill -9 $PID && sleep 5; done; pidof --version > /dev/null'
|
||||
if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
|
||||
pids_cmd="$SSH $REMOTE_USER_AT_HOST '$pids_cmd'"
|
||||
kill_old_cmd="$SSH $REMOTE_USER_AT_HOST '$kill_old_cmd'"
|
||||
fi
|
||||
|
||||
eval $kill_old_cmd
|
||||
exit_on_error $? "$kill_old_cmd"
|
||||
|
||||
pids_output="$(eval $pids_cmd)"
|
||||
exit_on_error $? "$pids_cmd"
|
||||
|
||||
if [ "$pids_output" != "" ]; then
|
||||
echo "Stopped regression_test.sh as there're still recent db_bench "
|
||||
echo "processes running: $pids_output"
|
||||
echo "Clean up test directory"
|
||||
cleanup_test_directory $TEST_ROOT_DIR
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Build db_bench command
|
||||
ops=${2:-$NUM_OPS}
|
||||
threads=${3:-$NUM_THREADS}
|
||||
USE_EXISTING_DB=${4:-1}
|
||||
|
@ -220,7 +243,7 @@ function run_db_bench {
|
|||
options_file_arg=$(setup_options_file)
|
||||
echo "$options_file_arg"
|
||||
# use `which time` to avoid using bash's internal time command
|
||||
db_bench_cmd="("'\$(which time)'" -p $DB_BENCH_DIR/db_bench \
|
||||
db_bench_cmd="\$(which time) -p $DB_BENCH_DIR/db_bench \
|
||||
--benchmarks=$1 --db=$DB_PATH --wal_dir=$WAL_PATH \
|
||||
--use_existing_db=$USE_EXISTING_DB \
|
||||
--perf_level=$PERF_LEVEL \
|
||||
|
@ -248,38 +271,16 @@ function run_db_bench {
|
|||
--seed=$SEED \
|
||||
--multiread_batched=true \
|
||||
--batch_size=$MULTIREAD_BATCH_SIZE \
|
||||
--multiread_stride=$MULTIREAD_STRIDE) 2>&1"
|
||||
ps_cmd="ps aux"
|
||||
--multiread_stride=$MULTIREAD_STRIDE 2>&1"
|
||||
if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
|
||||
echo "Running benchmark remotely on $REMOTE_USER_AT_HOST"
|
||||
db_bench_cmd="$SSH $REMOTE_USER_AT_HOST \"$db_bench_cmd\""
|
||||
ps_cmd="$SSH $REMOTE_USER_AT_HOST $ps_cmd"
|
||||
db_bench_cmd="$SSH $REMOTE_USER_AT_HOST '$db_bench_cmd'"
|
||||
fi
|
||||
echo db_bench_cmd="$db_bench_cmd"
|
||||
|
||||
## make sure no db_bench is running
|
||||
# The following statement is necessary to make sure "eval $ps_cmd" will succeed.
|
||||
# Otherwise, if we simply check whether "$(eval $ps_cmd | grep db_bench)" is
|
||||
# successful or not, then it will always be false since grep will return
|
||||
# non-zero status when there's no matching output.
|
||||
ps_output="$(eval $ps_cmd)"
|
||||
exit_on_error $? "$ps_cmd"
|
||||
|
||||
# perform the actual command to check whether db_bench is running
|
||||
grep_output="$(eval $ps_cmd | grep db_bench | grep -v grep)"
|
||||
if [ "$grep_output" != "" ]; then
|
||||
echo "Stopped regression_test.sh as there're still db_bench processes running:"
|
||||
echo $grep_output
|
||||
echo "Clean up test directory"
|
||||
cleanup_test_directory $TEST_ROOT_DIR
|
||||
exit 2
|
||||
fi
|
||||
|
||||
## run the db_bench
|
||||
cmd="($db_bench_cmd || db_bench_error=1) | tee -a $RESULT_PATH/$1"
|
||||
exit_on_error $?
|
||||
echo $cmd
|
||||
eval $cmd
|
||||
exit_on_error $db_bench_error
|
||||
# Run the db_bench command
|
||||
eval $db_bench_cmd | tee -a "$RESULT_PATH/$1"
|
||||
exit_on_error ${PIPESTATUS[0]} db_bench
|
||||
if [ $UPDATE_REPORT -ne 0 ]; then
|
||||
update_report "$1" "$RESULT_PATH/$1" $ops $threads
|
||||
fi
|
||||
|
@ -397,7 +398,7 @@ function test_remote {
|
|||
|
||||
function run_local {
|
||||
eval "$1"
|
||||
exit_on_error $?
|
||||
exit_on_error $? "$1"
|
||||
}
|
||||
|
||||
function setup_options_file {
|
||||
|
@ -416,8 +417,14 @@ function setup_options_file {
|
|||
function setup_test_directory {
|
||||
echo "Deleting old regression test directories and creating new ones"
|
||||
|
||||
run_local 'test "$DB_PATH" != "."'
|
||||
run_remote "rm -rf $DB_PATH"
|
||||
run_remote "rm -rf $DB_BENCH_DIR"
|
||||
|
||||
if [ "$DB_BENCH_DIR" != "." ]; then
|
||||
run_remote "rm -rf $DB_BENCH_DIR"
|
||||
fi
|
||||
|
||||
run_local 'test "$RESULT_PATH" != "."'
|
||||
run_local "rm -rf $RESULT_PATH"
|
||||
|
||||
if ! [ -z "$WAL_PATH" ]; then
|
||||
|
|
Loading…
Reference in New Issue