regression_test.sh: kill very old db_bench (and more) (#10441)

Summary:
If a db_bench process hangs or runs away on a machine, it can prevent
regression_test.sh from ever making progress. To fix that,
regression_test.sh now kills any db_bench process that is more than
12 hours old. Detection of running db_bench processes is also more reliable:
process IDs are now obtained with pidof instead of string matching (grep).

I also had to make some other updates to get local runs working
reliably:
* Fix some quoting hell and other dubious complexity with db_bench_cmd
* Only save a DB for re-use when building it passes
* Report failed command in more cases
* Add safeguards against "rm -rf ."

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10441

Test Plan:
Manual testing (local and remote), with temporary changes such as a more
manageable age threshold.

Reviewed By: riversand963

Differential Revision: D38285537

Pulled By: pdillinger

fbshipit-source-id: 4d598876aedc38ac4bd9d8ddf32c5995d8e44db8
This commit is contained in:
Peter Dillinger 2022-08-02 09:16:17 -07:00 committed by Facebook GitHub Bot
parent cc8ded6152
commit 9da97a3726
1 changed files with 44 additions and 37 deletions

View File

@ -46,6 +46,7 @@
# Default: 1
# TEST_PATH: the root directory of the regression test.
# Default: "/tmp/rocksdb/regression_test"
# !!! NOTE !!! - a DB will also be saved in $TEST_PATH/../db
# RESULT_PATH: the directory where the regression results will be generated.
# Default: "$TEST_PATH/current_time"
# REMOTE_USER_AT_HOST: If set, then test will run on the specified host under
@ -125,15 +126,14 @@ function main {
setup_test_directory
if [ $TEST_MODE -le 1 ]; then
tmp=$DB_PATH
DB_PATH=$ORIGIN_PATH
test_remote "test -d $DB_PATH"
test_remote "test -d $ORIGIN_PATH"
if [[ $? -ne 0 ]]; then
echo "Building DB..."
# compactall alone will not print ops or threads, which will fail update_report
run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0
# only save for future use on success
test_remote "mv $DB_PATH $ORIGIN_PATH"
fi
DB_PATH=$tmp
fi
if [ $TEST_MODE -ge 1 ]; then
build_checkpoint
@ -204,9 +204,32 @@ function init_arguments {
# $4 --- use_existing_db. Default: 1
# $5 --- update_report. Default: 1
function run_db_bench {
# this will terminate all currently-running db_bench
find_db_bench_cmd="ps aux | grep db_bench | grep -v grep | grep -v aux | awk '{print \$2}'"
# Make sure no other db_bench is running. (Make sure command succeeds if pidof
# command exists but finds nothing.)
pids_cmd='pidof db_bench || pidof --version > /dev/null'
# But first, make best effort to kill any db_bench that have run for more
# than 12 hours, as that indicates a hung or runaway process.
kill_old_cmd='for PID in $(pidof db_bench); do [ "$(($(stat -c %Y /proc/$PID) + 43200))" -lt "$(date +%s)" ] && echo "Killing old db_bench $PID" && kill $PID && sleep 5 && kill -9 $PID && sleep 5; done; pidof --version > /dev/null'
if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
pids_cmd="$SSH $REMOTE_USER_AT_HOST '$pids_cmd'"
kill_old_cmd="$SSH $REMOTE_USER_AT_HOST '$kill_old_cmd'"
fi
eval $kill_old_cmd
exit_on_error $? "$kill_old_cmd"
pids_output="$(eval $pids_cmd)"
exit_on_error $? "$pids_cmd"
if [ "$pids_output" != "" ]; then
echo "Stopped regression_test.sh as there're still recent db_bench "
echo "processes running: $pids_output"
echo "Clean up test directory"
cleanup_test_directory $TEST_ROOT_DIR
exit 2
fi
# Build db_bench command
ops=${2:-$NUM_OPS}
threads=${3:-$NUM_THREADS}
USE_EXISTING_DB=${4:-1}
@ -220,7 +243,7 @@ function run_db_bench {
options_file_arg=$(setup_options_file)
echo "$options_file_arg"
# use `which time` to avoid using bash's internal time command
db_bench_cmd="("'\$(which time)'" -p $DB_BENCH_DIR/db_bench \
db_bench_cmd="\$(which time) -p $DB_BENCH_DIR/db_bench \
--benchmarks=$1 --db=$DB_PATH --wal_dir=$WAL_PATH \
--use_existing_db=$USE_EXISTING_DB \
--perf_level=$PERF_LEVEL \
@ -248,38 +271,16 @@ function run_db_bench {
--seed=$SEED \
--multiread_batched=true \
--batch_size=$MULTIREAD_BATCH_SIZE \
--multiread_stride=$MULTIREAD_STRIDE) 2>&1"
ps_cmd="ps aux"
--multiread_stride=$MULTIREAD_STRIDE 2>&1"
if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
echo "Running benchmark remotely on $REMOTE_USER_AT_HOST"
db_bench_cmd="$SSH $REMOTE_USER_AT_HOST \"$db_bench_cmd\""
ps_cmd="$SSH $REMOTE_USER_AT_HOST $ps_cmd"
db_bench_cmd="$SSH $REMOTE_USER_AT_HOST '$db_bench_cmd'"
fi
echo db_bench_cmd="$db_bench_cmd"
## make sure no db_bench is running
# The following statement is necessary to make sure "eval $ps_cmd" succeeds.
# Otherwise, if we simply check whether "$(eval $ps_cmd | grep db_bench)" is
# successful or not, then it will always be false since grep will return
# non-zero status when there's no matching output.
ps_output="$(eval $ps_cmd)"
exit_on_error $? "$ps_cmd"
# perform the actual command to check whether db_bench is running
grep_output="$(eval $ps_cmd | grep db_bench | grep -v grep)"
if [ "$grep_output" != "" ]; then
echo "Stopped regression_test.sh as there're still db_bench processes running:"
echo $grep_output
echo "Clean up test directory"
cleanup_test_directory $TEST_ROOT_DIR
exit 2
fi
## run the db_bench
cmd="($db_bench_cmd || db_bench_error=1) | tee -a $RESULT_PATH/$1"
exit_on_error $?
echo $cmd
eval $cmd
exit_on_error $db_bench_error
# Run the db_bench command
eval $db_bench_cmd | tee -a "$RESULT_PATH/$1"
exit_on_error ${PIPESTATUS[0]} db_bench
if [ $UPDATE_REPORT -ne 0 ]; then
update_report "$1" "$RESULT_PATH/$1" $ops $threads
fi
@ -397,7 +398,7 @@ function test_remote {
# Evaluate a command string locally and pass its exit status to exit_on_error.
# Arguments: $1 - command string, run with `eval` in the current shell.
function run_local {
eval "$1"
# NOTE(review): this text is a rendered diff, so the two exit_on_error lines
# below look like the old and new versions of a single statement (the newer
# one additionally passes the command text) -- confirm against the real file.
exit_on_error $?
exit_on_error $? "$1"
}
function setup_options_file {
@ -416,8 +417,14 @@ function setup_options_file {
function setup_test_directory {
echo "Deleting old regression test directories and creating new ones"
run_local 'test "$DB_PATH" != "."'
run_remote "rm -rf $DB_PATH"
run_remote "rm -rf $DB_BENCH_DIR"
if [ "$DB_BENCH_DIR" != "." ]; then
run_remote "rm -rf $DB_BENCH_DIR"
fi
run_local 'test "$RESULT_PATH" != "."'
run_local "rm -rf $RESULT_PATH"
if ! [ -z "$WAL_PATH" ]; then