regression_test.sh: kill very old db_bench (and more) (#10441)

Summary:
If a db_bench process gets hung or runaway on a machine, that could prevent regression_test.sh from ever making progress. To fix that, regression_test.sh will now kill any db_bench process that is >12 hours old. Also made this more reliable by not using string matching (grep) to get db_bench process IDs.

I also had to make some other updates to get local runs working reliably:
* Fix some quoting hell and other dubious complexity with db_bench_cmd
* Only save a DB for re-use when building it passes
* Report failed command in more cases
* Add safeguards against "rm -rf ."

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10441

Test Plan: manual (local and remote), with temporary changes e.g. to have a manageable age threshold etc.

Reviewed By: riversand963

Differential Revision: D38285537

Pulled By: pdillinger

fbshipit-source-id: 4d598876aedc38ac4bd9d8ddf32c5995d8e44db8
2022-08-02 09:16:17 -07:00 · 2022-08-02 09:16:17 -07:00 · 9da97a3726
parent cc8ded6152
commit 9da97a3726
1 changed files with 44 additions and 37 deletions
--- a/tools/regression_test.sh
+++ b/tools/regression_test.sh
@ -46,6 +46,7 @@
 #       Default: 1
 #   TEST_PATH: the root directory of the regression test.
 #       Default: "/tmp/rocksdb/regression_test"
+#       !!! NOTE !!! - a DB will also be saved in $TEST_PATH/../db
 #   RESULT_PATH: the directory where the regression results will be generated.
 #       Default: "$TEST_PATH/current_time"
 #   REMOTE_USER_AT_HOST: If set, then test will run on the specified host under
@ -125,15 +126,14 @@ function main {

  setup_test_directory
  if [ $TEST_MODE -le 1 ]; then
-      tmp=$DB_PATH
-      DB_PATH=$ORIGIN_PATH
-      test_remote "test -d $DB_PATH"
+      test_remote "test -d $ORIGIN_PATH"
      if [[ $? -ne 0 ]]; then
          echo "Building DB..."
          # compactall alone will not print ops or threads, which will fail update_report
          run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0
+          # only save for future use on success
+          test_remote "mv $DB_PATH $ORIGIN_PATH"
      fi
-      DB_PATH=$tmp
  fi
  if [ $TEST_MODE -ge 1 ]; then
      build_checkpoint
@ -204,9 +204,32 @@ function init_arguments {
 # $4 --- use_existing_db.  Default: 1
 # $5 --- update_report. Default: 1
 function run_db_bench {
-  # this will terminate all currently-running db_bench
-  find_db_bench_cmd="ps aux | grep db_bench | grep -v grep | grep -v aux | awk '{print \$2}'"
+  # Make sure no other db_bench is running. (Make sure command succeeds if pidof
+  # command exists but finds nothing.)
+  pids_cmd='pidof db_bench || pidof --version > /dev/null'
+  # But first, make best effort to kill any db_bench that have run for more
+  # than 12 hours, as that indicates a hung or runaway process.
+  kill_old_cmd='for PID in $(pidof db_bench); do [ "$(($(stat -c %Y /proc/$PID) + 43200))" -lt "$(date +%s)" ] && echo "Killing old db_bench $PID" && kill $PID && sleep 5 && kill -9 $PID && sleep 5; done; pidof --version > /dev/null'
+  if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
+    pids_cmd="$SSH $REMOTE_USER_AT_HOST '$pids_cmd'"
+    kill_old_cmd="$SSH $REMOTE_USER_AT_HOST '$kill_old_cmd'"
+  fi

+  eval $kill_old_cmd
+  exit_on_error $? "$kill_old_cmd"
+
+  pids_output="$(eval $pids_cmd)"
+  exit_on_error $? "$pids_cmd"
+
+  if [ "$pids_output" != "" ]; then
+    echo "Stopped regression_test.sh as there're still recent db_bench "
+    echo "processes running: $pids_output"
+    echo "Clean up test directory"
+    cleanup_test_directory $TEST_ROOT_DIR
+    exit 2
+  fi
+
+  # Build db_bench command
  ops=${2:-$NUM_OPS}
  threads=${3:-$NUM_THREADS}
  USE_EXISTING_DB=${4:-1}
@ -220,7 +243,7 @@ function run_db_bench {
  options_file_arg=$(setup_options_file)
  echo "$options_file_arg"
  # use `which time` to avoid using bash's internal time command
-  db_bench_cmd="("'\$(which time)'" -p $DB_BENCH_DIR/db_bench \
+  db_bench_cmd="\$(which time) -p $DB_BENCH_DIR/db_bench \
      --benchmarks=$1 --db=$DB_PATH --wal_dir=$WAL_PATH \
      --use_existing_db=$USE_EXISTING_DB \
      --perf_level=$PERF_LEVEL \
@ -248,38 +271,16 @@ function run_db_bench {
      --seed=$SEED \
      --multiread_batched=true \
      --batch_size=$MULTIREAD_BATCH_SIZE \
-      --multiread_stride=$MULTIREAD_STRIDE) 2>&1"
-  ps_cmd="ps aux"
+      --multiread_stride=$MULTIREAD_STRIDE 2>&1"
  if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
    echo "Running benchmark remotely on $REMOTE_USER_AT_HOST"
-    db_bench_cmd="$SSH $REMOTE_USER_AT_HOST \"$db_bench_cmd\""
-    ps_cmd="$SSH $REMOTE_USER_AT_HOST $ps_cmd"
+    db_bench_cmd="$SSH $REMOTE_USER_AT_HOST '$db_bench_cmd'"
  fi
+  echo db_bench_cmd="$db_bench_cmd"

-  ## make sure no db_bench is running
-  # The following statement is necessary make sure "eval $ps_cmd" will success.
-  # Otherwise, if we simply check whether "$(eval $ps_cmd | grep db_bench)" is
-  # successful or not, then it will always be false since grep will return
-  # non-zero status when there's no matching output.
-  ps_output="$(eval $ps_cmd)"
-  exit_on_error $? "$ps_cmd"
-
-  # perform the actual command to check whether db_bench is running
-  grep_output="$(eval $ps_cmd | grep db_bench | grep -v grep)"
-  if [ "$grep_output" != "" ]; then
-    echo "Stopped regression_test.sh as there're still db_bench processes running:"
-    echo $grep_output
-    echo "Clean up test directory"
-    cleanup_test_directory $TEST_ROOT_DIR
-    exit 2
-  fi
-
-  ## run the db_bench
-  cmd="($db_bench_cmd || db_bench_error=1) | tee -a $RESULT_PATH/$1"
-  exit_on_error $?
-  echo $cmd
-  eval $cmd
-  exit_on_error $db_bench_error
+  # Run the db_bench command
+  eval $db_bench_cmd | tee -a "$RESULT_PATH/$1"
+  exit_on_error ${PIPESTATUS[0]} db_bench
  if [ $UPDATE_REPORT -ne 0 ]; then
    update_report "$1" "$RESULT_PATH/$1" $ops $threads
  fi
@ -397,7 +398,7 @@ function test_remote {

 function run_local {
  eval "$1"
-  exit_on_error $?
+  exit_on_error $? "$1"
 }

 function setup_options_file {
@ -416,8 +417,14 @@ function setup_options_file {
 function setup_test_directory {
  echo "Deleting old regression test directories and creating new ones"

+  run_local 'test "$DB_PATH" != "."'
  run_remote "rm -rf $DB_PATH"
-  run_remote "rm -rf $DB_BENCH_DIR"
+
+  if [ "$DB_BENCH_DIR" != "." ]; then
+    run_remote "rm -rf $DB_BENCH_DIR"
+  fi
+
+  run_local 'test "$RESULT_PATH" != "."'
  run_local "rm -rf $RESULT_PATH"

  if ! [ -z "$WAL_PATH" ]; then