Troubleshoot blackbox crash test final verification hang (#13070)

Summary:
Add a timeout for the final verification step of the blackbox crash test, and print the db_stress stack trace (via pstack) when that timeout is hit. The crash test occasionally hangs in the verification step, and the stack trace will help debug those hangs.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13070

Reviewed By: hx235

Differential Revision: D64414461

Pulled By: anand1976

fbshipit-source-id: 4629aac01fbe6c788665beddc66280ba446aadbe
This commit is contained in:
anand76 2024-10-15 13:39:24 -07:00 committed by Facebook GitHub Bot
parent cbebbad7d9
commit 2abbb02d14

View file

@ -443,6 +443,8 @@ blackbox_default_params = {
"duration": 6000, "duration": 6000,
# time for one db_stress instance to run # time for one db_stress instance to run
"interval": 120, "interval": 120,
# time for the final verification step
"verify_timeout": 1200,
# since we will be killing anyway, use large value for ops_per_thread # since we will be killing anyway, use large value for ops_per_thread
"ops_per_thread": 100000000, "ops_per_thread": 100000000,
"reopen": 0, "reopen": 0,
@ -1047,6 +1049,7 @@ def gen_cmd(params, unknown_params):
"cleanup_cmd", "cleanup_cmd",
"skip_tmpdir_check", "skip_tmpdir_check",
"print_stderr_separately", "print_stderr_separately",
"verify_timeout",
} }
and v is not None and v is not None
] ]
@ -1055,9 +1058,10 @@ def gen_cmd(params, unknown_params):
return cmd return cmd
def execute_cmd(cmd, timeout=None): def execute_cmd(cmd, timeout=None, timeout_pstack=False):
child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE) child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
print("Running db_stress with pid=%d: %s\n\n" % (child.pid, " ".join(cmd))) print("Running db_stress with pid=%d: %s\n\n" % (child.pid, " ".join(cmd)))
pid = child.pid
try: try:
outs, errs = child.communicate(timeout=timeout) outs, errs = child.communicate(timeout=timeout)
@ -1065,6 +1069,8 @@ def execute_cmd(cmd, timeout=None):
print("WARNING: db_stress ended before kill: exitcode=%d\n" % child.returncode) print("WARNING: db_stress ended before kill: exitcode=%d\n" % child.returncode)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
hit_timeout = True hit_timeout = True
if timeout_pstack:
os.system("pstack %d" % pid)
child.kill() child.kill()
print("KILLED %d\n" % child.pid) print("KILLED %d\n" % child.pid)
outs, errs = child.communicate() outs, errs = child.communicate()
@ -1139,7 +1145,7 @@ def blackbox_crash_main(args, unknown_args):
cmd = gen_cmd( cmd = gen_cmd(
dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args
) )
hit_timeout, retcode, outs, errs = execute_cmd(cmd) hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["verify_timeout"], True)
# For the final run # For the final run
print_output_and_exit_on_error(outs, errs, args.print_stderr_separately) print_output_and_exit_on_error(outs, errs, args.print_stderr_separately)