Skip subsequent trace writes after encountering trace write failure (#11996)

Summary: **Context/Summary:** We ignore trace writing status e.g, 543191f2ea/db/db_impl/db_impl_write.cc (L221-L222) If a write into the trace file fails, subsequent trace write will continue onto the same file. This will trigger the assertion `assert(sync_without_flush_called_)` intended to catch write to a file that has previously seen error, added in https://github.com/facebook/rocksdb/pull/10489, https://github.com/facebook/rocksdb/pull/10555 Alternative (rejected) is to handle trace writing status at a higher level at e.g, 543191f2ea/db/db_impl/db_impl_write.cc (L221-L222). However, it makes sense to ignore such status considering tracing is not a critical but assistant component to db operation. And this alternative requires more code change. So it's better to handle the failure at a lower level as this PR Pull Request resolved: https://github.com/facebook/rocksdb/pull/11996 Test Plan: Add new UT failed before this PR and pass after Reviewed By: akankshamahajan15 Differential Revision: D50532467 Pulled By: hx235 fbshipit-source-id: f2032abafd94917adbf89a20841d15b448782a33
2023-10-24 09:58:02 -07:00 · 2023-10-24 09:58:02 -07:00 · 99b371b417
parent 519f2a41fb
commit 99b371b417
4 changed files with 58 additions and 2 deletions
--- a/tools/trace_analyzer_test.cc
+++ b/tools/trace_analyzer_test.cc
@ -31,6 +31,8 @@ int main() {
 #include "test_util/testutil.h"
 #include "tools/trace_analyzer_tool.h"
 #include "trace_replay/trace_replay.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/trace/file_trace_reader_writer.h"

 namespace ROCKSDB_NAMESPACE {

@ -785,6 +787,48 @@ TEST_F(TraceAnalyzerTest, Iterator) {
  */
 }

+TEST_F(TraceAnalyzerTest, ExistsPreviousTraceWriteError) {
+  DB* db_ = nullptr;
+  Options options;
+  options.create_if_missing = true;
+
+  std::unique_ptr<FaultInjectionTestEnv> fault_env(
+      new FaultInjectionTestEnv(env_));
+  const std::string trace_path =
+      test_path_ + "/previous_trace_write_error_trace";
+  std::unique_ptr<TraceWriter> trace_writer;
+  ASSERT_OK(NewFileTraceWriter(fault_env.get(), env_options_, trace_path,
+                               &trace_writer));
+
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+  ASSERT_OK(db_->StartTrace(TraceOptions(), std::move(trace_writer)));
+
+  // Inject write error on the first trace write.
+  // This trace write is made big enough to actually write to FS for error
+  // injection.
+  const std::string kBigKey(1000000, 'k');
+  const std::string kBigValue(1000000, 'v');
+  fault_env->SetFilesystemActive(false, Status::IOError("Injected"));
+
+  ASSERT_OK(db_->Put(WriteOptions(), kBigKey, kBigValue));
+
+  fault_env->SetFilesystemActive(true);
+
+  // Without proper handling of the previous trace write error,
+  // this trace write will continue and crash the db (in DEBUG_LEVEL > 0)
+  // due to writing to the trace file that has seen error.
+  ASSERT_OK(db_->Put(WriteOptions(), kBigKey, kBigValue));
+
+  // Verify `EndTrace()` returns the previous write trace error if any
+  Status s = db_->EndTrace();
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.ToString().find("Tracing has seen error") != std::string::npos);
+  ASSERT_TRUE(s.ToString().find("Injected") != std::string::npos);
+
+  delete db_;
+  ASSERT_OK(DestroyDB(dbname_, options));
+}
+
 // Test analyzing of multiget
 TEST_F(TraceAnalyzerTest, MultiGet) {
  std::string trace_path = test_path_ + "/trace";
--- a/trace_replay/trace_replay.cc
+++ b/trace_replay/trace_replay.cc
@ -345,7 +345,8 @@ Tracer::Tracer(SystemClock* clock, const TraceOptions& trace_options,
    : clock_(clock),
      trace_options_(trace_options),
      trace_writer_(std::move(trace_writer)),
-      trace_request_count_(0) {
+      trace_request_count_(0),
+      trace_write_status_(Status::OK()) {
  // TODO: What if this fails?
  WriteHeader().PermitUncheckedError();
 }
@ -612,9 +613,18 @@ Status Tracer::WriteFooter() {
 }

 Status Tracer::WriteTrace(const Trace& trace) {
+  if (!trace_write_status_.ok()) {
+    return Status::Incomplete("Tracing has seen error: %s",
+                              trace_write_status_.ToString());
+  }
+  assert(trace_write_status_.ok());
  std::string encoded_trace;
  TracerHelper::EncodeTrace(trace, &encoded_trace);
-  return trace_writer_->Write(Slice(encoded_trace));
+  Status s = trace_writer_->Write(Slice(encoded_trace));
+  if (!s.ok()) {
+    trace_write_status_ = s;
+  }
+  return s;
 }

 Status Tracer::Close() { return WriteFooter(); }
--- a/trace_replay/trace_replay.h
+++ b/trace_replay/trace_replay.h
@ -178,6 +178,7 @@ class Tracer {
  TraceOptions trace_options_;
  std::unique_ptr<TraceWriter> trace_writer_;
  uint64_t trace_request_count_;
+  Status trace_write_status_;
 };

 }  // namespace ROCKSDB_NAMESPACE
--- a/unreleased_history/behavior_changes/skip_trace_write.md
+++ b/unreleased_history/behavior_changes/skip_trace_write.md
@ -0,0 +1 @@
+If an error occurs when writing to a trace file after `DB::StartTrace()`, the subsequent trace writes are skipped to avoid writing to a file that has previously seen error. In this case, `DB::EndTrace()` will also return a non-ok status with info about the error occured previously in its status message.
				`@ -0,0 +1 @@`
				If an error occurs when writing to a trace file after `DB::StartTrace()`, the subsequent trace writes are skipped to avoid writing to a file that has previously seen error. In this case, `DB::EndTrace()` will also return a non-ok status with info about the error occured previously in its status message.