[NCCL] Add Environment Variable to guard Async Error Handling feature

osalpekar · osalpekar · commit c83c709e92d1 · 2020-09-08T17:48:16.000-07:00
Pull Request resolved: #44163. In this PR, we introduce a new environment variable (NCCL_ASYNC_ERROR_HANDLING) that guards the asynchronous error handling feature. We intend to eventually turn this feature on by default for all users; gating it behind the variable is a temporary measure so that the change in behavior from hanging to crashing does not suddenly become the default for users. ghstack-source-id: 111637788 Differential Revision: [D23517895](https://our.internmc.facebook.com/intern/diff/D23517895/)
diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp
@@ -404,6 +404,21 @@ void ProcessGroupNCCL::parseNcclBlockingWait() {
   }
 }
 
+void ProcessGroupNCCL::parseNcclAsyncErrorHandling() {
+  // Unset: keep the current setting. "1" enables, "0" is a no-op, any other
+  // integer throws below; std::stoi itself throws if no integer can be parsed.
+  const char* rawValue = getenv(NCCL_ASYNC_ERROR_HANDLING);
+  const int flag = (rawValue == nullptr) ? 0 : std::stoi(rawValue);
+  if (flag == 1) {
+    asyncErrorHandling_ = true;
+    LOG(INFO) << "[Rank " << rank_ << "] NCCL Async Error Handling enabled.";
+  } else if (flag != 0) {
+    throw std::runtime_error(
+        "Invalid value for environment variable: " +
+        std::string(NCCL_ASYNC_ERROR_HANDLING));
+  }
+}
+
 bool ProcessGroupNCCL::WorkNCCL::timedOut() {
   auto currentTimepoint = std::chrono::steady_clock::now();
   return (
@@ -428,6 +443,13 @@ ProcessGroupNCCL::ProcessGroupNCCL(
         "Invalid value for environment variable: " +
         std::string(NCCL_BLOCKING_WAIT));
   }
+  try {
+    parseNcclAsyncErrorHandling();
+  } catch (std::exception& e) {
+    throw std::runtime_error(
+        "Invalid value for environment variable: " +
+        std::string(NCCL_ASYNC_ERROR_HANDLING));
+  }
 
   // If single-process single-device mode, WorkNCCL::getFuture is supported,
   // so get a dedicated stream for each device to run FutureNCCL then callbacks.
@@ -445,31 +467,36 @@ ProcessGroupNCCL::ProcessGroupNCCL(
       std::thread(&ProcessGroupNCCL::ncclCommWatchdog, this);
 #endif
 
-  workCleanupThread_ = std::thread(&ProcessGroupNCCL::workCleanupLoop, this);
+  if (asyncErrorHandling_) {
+    workCleanupThread_ = std::thread(&ProcessGroupNCCL::workCleanupLoop, this);
+  }
 }
 
 ProcessGroupNCCL::~ProcessGroupNCCL() {
   terminateProcessGroup_.store(true);
   watchdogCV_.notify_one();
   workListCV_.notify_one();
 
-  std::unique_lock<std::mutex> lock(workListMutex_);
-  // TODO: We can potentially merge this functionality into the workCleanup
-  // thread or just allow the destructor to free workList_.
-  // Clean up any remaining items in the workList_ instead of waiting for the
-  // workCleanup Thread to be scheduled again.
-  for (auto it = workList_.begin(); it != workList_.end();
-       /* no increment*/) {
-    auto& work = *it;
-    if (work->isCompleted()) {
-      it = workList_.erase(it);
-    } else {
-      ++it;
+  if (asyncErrorHandling_) {
+    std::unique_lock<std::mutex> lock(workListMutex_);
+    // TODO: We can potentially merge this functionality into the workCleanup
+    // thread or just allow the destructor to free workList_.
+    // Clean up any remaining items in the workList_ instead of waiting for the
+    // workCleanup Thread to be scheduled again.
+    for (auto it = workList_.begin(); it != workList_.end();
+         /* no increment*/) {
+      auto& work = *it;
+      if (work->isCompleted()) {
+        it = workList_.erase(it);
+      } else {
+        ++it;
+      }
     }
+    // Wait for workList_ to become empty before proceeding with shutdown.
+    workListCV_.wait(lock, [&]() -> bool { return workList_.empty(); });
+    lock.unlock();
+    workCleanupThread_.join();
   }
-  // Wait for workList_ to become empty before proceeding with shutdown.
-  workListCV_.wait(lock, [&]() -> bool { return workList_.empty(); });
-  lock.unlock();
 
 #ifdef ENABLE_NCCL_ERROR_CHECKING
   ncclCommWatchdogThread_.join();
@@ -486,7 +513,6 @@ ProcessGroupNCCL::~ProcessGroupNCCL() {
       }
     }
   }
-  workCleanupThread_.join();
 }
 
 void ProcessGroupNCCL::ncclCommWatchdog() {
@@ -542,7 +568,7 @@ void ProcessGroupNCCL::ncclCommWatchdogInternal() {
       }
     }
 
-    {
+    if (asyncErrorHandling_) {
       std::unique_lock<std::mutex> lock(workListMutex_);
       for (auto& work : workList_) {
         work->checkAndSetException();
@@ -964,7 +990,9 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::collective(
     work->store_ = store_;
   }
 
-  workEnqueue(work);
+  if (asyncErrorHandling_) {
+    workEnqueue(work);
+  }
 
   return work;
 }
diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp
@@ -19,6 +19,10 @@ namespace c10d {
 // non-blocking.
 constexpr const char* NCCL_BLOCKING_WAIT = "NCCL_BLOCKING_WAIT";
 
+// Environment variable which controls whether or not we perform Async Error
+// Handling with NCCL.
+constexpr const char* NCCL_ASYNC_ERROR_HANDLING = "NCCL_ASYNC_ERROR_HANDLING";
+
 // ProcessGroupNCCL implements NCCL bindings for c10d.
 //
 // All functions of the class are expected to be called in the same order
@@ -490,6 +494,10 @@ class ProcessGroupNCCL : public ProcessGroup {
   // accordingly.
   void parseNcclBlockingWait();
 
+  // Reads the NCCL_ASYNC_ERROR_HANDLING environment variable and sets asyncErrorHandling_
+  // accordingly.
+  void parseNcclAsyncErrorHandling();
+
   void workCleanupLoop();
 
  protected:
@@ -594,6 +602,10 @@ class ProcessGroupNCCL : public ProcessGroup {
   // for the operation to complete.
   bool blockingWait_ = false;
 
+  // Whether or not the workCleanupThread is used to perform async error
+  // handling.
+  bool asyncErrorHandling_ = false;
+
   // Timeout for operations. This is only used when blockingWait_ is enabled.
   std::chrono::milliseconds opTimeout_;