Commit 1168a1a
[RPC profiling] Extend RPC profiling to support async function execution over RPC.
Pull Request resolved: #44664
Closes #39971.

This PR adds support for functions decorated with `@rpc.functions.async_execution` to be profiled over RPC, as builtins, JIT functions, and blocking Python UDFs currently can be. The goal is complete RPC profiling coverage across the various types of functions users can run.

To enable this, the PR below this one in the stack makes it safe to call `disableProfiler()` from another thread. We use that functionality to defer disabling the profiler on the server until the future corresponding to the RPC request completes (rather than disabling it as soon as the blocking `processRpc` call returns, as was done previously). Since by the time that future completes the async function has been kicked off and the future it returned has finished, we are able to capture any RPCs the function called as well as the actual work done on the other node.

For example, if the following async function is run on a server over RPC:

```python
def slow_add(x, y):
    time.sleep(1)
    return torch.add(x, y)


@rpc.functions.async_execution
def slow_async_add(to, x, y):
    return rpc.rpc_async(to, slow_add, args=(x, y))
```

we expect to see the original RPC profiled, the nested RPC profiled, and the actual `torch.add()` work, all recorded with the correct node id. Here is an example profiling output:

| Name | Self CPU total % | Self CPU total | CPU total % | CPU total | CPU time avg | Number of Calls | Node ID |
| --- | --- | --- | --- | --- | --- | --- | --- |
| rpc_async#slow_async_add(worker1 -> worker2) | 0.00% | 0.000us | 0 | 1.012s | 1.012s | 1 | 1 |
| aten::empty | 7.02% | 11.519us | 7.02% | 11.519us | 11.519us | 1 | 1 |
| rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3) | 0.00% | 0.000us | 0 | 1.006s | 1.006s | 1 | 2 |
| rpc_async#slow_async_add(worker1 -> worker2)#remote_op: aten::empty | 7.21% | 11.843us | 7.21% | 11.843us | 11.843us | 1 | 2 |
| rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::add | 71.94% | 118.107us | 85.77% | 140.802us | 140.802us | 1 | 3 |
| rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::empty | 13.82% | 22.695us | 13.82% | 22.695us | 22.695us | 1 | 3 |

Self CPU time total: 164.164us

This PR also moves much of the profiling logic into `rpc/utils.cpp` to declutter the `request_callback` code.

ghstack-source-id: 112032360
Differential Revision: [D23638387](https://our.internmc.facebook.com/intern/diff/D23638387/)
1 parent b5d5689 commit 1168a1a
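For illustration, here is a minimal sketch (not part of this commit) of how a run like the one above could be driven from the client side. It assumes `rpc.init_rpc` has already been called on workers named `worker1`, `worker2`, and `worker3`, and that `slow_add`/`slow_async_add` from the commit message are defined at module scope on every worker; enabling the autograd profiler on the caller is what propagates profiling to the remote workers.

```python
import torch
import torch.distributed.rpc as rpc
from torch.autograd.profiler import profile


# Runs on worker1. Worker names and setup are illustrative assumptions;
# slow_async_add is assumed importable on all workers (see the snippet above).
def profile_slow_async_add():
    x, y = torch.ones(2), torch.ones(2)
    with profile() as prof:
        # worker2 defers disabling its profiler until the future returned by
        # slow_async_add completes, so the nested RPC to worker3 and the
        # remote aten::add are captured as well.
        fut = rpc.rpc_async("worker2", slow_async_add, args=("worker3", x, y))
        fut.wait()
    # Remote events appear prefixed with the originating RPC, e.g.
    # "rpc_async#slow_async_add(worker1 -> worker2)#remote_op: aten::add".
    print(prof.key_averages().table(sort_by="cpu_time_total"))
```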

7 files changed: +254, -117 lines

torch/csrc/autograd/profiler.h (23 additions, 3 deletions)

```diff
@@ -370,6 +370,14 @@ struct TORCH_API RecordProfile {
   void processEvents(const std::vector<Event*>& events);
 };
 
+// A struct to control settings of disableProfiler options, to be used in
+// conjunction with TlSProfilerGuard.
+
+struct TORCH_API ProfilerDisableOptions {
+  bool cleanupTLSState = true;
+  bool consolidate = true;
+};
+
 // A guard that enables the profiler, taking in an optional callback to process
 // the results
 // Usage:
@@ -379,23 +387,35 @@ struct TORCH_API RecordProfile {
 // });
 // Code to profile
 // }
+
 struct TORCH_API TLSProfilerGuard {
   explicit TLSProfilerGuard(
       const ProfilerConfig& cfg,
       c10::optional<std::function<void(const thread_event_lists&)>>
-          resultCallback = c10::nullopt)
-      : cb_(std::move(resultCallback)) {
+          resultCallback = c10::nullopt,
+      c10::optional<ProfilerDisableOptions> profilerDisableOptions =
+          c10::nullopt)
+      : cb_(std::move(resultCallback)),
+        profilerDisableOptions_(std::move(profilerDisableOptions)) {
     enableProfiler(cfg);
   }
   ~TLSProfilerGuard() {
-    thread_event_lists event_lists = disableProfiler();
+    thread_event_lists event_lists;
+    if (profilerDisableOptions_) {
+      event_lists = disableProfiler(
+          profilerDisableOptions_->cleanupTLSState,
+          profilerDisableOptions_->consolidate);
+    } else {
+      event_lists = disableProfiler();
+    }
     if (cb_) {
       (*cb_)(event_lists);
     }
   }
 
  private:
   c10::optional<std::function<void(const thread_event_lists&)>> cb_;
+  c10::optional<ProfilerDisableOptions> profilerDisableOptions_;
 };
 
 } // namespace profiler
```

torch/csrc/distributed/rpc/request_callback_no_python.cpp (45 additions, 103 deletions)

```diff
@@ -486,93 +486,12 @@ void RequestCallbackNoPython::processRpc(
       const auto profilingKeyId = rpcWithProfilingReq.getProfilingId();
       auto wrappedRpcResponseFuture = std::make_shared<FutureMessage>();
       // Enable the profiler with the config from the sender.
-      std::vector<torch::autograd::profiler::Event> profiledEvents;
+      torch::autograd::profiler::ProfilerDisableOptions requestThreadOptions;
+      requestThreadOptions.cleanupTLSState = true;
+      requestThreadOptions.consolidate = false;
       {
         torch::autograd::profiler::TLSProfilerGuard g(
-            profilingConfig,
-            [&profiledEvents, profilingConfig](
-                const std::vector<std::vector<
-                    torch::autograd::profiler::Event>>& event_lists) {
-              // Gather all events into a vector
-              for (auto& l : event_lists) {
-                for (auto& e : l) {
-                  profiledEvents.push_back(e);
-                }
-              }
-              // find __start_profile event and __cuda_start_event.
-              bool cuda_profiling_enabled = profilingConfig.state ==
-                  torch::autograd::profiler::ProfilerState::CUDA;
-              bool found_cpu_start = false;
-              const torch::autograd::profiler::Event* profilerStart = nullptr;
-              // Each device has its own cudaProfilerStart, so we must take
-              // care to use the correct one depending on the device the
-              // operation ran on.
-              std::unordered_map<int, const torch::autograd::profiler::Event*>
-                  cudaProfilerStarts;
-              for (auto& e : profiledEvents) {
-                if (!found_cpu_start &&
-                    0 == strcmp(e.name(), "__start_profile")) {
-                  profilerStart = &e;
-                  found_cpu_start = true;
-                }
-                if (cuda_profiling_enabled &&
-                    0 == strcmp(e.name(), "__cuda_start_event")) {
-                  e.setCudaUs(e.cpu_us());
-                  auto device = e.device();
-                  TORCH_CHECK(
-                      device != -1,
-                      "CUDA profiling was enabled but could not find CUDA device.");
-                  TORCH_CHECK(
-                      cudaProfilerStarts.find(device) ==
-                          cudaProfilerStarts.end(),
-                      c10::str(
-                          "Duplicate __cuda_start_event found for ", device));
-                  cudaProfilerStarts[device] = &e;
-                }
-                // TODO: determine no. of CUDA devices and break here if we have
-                // a cudaProfilerStart for all of them, in the case of cuda
-                // profiling.
-                if (found_cpu_start && !cuda_profiling_enabled) {
-                  break;
-                }
-              }
-              // We should always find __start_profile.
-              TORCH_CHECK(
-                  profilerStart != nullptr,
-                  "Expected to find __start_profile event.");
-              // Should have >= 1 CUDA start event.
-              // TODO: we can enhance this assert by ensuring we have found a
-              // start for every available CUDA device.
-              TORCH_CHECK(
-                  !cuda_profiling_enabled || cudaProfilerStarts.size() > 0,
-                  "Profiler was enabled with CUDA recording, but did not find __cuda_start_event.");
-
-              if (cuda_profiling_enabled) {
-                // Compute and set global time for when this CUDA kernel was
-                // launched/ended, since deserialized event will not have a
-                // corresponding CUDA event.
-                for (auto& e : profiledEvents) {
-                  if (e.has_cuda()) {
-                    auto cuda_device = e.device();
-                    TORCH_CHECK(
-                        cuda_device != -1,
-                        "CUDA profiling was enabled but could not find CUDA device.");
-                    auto it = cudaProfilerStarts.find(cuda_device);
-                    TORCH_CHECK(
-                        it != cudaProfilerStarts.end(),
-                        c10::str(
-                            "Failed to find __cuda_start_event for device ",
-                            cuda_device));
-                    auto cudaProfilerStartEvent = it->second;
-                    double cuda_elapsed_us =
-                        cudaProfilerStartEvent->cuda_elapsed_us(e);
-                    int64_t cuda_us =
-                        cuda_elapsed_us + cudaProfilerStartEvent->cpu_us();
-                    e.setCudaUs(cuda_us);
-                  }
-                }
-              }
-            });
+            profilingConfig, c10::nullopt, requestThreadOptions);
         TORCH_INTERNAL_ASSERT(
             torch::autograd::profiler::profilerEnabled(),
             "Expected profiler to be enabled!");
@@ -583,25 +502,48 @@ void RequestCallbackNoPython::processRpc(
             wrappedMsgType,
             messageId,
             wrappedRpcResponseFuture);
-      }
-      wrappedRpcResponseFuture->addCallback([wrappedRpcResponseFuture,
+
+        auto tid = std::this_thread::get_id();
+        wrappedRpcResponseFuture->addCallback(
+            at::wrapPropagateTLSState<void>([wrappedRpcResponseFuture,
                                              responseFuture,
-                                             profiledEvents =
-                                                 std::move(profiledEvents),
-                                             profilingKeyId] {
-        if (wrappedRpcResponseFuture->hasError()) {
-          // Propagate error
-          responseFuture->setError(wrappedRpcResponseFuture->error()->what());
-        } else {
-          auto rpcWithProfilingResp = std::make_unique<RpcWithProfilingResp>(
-              MessageType::RUN_WITH_PROFILING_RESP,
-              std::move(*wrappedRpcResponseFuture).moveValue(),
-              profiledEvents,
-              profilingKeyId);
-          responseFuture->markCompleted(
-              std::move(*rpcWithProfilingResp).toMessage());
-        }
-      });
+                                             profilingKeyId,
+                                             profilingConfig,
+                                             tid] {
+              std::vector<torch::autograd::profiler::Event> profiledEvents;
+              // Defer consolidation of profiler events until async work has
+              // completed (such as async UDF)
+
+              TORCH_INTERNAL_ASSERT(
+                  torch::autograd::profiler::profilerEnabled(),
+                  "Expected profiler to be enabled!");
+
+              // Only clean up TLS states of profiler if we are disabling on
+              // the main thread.
+              bool shouldCleanUpTLSStates = (std::this_thread::get_id() == tid);
+              auto event_lists = torch::autograd::profiler::disableProfiler(
+                  shouldCleanUpTLSStates, true);
+              if (wrappedRpcResponseFuture->hasError()) {
+                // Propagate error
+                // No need to propagate remote events in the case of an error.
+                responseFuture->setError(
+                    wrappedRpcResponseFuture->error()->what());
+              } else {
+                populateRemoteProfiledEvents(
+                    profiledEvents, profilingConfig, event_lists);
+                auto rpcWithProfilingResp =
+                    std::make_unique<RpcWithProfilingResp>(
+                        MessageType::RUN_WITH_PROFILING_RESP,
+                        std::move(*wrappedRpcResponseFuture).moveValue(),
+                        profiledEvents,
+                        profilingKeyId);
+                responseFuture->markCompleted(
+                    std::move(*rpcWithProfilingResp).toMessage());
+              }
+            }));
+        // Exiting the scope will disable the profiler on this thread with the
+        // options specified above.
+      }
       return;
     }
     default: {
```

torch/csrc/distributed/rpc/utils.cpp (79 additions, 0 deletions)

```diff
@@ -713,6 +713,85 @@ std::vector<at::IValue> readWrappedPayload(
   payload.resize(payload.size() - additionalPayloadSize);
   return tupleElements;
 }
+
+void populateRemoteProfiledEvents(
+    std::vector<torch::autograd::profiler::Event>& profiledEvents,
+    const torch::autograd::profiler::ProfilerConfig& profilingConfig,
+    const std::vector<std::vector<torch::autograd::profiler::Event>>&
+        event_lists) {
+  // Gather all events into a vector
+  for (auto& l : event_lists) {
+    for (auto& e : l) {
+      profiledEvents.push_back(e);
+    }
+  }
+  // find __start_profile event and __cuda_start_event.
+  bool cuda_profiling_enabled =
+      profilingConfig.state == torch::autograd::profiler::ProfilerState::CUDA;
+  bool found_cpu_start = false;
+  const torch::autograd::profiler::Event* profilerStart = nullptr;
+  // Each device has its own cudaProfilerStart, so we must take
+  // care to use the correct one depending on the device the
+  // operation ran on.
+  std::unordered_map<int, const torch::autograd::profiler::Event*>
+      cudaProfilerStarts;
+  for (auto& e : profiledEvents) {
+    if (!found_cpu_start && 0 == strcmp(e.name(), "__start_profile")) {
+      profilerStart = &e;
+      found_cpu_start = true;
+    }
+    if (cuda_profiling_enabled && 0 == strcmp(e.name(), "__cuda_start_event")) {
+      e.setCudaUs(e.cpu_us());
+      auto device = e.device();
+      TORCH_CHECK(
+          device != -1,
+          "CUDA profiling was enabled but could not find CUDA device.");
+      TORCH_CHECK(
+          cudaProfilerStarts.find(device) == cudaProfilerStarts.end(),
+          c10::str("Duplicate __cuda_start_event found for ", device));
+      cudaProfilerStarts[device] = &e;
+    }
+    // TODO: determine no. of CUDA devices and break here if we have
+    // a cudaProfilerStart for all of them, in the case of cuda
+    // profiling.
+    if (found_cpu_start && !cuda_profiling_enabled) {
+      break;
+    }
+  }
+  // We should always find __start_profile.
+  TORCH_CHECK(
+      profilerStart != nullptr, "Expected to find __start_profile event.");
+  // Should have >= 1 CUDA start event.
+  // TODO: we can enhance this assert by ensuring we have found a
+  // start for every available CUDA device.
+  TORCH_CHECK(
+      !cuda_profiling_enabled || cudaProfilerStarts.size() > 0,
+      "Profiler was enabled with CUDA recording, but did not find __cuda_start_event.");
+
+  if (cuda_profiling_enabled) {
+    // Compute and set global time for when this CUDA kernel was
+    // launched/ended, since deserialized event will not have a
+    // corresponding CUDA event.
+    for (auto& e : profiledEvents) {
+      if (e.has_cuda()) {
+        auto cuda_device = e.device();
+        TORCH_CHECK(
+            cuda_device != -1,
+            "CUDA profiling was enabled but could not find CUDA device.");
+        auto it = cudaProfilerStarts.find(cuda_device);
+        TORCH_CHECK(
+            it != cudaProfilerStarts.end(),
+            c10::str(
+                "Failed to find __cuda_start_event for device ", cuda_device));
+        auto cudaProfilerStartEvent = it->second;
+        double cuda_elapsed_us = cudaProfilerStartEvent->cuda_elapsed_us(e);
+        int64_t cuda_us = cuda_elapsed_us + cudaProfilerStartEvent->cpu_us();
+        e.setCudaUs(cuda_us);
+      }
+    }
+  }
+}
+
 } // namespace rpc
 } // namespace distributed
 } // namespace torch
```

torch/csrc/distributed/rpc/utils.h (7 additions, 0 deletions)

```diff
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <c10/core/Device.h>
+#include <torch/csrc/autograd/profiler.h>
 #include <torch/csrc/distributed/rpc/rpc_command_base.h>
 #include <torch/csrc/jit/serialization/pickle.h>
 #include <torch/csrc/utils/byte_order.h>
@@ -125,6 +126,12 @@ TORCH_API std::vector<at::IValue> readWrappedPayload(
     std::vector<char>& payload,
     const rpc::Message& message);
 
+TORCH_API void populateRemoteProfiledEvents(
+    std::vector<torch::autograd::profiler::Event>& profiledEvents,
+    const torch::autograd::profiler::ProfilerConfig& profilerConfig,
+    const std::vector<std::vector<torch::autograd::profiler::Event>>&
+        event_lists);
+
 } // namespace rpc
 } // namespace distributed
 } // namespace torch
```

torch/testing/_internal/dist_utils.py (20 additions, 0 deletions)

```diff
@@ -18,6 +18,26 @@
 INIT_METHOD_TEMPLATE = "file://{file_name}"
 
 
+
+def single_threaded_process_group_agent(f):
+    """
+    Forces ProcessGroupAgent to use only a single thread in the ThreadPool for
+    sending and processing requests.
+    """
+    @wraps(f)
+    def wrapper(self, *args, **kwargs):
+        backend_type = self.rpc_backend
+        if backend_type == rpc.backend_registry.BackendType["PROCESS_GROUP"]:
+            self.rpc_backend_options = rpc.backend_registry.construct_rpc_backend_options(
+                self.rpc_backend,
+                init_method=self.init_method,
+                num_send_recv_threads=1,
+            )
+        return_value = f(self, *args, **kwargs)
+        return return_value
+    return wrapper
+
+
 def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True,
               faulty_messages=None, messages_to_delay=None):
     """
```

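As an illustration of how the new decorator might be used (this test is hypothetical, not part of the commit), it is stacked outside `dist_init` so that the single-thread backend options are installed before RPC is initialized; the fixture base class, worker names, and test body below are assumptions.

```python
import torch
import torch.distributed.rpc as rpc
from torch.testing._internal.dist_utils import dist_init, single_threaded_process_group_agent
from torch.testing._internal.distributed.rpc.rpc_agent_test_fixture import RpcAgentTestFixture


class ExampleProfilingTest(RpcAgentTestFixture):  # hypothetical test class
    @single_threaded_process_group_agent  # runs first: sets num_send_recv_threads=1
    @dist_init                            # then initializes RPC with those options
    def test_profile_async_udf(self):
        # slow_async_add is assumed to be the module-level function from the
        # commit message, available on all workers.
        if self.rank == 0:
            with torch.autograd.profiler.profile() as prof:
                fut = rpc.rpc_async(
                    "worker1",
                    slow_async_add,
                    args=("worker2", torch.ones(2), torch.ones(2)),
                )
                fut.wait()
            print(prof.key_averages().table(sort_by="cpu_time_total"))
```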
torch/testing/_internal/distributed/rpc/process_group_agent_test_fixture.py (13 additions, 6 deletions)

```diff
@@ -13,12 +13,19 @@ def rpc_backend(self):
 
     @property
     def rpc_backend_options(self):
-        return rpc.backend_registry.construct_rpc_backend_options(
-            self.rpc_backend,
-            init_method=self.init_method,
-            # Some tests need additional threads (ex: test_trainer_ps)
-            num_send_recv_threads=8,
-        )
+        try:
+            return self._rpc_backend_options
+        except AttributeError:
+            return rpc.backend_registry.construct_rpc_backend_options(
+                self.rpc_backend,
+                init_method=self.init_method,
+                # Some tests need additional threads (ex: test_trainer_ps)
+                num_send_recv_threads=8,
+            )
+
+    @rpc_backend_options.setter
+    def rpc_backend_options(self, new_rpc_backend_options):
+        self._rpc_backend_options = new_rpc_backend_options
 
     def get_shutdown_error_regex(self):
         error_regexes = [
```

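Usage note (hypothetical, not part of this commit): with the setter in place, a test can also override the agent options directly rather than through the decorator, for example in its own setup method on the same fixture, assuming `import torch.distributed.rpc as rpc`:

```python
def setUp(self):
    super().setUp()
    # Overrides the default returned by the rpc_backend_options property above;
    # construct_rpc_backend_options is the existing helper, and the thread
    # count is an illustrative value.
    self.rpc_backend_options = rpc.backend_registry.construct_rpc_backend_options(
        self.rpc_backend,
        init_method=self.init_method,
        num_send_recv_threads=2,
    )
```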