Skip to content

Commit 92e0315

Browse files
jurahul authored and tensorflower-gardener committed
[XLA:GPU] Support for multiple infeed managers per process.
- Enable multiple infeed managers per-process, to enable running multiple replicas within the same process (as exercised by SPMD tests).
- To enable this, add the ability to attach a type-erased XLA-specific state to the GpuExecutor.
- Define the XLA-specific executor state for GPU to be the infeed manager instance.

PiperOrigin-RevId: 363681266
Change-Id: Ia04f22db51700445a885e514a075029eb9b0be4f
1 parent d65be85 commit 92e0315

File tree

11 files changed

+155
-52
lines changed

11 files changed

+155
-52
lines changed

tensorflow/compiler/xla/service/gpu/BUILD

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1487,13 +1487,18 @@ cc_library(
14871487

14881488
cc_library(
14891489
name = "infeed_manager",
1490-
srcs = ["infeed_manager.cc"],
1490+
srcs = [
1491+
"infeed_manager.cc",
1492+
"xla_executor_state.h",
1493+
],
14911494
hdrs = ["infeed_manager.h"],
1495+
copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]),
14921496
deps = [
14931497
":xfeed_queue",
14941498
"//tensorflow/compiler/xla:shape_tree",
14951499
"//tensorflow/compiler/xla:types",
14961500
"//tensorflow/core/platform:stream_executor_no_cuda",
1501+
"//tensorflow/stream_executor/gpu:gpu_executor_header",
14971502
"@com_google_absl//absl/base:core_headers",
14981503
"@com_google_absl//absl/memory",
14991504
],

tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ Status GpuTransferManager::TransferLiteralToInfeed(
6868

6969
Status GpuTransferManager::EnqueueBuffersToInfeed(
7070
se::StreamExecutor* executor, ShapeTree<InfeedBuffer> buffers) {
71-
gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager();
72-
se::Stream* stream = infeed_manager->GetStream(executor);
71+
gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager(executor);
72+
se::Stream* stream = infeed_manager->GetStream();
7373

7474
// TODO(b/30467474): Since this stream is shared across different
7575
// infeed requests, blocking on the stream might be
@@ -99,8 +99,8 @@ StatusOr<InfeedBuffer> GpuTransferManager::TransferBufferToInfeedInternal(
9999
return InvalidArgument("Infeed shape needs 0 bytes");
100100
}
101101

102-
gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager();
103-
se::Stream* stream = infeed_manager->GetStream(executor);
102+
gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager(executor);
103+
se::Stream* stream = infeed_manager->GetStream();
104104
if (stream == nullptr) {
105105
return InternalError("Failed to obtain a stream");
106106
}

tensorflow/compiler/xla/service/gpu/infeed_manager.cc

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,29 +17,29 @@ limitations under the License.
1717

1818
#include "absl/memory/memory.h"
1919

20+
#if GOOGLE_CUDA
21+
#include "tensorflow/compiler/xla/service/gpu/xla_executor_state.h"
22+
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
23+
#endif // GOOGLE_CUDA
24+
2025
namespace xla {
2126
namespace gpu {
2227

23-
se::Stream* InfeedManager::GetStream(se::StreamExecutor* executor) {
24-
tensorflow::mutex_lock l(host_to_device_stream_mu_);
25-
if (host_to_device_executor_ == nullptr) {
26-
host_to_device_executor_ = executor;
27-
host_to_device_stream_ = absl::make_unique<se::Stream>(executor);
28-
host_to_device_stream_->Init();
29-
}
30-
31-
if (executor != host_to_device_executor_) {
32-
// The requested executor must be the same as the one for which
33-
// the stream is cached.
34-
return nullptr;
35-
}
36-
37-
return host_to_device_stream_.get();
28+
InfeedManager::InfeedManager(se::StreamExecutor *executor)
29+
: stream_(absl::make_unique<se::Stream>(executor)) {
30+
stream_->Init();
3831
}
3932

40-
InfeedManager* GetOrCreateInfeedManager() {
41-
static InfeedManager* manager = new InfeedManager;
42-
return manager;
33+
InfeedManager *GetOrCreateInfeedManager(se::StreamExecutor *executor) {
34+
#if GOOGLE_CUDA
35+
stream_executor::gpu::GpuExecutor *gpu_executor =
36+
stream_executor::gpu::ExtractGpuExecutor(executor);
37+
auto *xla_state =
38+
gpu_executor->getOrCreateXLAState<GpuExecutorXLAState>(executor);
39+
return xla_state->getOrCreateInfeedManager(executor);
40+
#else // GOOGLE_CUDA
41+
return nullptr;
42+
#endif // GOOGLE_CUDA
4343
}
4444

4545
} // namespace gpu

tensorflow/compiler/xla/service/gpu/infeed_manager.h

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -64,26 +64,18 @@ class InfeedBuffer {
6464
// Client-side class used to enqueue infeed buffers.
6565
class InfeedManager : public XfeedQueue<ShapeTree<InfeedBuffer>> {
6666
public:
67-
// Returns a cached stream associated with an executor. Allocates a
68-
// new stream on the first invocation. On subsequent invocations, if
69-
// the cached executor is not the same as the requested executor,
70-
// returns null.
71-
se::Stream* GetStream(se::StreamExecutor* executor);
67+
explicit InfeedManager(se::StreamExecutor* executor);
7268

73-
private:
74-
// Mutex for serializing the creation of host_to_device_stream_.
75-
tensorflow::mutex host_to_device_stream_mu_;
76-
77-
// Cached host to device stream for queuing infeed data.
78-
std::unique_ptr<se::Stream> host_to_device_stream_
79-
ABSL_GUARDED_BY(host_to_device_stream_mu_);
69+
// Returns a stream for this infeed manager.
70+
se::Stream* GetStream() const { return stream_.get(); }
8071

81-
// Executor that the host_to_device_stream belongs to. Not owned.
82-
se::StreamExecutor* host_to_device_executor_ = nullptr;
72+
private:
73+
// Stream used to enqueue infeed device copies.
74+
std::unique_ptr<se::Stream> stream_;
8375
};
8476

85-
// Singleton creator-or-accessor: Returns the GPU infeed manager.
86-
InfeedManager* GetOrCreateInfeedManager();
77+
// Returns the GPU infeed manager for the given stream executor.
78+
InfeedManager* GetOrCreateInfeedManager(se::StreamExecutor* executor);
8779

8880
} // namespace gpu
8981
} // namespace xla

tensorflow/compiler/xla/service/gpu/infeed_thunk.cc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ limitations under the License.
1616
#include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h"
1717

1818
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
19+
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
1920
#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
2021
#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
2122
#include "tensorflow/compiler/xla/shape_util.h"
@@ -31,15 +32,16 @@ InfeedThunk::InfeedThunk(ThunkInfo thunk_info,
3132
: Thunk(Kind::kInfeed, thunk_info), dest_slices_(std::move(dest_slices)) {}
3233

3334
Status InfeedThunk::ExecuteOnStream(const ExecuteParams& params) {
34-
auto& stream = *params.stream;
35-
auto& buffer_allocations = *params.buffer_allocations;
35+
se::Stream& stream = *params.stream;
36+
const BufferAllocations& buffer_allocations = *params.buffer_allocations;
3637

3738
VLOG(2) << "Infeeding to GPU";
3839

3940
auto op_profiler =
4041
params.profiler->MakeScopedInstructionProfiler(profile_index());
42+
4143
ShapeTree<InfeedBuffer> source_buffers =
42-
GetOrCreateInfeedManager()->BlockingGetNextDestination();
44+
GetOrCreateInfeedManager(stream.parent())->BlockingGetNextDestination();
4345

4446
size_t index = 0;
4547
for (auto& source : source_buffers.leaves()) {
tensorflow/compiler/xla/service/gpu/xla_executor_state.h

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
==============================================================================*/
15+
16+
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_XLA_EXECUTOR_STATE_H_
17+
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_XLA_EXECUTOR_STATE_H_
18+
19+
#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
20+
21+
// Defines XLA:GPU specific state that will be attached to the GpuExecutor.
22+
23+
namespace xla {
24+
namespace gpu {
25+
26+
class GpuExecutorXLAState {
27+
public:
28+
explicit GpuExecutorXLAState(stream_executor::StreamExecutor *) {}
29+
InfeedManager *getOrCreateInfeedManager(stream_executor::StreamExecutor *se) {
30+
tensorflow::mutex_lock l(mu_);
31+
if (!infeed_manager_) {
32+
infeed_manager_ = std::make_unique<InfeedManager>(se);
33+
}
34+
return infeed_manager_.get();
35+
}
36+
37+
private:
38+
tensorflow::mutex mu_;
39+
std::unique_ptr<InfeedManager> infeed_manager_ ABSL_GUARDED_BY(mu_);
40+
};
41+
42+
} // namespace gpu
43+
} // namespace xla
44+
45+
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_XLA_EXECUTOR_STATE_H_

tensorflow/stream_executor/cuda/cuda_gpu_executor.cc

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,6 @@ GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
115115
return cuda_exec->gpu_context();
116116
}
117117

118-
GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
119-
return static_cast<GpuExecutor*>(stream_exec->implementation());
120-
}
121-
122118
GpuExecutor::~GpuExecutor() {
123119
CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
124120
CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";

tensorflow/stream_executor/gpu/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ cc_library(
4545
srcs = if_gpu_is_configured(["gpu_activation.cc"]),
4646
hdrs = if_gpu_is_configured(["gpu_activation.h"]),
4747
deps = if_gpu_is_configured([
48+
":gpu_executor_header",
4849
":gpu_activation_header",
4950
":gpu_driver_header",
5051
"//tensorflow/stream_executor",
@@ -109,6 +110,7 @@ cc_library(
109110
"//tensorflow/stream_executor:event",
110111
"//tensorflow/stream_executor:platform",
111112
"//tensorflow/stream_executor:stream_executor_internal",
113+
"//tensorflow/stream_executor:stream_executor_pimpl_header",
112114
"//tensorflow/stream_executor/lib",
113115
"//tensorflow/stream_executor/platform",
114116
"@com_google_absl//absl/strings",

tensorflow/stream_executor/gpu/gpu_activation.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,14 @@ limitations under the License.
1616
#include "tensorflow/stream_executor/gpu/gpu_activation.h"
1717

1818
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
19+
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
1920
#include "tensorflow/stream_executor/stream_executor.h"
2021
#include "tensorflow/stream_executor/stream_executor_internal.h"
2122

2223
namespace stream_executor {
2324
namespace gpu {
2425

2526
GpuContext* ExtractGpuContext(GpuExecutor* gpu_exec);
26-
GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec);
2727

2828
ScopedActivateExecutorContext::ScopedActivateExecutorContext(
2929
GpuExecutor* gpu_exec)

tensorflow/stream_executor/gpu/gpu_executor.h

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@ limitations under the License.
2222
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
2323
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
2424

25+
#include <memory>
2526
#include <set>
27+
#include <type_traits>
2628
#include <unordered_map>
2729

2830
#include "absl/strings/string_view.h"
29-
#include "absl/synchronization/mutex.h"
31+
#include "tensorflow/core/platform/mutex.h"
3032
#include "tensorflow/core/platform/thread_annotations.h"
3133
#include "tensorflow/stream_executor/event.h"
3234
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
@@ -35,13 +37,55 @@ limitations under the License.
3537
#include "tensorflow/stream_executor/platform.h"
3638
#include "tensorflow/stream_executor/platform/port.h"
3739
#include "tensorflow/stream_executor/stream_executor_internal.h"
40+
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
3841

3942
namespace stream_executor {
43+
44+
class StreamExecutor;
45+
4046
namespace gpu {
4147

48+
// Pointer-to-implementation object type with virtual destruction for any XLA
49+
// specific data hanging off of the GpuExecutor.
50+
class XLAInterface {
51+
public:
52+
// Default constructor for the abstract interface.
53+
explicit XLAInterface() {}
54+
55+
// Default destructor for the abstract interface.
56+
virtual ~XLAInterface() {}
57+
};
58+
4259
// CUDA-platform implementation of the platform-agnostic
4360
// StreamExecutorInterface.
4461
class GpuExecutor : public internal::StreamExecutorInterface {
62+
// Helper classes to attach a type erased state to the GpuExecutor. Currently,
63+
// we just need to support some XLA specific state.
64+
class Object {
65+
struct Concept {
66+
virtual ~Concept() {}
67+
};
68+
template <typename T>
69+
struct Model : Concept {
70+
explicit Model(StreamExecutor* se) : object(se) {}
71+
T object;
72+
};
73+
74+
public:
75+
template <typename T>
76+
T* getOrCreate(StreamExecutor* se) {
77+
tensorflow::mutex_lock l(mu_);
78+
if (!object_) {
79+
object_ = std::make_unique<Model<T>>(se);
80+
}
81+
return &(dynamic_cast<Model<T>*>(object_.get())->object);
82+
}
83+
84+
private:
85+
tensorflow::mutex mu_;
86+
std::unique_ptr<Concept> object_ ABSL_GUARDED_BY(mu_);
87+
};
88+
4589
public:
4690
// sub_platform indicates the subplatform used in this executor; it must
4791
// be a CUDA type.
@@ -233,6 +277,20 @@ class GpuExecutor : public internal::StreamExecutorInterface {
233277

234278
GpuContext* gpu_context();
235279

280+
// Provide a type-erased way of attaching arbitrary XLA specific state to the
281+
// GpuExecutor. XLA based execution will use this method to attach per-stream
282+
// executor XLA specific objects (like the Infeed and Outfeed managers) to the
283+
// stream executor, so that their lifetimes can be tied to the lifetime of the
284+
// stream executor for which that object is allocated for. This simplifies
285+
// memory management as compared to having these objects reside on the side
286+
// and then either leaking or having to implement callbacks that the SE
287+
// destructors call to deallocate any side state that is associated with that
288+
// SE object.
289+
template <typename T>
290+
T* getOrCreateXLAState(StreamExecutor* se) {
291+
return xla_state_.getOrCreate<T>(se);
292+
}
293+
236294
private:
237295
// Attempts to find a more specific version of the file indicated by
238296
// filename by looking for compute-capability-specific suffixed versions; i.e.
@@ -337,9 +395,16 @@ class GpuExecutor : public internal::StreamExecutorInterface {
337395
// The plugin configuration associated with this instance.
338396
PluginConfig plugin_config_;
339397

398+
// Type erased XLA specific state attached to GpuExecutor.
399+
Object xla_state_;
400+
340401
SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
341402
};
342403

404+
inline GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
405+
return static_cast<GpuExecutor*>(stream_exec->implementation());
406+
}
407+
343408
} // namespace gpu
344409
} // namespace stream_executor
345410

0 commit comments

Comments (0)