Skip to content

Commit 40ec155

Browse files
desertfire authored and pytorchmergebot committed
[AOTI][refactor] Split common aoti_runtime utils into a separate header (#119066)
Summary: Split common utils from aoti_runtime/model.h into a separate header file, because when turning on ABI-compatible mode for JIT Inductor we won't need AOTInductorModel, but we do need some common utils, e.g. RAIIAtenTensorHandle. Differential Revision: [D53478809](https://our.internmc.facebook.com/intern/diff/D53478809) Pull Request resolved: #119066 Approved by: https://github.com/khabinov
1 parent 059994d commit 40ec155

File tree

8 files changed

+225
-205
lines changed

8 files changed

+225
-205
lines changed

torch/_inductor/codegen/aoti_runtime/interface.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -341,8 +341,4 @@ AOTIRuntimeError AOTInductorModelUpdateConstantsMap(
341341
})
342342
}
343343

344-
#define CACHE_TORCH_DTYPE(typename) static auto cached_torch_dtype_##typename = aoti_torch_dtype_##typename()
345-
346-
static auto cached_torch_device_type_cpu = aoti_torch_device_type_cpu();
347-
static auto cached_torch_device_type_cuda = aoti_torch_device_type_cuda();
348344
} // extern "C"

torch/_inductor/codegen/wrapper.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1421,6 +1421,7 @@ def __init__(self):
14211421
self.declared_int_array_vars = set()
14221422
self.tmp_tensor_id = count() # for tmp tensor local variable declarations
14231423
self.arg_var_id = count()
1424+
self.used_cached_devices = set()
14241425
self.used_cached_dtypes = set()
14251426
self.cached_output_id = count()
14261427
self.scalar_to_tensor_id = count()
@@ -2047,6 +2048,8 @@ def finalize_prefix(self):
20472048
if config.abi_compatible:
20482049
for dtype in self.used_cached_dtypes:
20492050
cached_dtypes_buffer.writeline(f"CACHE_TORCH_DTYPE({dtype});")
2051+
for device in self.used_cached_devices:
2052+
cached_dtypes_buffer.writeline(f"CACHE_TORCH_DEVICE({device});")
20502053
cached_dtypes_buffer.splice(self.prefix)
20512054
self.prefix = cached_dtypes_buffer
20522055

@@ -2521,6 +2524,7 @@ def generate_inf_and_nan_checker(self, nodes):
25212524

25222525
def codegen_device(self, device):
25232526
if config.abi_compatible:
2527+
self.used_cached_devices.add(device.type)
25242528
return f"cached_torch_device_type_{device.type},{device.index if device.index else 0}"
25252529
else:
25262530
from .cpp import DEVICE_TO_ATEN
@@ -3078,7 +3082,11 @@ def write_header(self):
30783082
super().write_header()
30793083

30803084
self.header.splice("#include <filesystem>")
3081-
if not config.abi_compatible:
3085+
if config.abi_compatible:
3086+
self.header.splice(
3087+
"#include <torch/csrc/inductor/aoti_runtime/utils_cuda.h>"
3088+
)
3089+
else:
30823090
self.header.splice(
30833091
"""
30843092
#include <c10/cuda/CUDAGuard.h>

torch/csrc/inductor/aoti_runtime/arrayref_tensor.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#pragma once
22

3-
#include <torch/csrc/inductor/aoti_runtime/model.h>
4-
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
3+
#include <torch/csrc/inductor/aoti_runtime/utils.h>
54

65
#include <assert.h>
76
#include <cstdint>

torch/csrc/inductor/aoti_runtime/interface.h

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,10 @@
11
#pragma once
22

3-
#include <stddef.h>
4-
#include <stdint.h>
5-
63
// WARNING: Be careful when adding new includes here. This header will be used
74
// in model.so, and should not refer to any aten/c10 headers except the stable
85
// C ABI defined in torch/csrc/inductor/aoti_torch/c/shim.h. The same rule
96
// applies to other files under torch/csrc/inductor/aoti_runtime/.
10-
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
11-
12-
#ifdef __GNUC__
13-
#define AOT_INDUCTOR_EXPORT __attribute__((__visibility__("default")))
14-
#else // !__GNUC__
15-
#ifdef _WIN32
16-
#define AOT_INDUCTOR_EXPORT __declspec(dllexport)
17-
#else // !_WIN32
18-
#define AOT_INDUCTOR_EXPORT
19-
#endif // _WIN32
20-
#endif // __GNUC__
21-
22-
using AOTIRuntimeError = int32_t;
23-
#define AOTI_RUNTIME_SUCCESS 0
24-
#define AOTI_RUNTIME_FAILURE 1
25-
26-
#define AOTI_RUNTIME_ERROR_CODE_CHECK(call) \
27-
if ((call) != AOTI_RUNTIME_SUCCESS) { \
28-
throw std::runtime_error( \
29-
std::string(#call " API call failed at ") + __FILE__ + ", line " + \
30-
std::to_string(__LINE__)); \
31-
}
7+
#include <torch/csrc/inductor/aoti_runtime/utils.h>
328

339
extern "C" {
3410
struct AOTInductorModelOpaque;

torch/csrc/inductor/aoti_runtime/model.h

Lines changed: 1 addition & 171 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,15 @@
11
#pragma once
22

3-
#include <functional>
4-
#include <iostream>
5-
#include <memory>
63
#include <optional>
74
#include <regex>
8-
#include <sstream>
9-
#include <stdexcept>
10-
#include <string>
115
#include <unordered_map>
12-
#include <vector>
136

147
// WARNING: Be careful when adding new includes here. This header will be used
158
// in model.so, and should not refer to any aten/c10 headers except the stable
169
// C ABI defined in torch/csrc/inductor/aoti_torch/c/shim.h. The same rule
1710
// applies to other files under torch/csrc/inductor/aoti_runtime/.
1811
#include <torch/csrc/inductor/aoti_runtime/device_utils.h>
19-
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
12+
#include <torch/csrc/inductor/aoti_runtime/utils.h>
2013

2114
#define AOTI_RUNTIME_CHECK(EXPR, MSG) \
2215
do { \
@@ -26,14 +19,6 @@
2619
} \
2720
} while (0)
2821

29-
#if defined(__GNUC__) || defined(__clang__)
30-
#define AOTI_NOINLINE __attribute__((noinline))
31-
#elif _MSC_VER
32-
#define AOTI_NOINLINE __declspec(noinline)
33-
#else
34-
#define AOTI_NOINLINE
35-
#endif
36-
3722
// At codegen time, we write out a binary file called constants.bin.
3823
// We then turn the raw binary to an object file that exposes this
3924
// symbol and link it into the final .so.
@@ -63,146 +48,10 @@ CUDAPtr RAII_cudaMalloc(size_t num_bytes) {
6348

6449
} // anonymous namespace
6550

66-
AOTI_NOINLINE static void throw_exception(
67-
const char* call,
68-
const char* file,
69-
int64_t line) {
70-
std::stringstream ss;
71-
ss << call << " API call failed at " << file << ", line " << line;
72-
throw std::runtime_error(ss.str());
73-
}
74-
75-
#define AOTI_TORCH_ERROR_CODE_CHECK(call) \
76-
if ((call) != AOTI_TORCH_SUCCESS) { \
77-
throw_exception(#call, __FILE__, __LINE__); \
78-
}
79-
80-
using DeleterFnPtr = void (*)(void*);
81-
8251
namespace torch {
8352
namespace aot_inductor {
84-
85-
inline void noop_deleter(void*) {}
86-
87-
inline void delete_tensor_object(void* ptr) {
88-
AOTI_TORCH_ERROR_CODE_CHECK(
89-
aoti_torch_delete_tensor_object(reinterpret_cast<AtenTensorHandle>(ptr)));
90-
}
91-
92-
// RAIIAtenTensorHandle steals the tensor objects created by the libtorch C ABI
93-
class RAIIAtenTensorHandle {
94-
public:
95-
RAIIAtenTensorHandle() : handle_(nullptr, noop_deleter) {}
96-
RAIIAtenTensorHandle(const RAIIAtenTensorHandle& other) = delete;
97-
RAIIAtenTensorHandle& operator=(const RAIIAtenTensorHandle& other) = delete;
98-
99-
// Steal the ownership from another RAIIAtenTensorHandle using std::move
100-
RAIIAtenTensorHandle(RAIIAtenTensorHandle&& other) = default;
101-
RAIIAtenTensorHandle& operator=(RAIIAtenTensorHandle&& other) = default;
102-
103-
// Steal the ownership from raw AtenTensorHandle
104-
RAIIAtenTensorHandle(AtenTensorHandle handle)
105-
: handle_(handle, delete_tensor_object) {}
106-
107-
~RAIIAtenTensorHandle() {
108-
handle_.reset();
109-
}
110-
111-
// Return a raw AtenTensorHandle to be used by aoti_torch functions
112-
// Note: this function does NOT transfer the ownership of the handle
113-
operator AtenTensorHandle() const {
114-
return handle_.get();
115-
}
116-
117-
AtenTensorHandle release() {
118-
return handle_.release();
119-
}
120-
121-
AtenTensorHandle get() const {
122-
return handle_.get();
123-
}
124-
125-
void reset() {
126-
handle_.reset();
127-
}
128-
129-
int64_t size(int64_t d) {
130-
int64_t size;
131-
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_size(handle_.get(), d, &size));
132-
return size;
133-
}
134-
135-
int64_t stride(int64_t d) {
136-
int64_t stride;
137-
AOTI_TORCH_ERROR_CODE_CHECK(
138-
aoti_torch_get_stride(handle_.get(), d, &stride));
139-
return stride;
140-
}
141-
142-
int64_t storage_offset() {
143-
int64_t storage_offset;
144-
AOTI_TORCH_ERROR_CODE_CHECK(
145-
aoti_torch_get_storage_offset(handle_.get(), &storage_offset));
146-
return storage_offset;
147-
}
148-
149-
private:
150-
std::unique_ptr<AtenTensorOpaque, DeleterFnPtr> handle_;
151-
};
152-
15353
using ConstantMap = std::unordered_map<std::string, RAIIAtenTensorHandle>;
15454

155-
class ConstantHandle {
156-
public:
157-
ConstantHandle() = default;
158-
159-
explicit ConstantHandle(AtenTensorHandle handle) : handle_(handle) {
160-
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_data_ptr(handle_, &data_));
161-
}
162-
163-
operator AtenTensorHandle() const {
164-
return handle_;
165-
}
166-
167-
AtenTensorHandle tensor() const {
168-
return handle_;
169-
}
170-
171-
void* data_ptr() const {
172-
return data_;
173-
}
174-
175-
private:
176-
AtenTensorHandle handle_;
177-
void* data_ = nullptr;
178-
};
179-
180-
inline void* get_data_ptr_wrapper(const ConstantHandle& constant) {
181-
return constant.data_ptr();
182-
}
183-
184-
inline const ConstantHandle& unwrap_raii_handle_if_needed(
185-
const ConstantHandle& handle) {
186-
return handle;
187-
}
188-
189-
// Shouldn't be called.
190-
inline AtenTensorHandle wrap_with_raii_handle_if_needed(
191-
const ConstantHandle& handle) = delete;
192-
193-
// Steal the ownership from raw AtenTensorHandle to RAIIAtenTensorHandle
194-
inline std::vector<RAIIAtenTensorHandle> steal_from_raw_handles_to_raii_handles(
195-
AtenTensorHandle* handles,
196-
size_t size) {
197-
std::vector<RAIIAtenTensorHandle> result;
198-
result.reserve(size);
199-
for (size_t i = 0; i < size; i++) {
200-
result.emplace_back(handles[i]);
201-
handles[i] = nullptr;
202-
}
203-
return result;
204-
}
205-
20655
// valid device strs are: cpu, cuda, cuda:0, cuda:1, ...
20756
// Update the list here if more devices are supported in the future
20857
inline void parse_device_str(
@@ -644,24 +493,5 @@ class AOTInductorModel : public AOTInductorModelBase<AOTInductorModel> {
644493
std::unique_ptr<AOTInductorModelKernelsBase> kernels_;
645494
};
646495

647-
#ifdef USE_CUDA
648-
class AOTICudaStreamGuard {
649-
public:
650-
AOTICudaStreamGuard(cudaStream_t stream, int32_t device_index) {
651-
CUDAStreamGuardHandle ptr;
652-
AOTI_TORCH_ERROR_CODE_CHECK(
653-
aoti_torch_create_cuda_stream_guard(stream, device_index, &ptr));
654-
guard_ =
655-
std::unique_ptr<void, std::function<void(void*)>>(ptr, [](void* ptr) {
656-
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_delete_cuda_stream_guard(
657-
reinterpret_cast<CUDAStreamGuardHandle>(ptr)));
658-
});
659-
}
660-
661-
private:
662-
std::unique_ptr<void, std::function<void(void*)>> guard_;
663-
};
664-
#endif // USE_CUDA
665-
666496
} // namespace aot_inductor
667497
} // namespace torch

torch/csrc/inductor/aoti_runtime/scalar_to_tensor.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#pragma once
22

3-
#include <torch/csrc/inductor/aoti_runtime/model.h>
4-
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
3+
#include <torch/csrc/inductor/aoti_runtime/utils.h>
54

65
namespace torch {
76
namespace aot_inductor {

0 commit comments

Comments (0)