
Commit aa988ca

Update on "[AOTI] Refine the C shim autogen mechanism"
Summary: Based on the discussions in #120513, instead of auto-generating C shim fallback ops for thousands of ops, we maintain a list of fallback ops based on torch/_inductor/lowering.py and only generate C shim functions for those ops. At torchgen time, we re-generate the C shim files and compare the header file contents against the existing C shim headers. If there is any change, the compilation fails with a prompt on how to proceed. This makes sure the ABI-compatible C shim layer stays small enough to maintain in the long run.

[ghstack-poisoned]
1 parent 49b81b2 commit aa988ca
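
The guard described in the summary amounts to regenerating the shim header in memory and diffing it against the checked-in copy, only overwriting it when the update flag is passed. Below is a minimal Python sketch of that idea using a hypothetical helper; the real logic lives in torchgen/gen.py, shown in the diff further down.

    # Minimal sketch of the compare-or-update guard (hypothetical helper,
    # not the actual torchgen code; see the torchgen/gen.py diff below).
    import os


    def check_or_update_c_shim(
        new_header: str,        # freshly generated header contents
        checked_in_dir: str,    # e.g. "torch/csrc/inductor/aoti_torch/generated"
        header_file_name: str,  # e.g. "c_shim_cpu.h"
        update: bool,           # corresponds to the --update-aoti-c-shim flag
    ) -> None:
        path = os.path.join(checked_in_dir, header_file_name)
        if update:
            # Explicit opt-in: overwrite the checked-in header with the new one.
            with open(path, "w") as f:
                f.write(new_header)
            return
        # Default path: fail if the generated header drifted from the
        # checked-in copy, and tell the developer how to proceed.
        with open(path) as f:
            old_header = f.read()
        assert old_header == new_header, (
            "Generated C shim header differs from the checked-in copy. "
            "If the change to the fallback op list is intentional, rerun "
            "torchgen with --update-aoti-c-shim and commit the result."
        )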

File tree

3 files changed (+33, −52 lines)


build_variables.bzl

Lines changed: 2 additions & 0 deletions

@@ -468,6 +468,7 @@ lazy_tensor_core_python_sources = [
 inductor_core_resources = [
     "torch/csrc/inductor/aoti_runner/model_container_runner.cpp",
     "torch/csrc/inductor/aoti_runner/model_container_runner_cpu.cpp",
+    "torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.cpp",
     "torch/csrc/inductor/aoti_torch/shim_common.cpp",
     "torch/csrc/inductor/aoti_torch/tensor_converter.cpp",
     "torch/csrc/inductor/inductor_ops.cpp",
@@ -656,6 +657,7 @@ libtorch_cuda_core_sources = [
     "torch/csrc/cuda/comm.cpp",
     "torch/csrc/cuda/memory_snapshot.cpp",
     "torch/csrc/inductor/aoti_runner/model_container_runner_cuda.cpp",
+    "torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.cpp",
     "torch/csrc/inductor/aoti_torch/shim_cuda.cpp",
     "torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp",
     "torch/csrc/profiler/stubs/cuda.cpp",

caffe2/CMakeLists.txt

Lines changed: 2 additions & 8 deletions

@@ -352,7 +352,6 @@ if(NOT INTERN_DISABLE_AUTOGRAD AND NOT BUILD_LITE_INTERPRETER)
     "${TORCH_SRC_DIR}/csrc/autograd/generated/TraceType_4.cpp"
     "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_0.cpp"
     "${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_1.cpp"
-    "${TORCH_SRC_DIR}/csrc/inductor/aoti_torch/generated/c_shim_cpu.cpp"
   )
   if(BUILD_LAZY_TS_BACKEND)
     list(APPEND GENERATED_CXX_TORCH
@@ -407,17 +406,12 @@ set(GENERATED_TESTING_PYTHON
   "${TORCH_SRC_DIR}/testing/_internal/generated/annotated_fn_args.py"
 )
 
-set(GENERATED_CXX_TORCH_CUDA
-  "${TORCH_SRC_DIR}/csrc/inductor/aoti_torch/generated/c_shim_cuda.cpp"
-)
-
 set(TORCH_GENERATED_CODE
   ${GENERATED_CXX_TORCH}
   ${GENERATED_H_TORCH}
   ${GENERATED_CXX_PYTHON}
   ${GENERATED_H_PYTHON}
   ${GENERATED_TESTING_PYTHON}
-  ${GENERATED_CXX_TORCH_CUDA}
 )
 
 set(GEN_PER_OPERATOR_FLAG)
@@ -966,7 +960,7 @@ endif()
 # Compile exposed libraries.
 if(USE_ROCM)
   set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
-  list(APPEND Caffe2_HIP_SRCS ${GENERATED_CXX_TORCH_CUDA})
+  list(APPEND Caffe2_HIP_SRCS)
   hip_add_library(torch_hip ${Caffe2_HIP_SRCS})
   if(USE_FLASH_ATTENTION)
     target_link_libraries(torch_hip PRIVATE __caffe2_aotriton)
@@ -986,7 +980,7 @@ if(USE_ROCM)
   endif()
 elseif(USE_CUDA)
   set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
-  list(APPEND Caffe2_GPU_SRCS ${GENERATED_CXX_TORCH_CUDA})
+  list(APPEND Caffe2_GPU_SRCS)
   if(CUDA_SEPARABLE_COMPILATION)
     # Separate compilation fails when kernels using `thrust::sort_by_key`
     # are linked with the rest of CUDA code. Workaround by linking them separately.

torchgen/gen.py

Lines changed: 29 additions & 44 deletions

@@ -3,7 +3,6 @@
 import json
 import os
 import pathlib
-import shutil
 
 from collections import defaultdict, namedtuple, OrderedDict
 from dataclasses import dataclass, field
@@ -2401,43 +2400,35 @@ def headers_for_aoti() -> str:
         existing_c_shim_path = "torch/csrc/inductor/aoti_torch/generated"
         header_file_name = f"c_shim_{dispatch_key.lower()}.h"
         cpp_file_name = f"c_shim_{dispatch_key.lower()}.cpp"
-        aoti_fm.write(
-            header_file_name,
-            lambda: gen_aoti_c_shim(
-                fallback_native_functions,
-                dispatch_key,
-                backend_indices,
-                header=True,
-                includes="",
-            ),
+        new_header = gen_aoti_c_shim(
+            fallback_native_functions,
+            dispatch_key,
+            backend_indices,
+            header=True,
+            includes="",
         )
-        aoti_fm.write(
-            cpp_file_name,
-            lambda: gen_aoti_c_shim(
-                fallback_native_functions,
-                dispatch_key,
-                backend_indices,
-                header=False,
-                includes=headers_for_aoti() + "\n" + extra_headers,
-            ),
+        new_cpp = gen_aoti_c_shim(
+            fallback_native_functions,
+            dispatch_key,
+            backend_indices,
+            header=False,
+            includes=headers_for_aoti() + "\n" + extra_headers,
         )
+
         if update_aoti_c_shim:
-            shutil.copy2(
-                os.path.join(aoti_fm.install_dir, header_file_name),
-                os.path.join(existing_c_shim_path, header_file_name),
+            aoti_fm.write(
+                header_file_name,
+                lambda: new_header,
             )
-            shutil.copy2(
-                os.path.join(aoti_fm.install_dir, cpp_file_name),
-                os.path.join(existing_c_shim_path, cpp_file_name),
+            aoti_fm.write(
+                cpp_file_name,
+                lambda: new_cpp,
             )
         else:
             with open(
                 os.path.join(existing_c_shim_path, header_file_name)
-            ) as old_file, open(
-                os.path.join(aoti_fm.install_dir, header_file_name)
-            ) as new_file:
+            ) as old_file:
                 old_header = old_file.read()
-                new_header = new_file.read()
                 assert (
                     old_header == new_header
                 ), """
@@ -2765,18 +2756,6 @@ def main() -> None:
         help="output directory",
         default="build/aten/src/ATen",
     )
-    parser.add_argument(
-        "--aoti-install-dir",
-        "--aoti_install_dir",
-        help="output directory for AOTInductor shim",
-        default="build/aoti/generated",
-    )
-    parser.add_argument(
-        "--update-aoti-c-shim",
-        action="store_true",
-        help="Update AOTInductor C shim after changing torchgen/aoti/fallback_ops.py. "
-        "WARNING: Do not use this unless you are sure what you are doing!!!",
-    )
     parser.add_argument(
         "--rocm",
         action="store_true",
@@ -2841,6 +2820,12 @@ def main() -> None:
         default=["headers", "sources", "declarations_yaml"],
         help="Generate only a subset of files",
     )
+    parser.add_argument(
+        "--update-aoti-c-shim",
+        action="store_true",
+        help="Update AOTInductor C shim after changing torchgen/aoti/fallback_ops.py. "
+        "WARNING: Do not use this unless you are sure what you are doing!!!",
+    )
 
     options = parser.parse_args()
 
@@ -2897,15 +2882,15 @@ def main() -> None:
     pathlib.Path(core_install_dir).mkdir(parents=True, exist_ok=True)
     ops_install_dir = f"{options.install_dir}/ops"
     pathlib.Path(ops_install_dir).mkdir(parents=True, exist_ok=True)
-    aoti_install_dir = f"{options.aoti_install_dir}"
-    pathlib.Path(aoti_install_dir).mkdir(parents=True, exist_ok=True)
 
     core_fm = make_file_manager(options=options, install_dir=core_install_dir)
     cpu_fm = make_file_manager(options=options)
     cpu_vec_fm = make_file_manager(options=options)
     cuda_fm = make_file_manager(options=options)
     ops_fm = make_file_manager(options=options, install_dir=ops_install_dir)
-    aoti_fm = make_file_manager(options=options, install_dir=aoti_install_dir)
+    aoti_fm = make_file_manager(
+        options=options, install_dir="torch/csrc/inductor/aoti_torch/generated"
+    )
 
     # Only a limited set of dispatch keys get CPUFunctions.h headers generated
     # for them; this is the set
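
For the other half of the mechanism described in the summary, only a curated list of fallback ops (kept in torchgen/aoti/fallback_ops.py, per the new --update-aoti-c-shim help text) receives generated shim functions. A rough sketch of that filtering step follows; the names (select_fallback_functions, op_name, fallback_op_names) are hypothetical, since that code is not part of this diff.

    # Hypothetical sketch (illustrative names only): keep just the native
    # functions whose operator name appears in the curated fallback-op list,
    # and generate C shim functions for that subset instead of for every op.
    from typing import Callable, Iterable, List, Set


    def select_fallback_functions(
        native_functions: Iterable[object],  # stand-in for torchgen NativeFunction objects
        op_name: Callable[[object], str],    # stand-in for extracting an op's qualified name
        fallback_op_names: Set[str],         # curated list, kept in sync with
                                             # torch/_inductor/lowering.py per the summary
    ) -> List[object]:
        return [f for f in native_functions if op_name(f) in fallback_op_names]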
