Commit 6ddf5cf

desertfire authored and pytorchmergebot committed
[AOTI] Update cpp wrapper codegen to use v2 C shim (#120714)
Summary: To use the torchgen-ed v2 C shim interface, cpp wrapper codegen needs to update its rules for generating the right parameters and function calls. Because changing the emitted code would cause an FC breakage, we add a flag to control the behavior.

Differential Revision: [D54258086](https://our.internmc.facebook.com/intern/diff/D54258086)

Pull Request resolved: #120714
Approved by: https://github.com/chenyang78
ghstack dependencies: #120513
1 parent bd19d6d commit 6ddf5cf
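In effect, the new flag switches the wrapper codegen between two shim naming schemes. A minimal sketch of the resulting rule (not the actual Inductor code; the kernel string, op name, and devices are placeholder examples):

def shim_fn_for(kernel: str, device: str, c_shim_version: str) -> str:
    # mirror of generate_c_shim_extern_kernel_call: take the op suffix from the
    # kernel name, skipping a trailing "call" token
    tokens = kernel.split("::")
    suffix = tokens[-1]
    if suffix == "call":
        suffix = tokens[-2]
    if c_shim_version == "1":
        return f"aoti_torch_{suffix}"        # v1: hand-written shim.h entry points
    return f"aoti_torch_{device}_{suffix}"   # v2: torchgen-ed, per-device entry points

print(shim_fn_for("at::_ops::foo::call", "cpu", "1"))   # aoti_torch_foo
print(shim_fn_for("at::_ops::foo::call", "cuda", "2"))  # aoti_torch_cuda_foo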

File tree (4 files changed: 45 additions, 8 deletions)

torch/_inductor/codegen/cpp_wrapper_cpu.py
torch/_inductor/codegen/cpp_wrapper_cuda.py
torch/_inductor/config.py
torch/_inductor/ir.py


torch/_inductor/codegen/cpp_wrapper_cpu.py

Lines changed: 28 additions & 6 deletions
@@ -24,8 +24,9 @@ class CppWrapperCpu(WrapperCodeGen):
     """
 
     def __init__(self):
+        if not hasattr(self, "device"):
+            self.device = "cpu"
         super().__init__()
-
         self.declare = "auto "
         self.declare_maybe_reference = "decltype(auto) "
         self.ending = ";"
@@ -149,7 +150,12 @@ def write_header(self):
         )
 
         if config.abi_compatible:
-            self.header.splice("#include <torch/csrc/inductor/aoti_torch/c/shim.h>")
+            if config.c_shim_version == "1":
+                self.header.splice("#include <torch/csrc/inductor/aoti_torch/c/shim.h>")
+            else:
+                self.header.splice(
+                    f"#include <torch/csrc/inductor/aoti_torch/generated/c_shim_{self.device}.h>"
+                )
         else:
             if not V.graph.aot_mode:
                 self.header.splice("#include <pybind11/pybind11.h>")
@@ -924,7 +930,11 @@ def generate_c_shim_extern_kernel_call(self, kernel, args):
         kernel_suffix = kernel_tokens[-1]
         if kernel_suffix == "call":
             kernel_suffix = kernel_tokens[-2]
-        shim_fn = f"aoti_torch_{kernel_suffix}"
+        if config.c_shim_version == "1":
+            shim_fn = f"aoti_torch_{kernel_suffix}"
+        else:
+            shim_fn = f"aoti_torch_{self.device}_{kernel_suffix}"
+
         # HACK: val_to_arg_str jams multiple arguments together using a comma. If that
         # ever breaks, it needs to be reworked to be able to return multiple arguments,
         # and the split-on-comma code here needs to be removed.
@@ -1676,12 +1686,24 @@ def val_to_cpp_arg_str(self, type_, val, is_legacy_abi) -> str:
         ):
             if val is None:
                 return "0"  # nullptr is not available in C
-            if isinstance(val, (bool, int, str, float)):
+            if not isinstance(type_.getElementType(), torch.TensorType):
                 var_name = f"var_{next(self.arg_var_id)}"
                 self.writeline(f"auto {var_name} = {self.val_to_arg_str(val)};")
                 return f"&{var_name}"
-            if not isinstance(type_.getElementType(), torch.TensorType):
-                return f"&{self.val_to_arg_str(val)}"
+            elif config.c_shim_version == "2":
+                # Similar to other data type, use pointer to denote optional tensor arg in v2 C shim
+                base_handle = self.val_to_arg_str(val)
+                if "wrap_with_raii_handle_if_needed" in base_handle:
+                    # wrap_with_raii_handle_if_needed creates a temp RAIIAtenTensorHandle, so we need to
+                    # explicitly store it. Otherwise, it will be destroyed before the fallback kernel call.
+                    tmp_var_name = f"var_{next(self.arg_var_id)}"
+                    self.writeline(
+                        f"RAIIAtenTensorHandle {tmp_var_name} = {base_handle};"
+                    )
+                    base_handle = tmp_var_name
+                var_name = f"var_{next(self.arg_var_id)}"
+                self.writeline(f"AtenTensorHandle {var_name} = {base_handle}.get();")
+                return f"&{var_name}"
 
         return self.val_to_arg_str(val)

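The val_to_cpp_arg_str change above is the subtlest part: under the v2 shim, an optional tensor argument is passed as a pointer to an AtenTensorHandle, so the wrapper must materialize a named handle first. A small self-contained sketch of that branch (the counter-based variable naming and the "buf0" input are illustrative, not the real class state):

from itertools import count

_arg_var_id = count()

def _new_var() -> str:
    return f"var_{next(_arg_var_id)}"

def optional_tensor_arg(base_handle: str) -> tuple[list[str], str]:
    """Return (lines to emit, argument string) for an optional tensor under the v2 shim."""
    lines = []
    if "wrap_with_raii_handle_if_needed" in base_handle:
        # the temporary RAIIAtenTensorHandle must be stored in a named variable,
        # otherwise it is destroyed before the fallback kernel call
        tmp = _new_var()
        lines.append(f"RAIIAtenTensorHandle {tmp} = {base_handle};")
        base_handle = tmp
    var = _new_var()
    lines.append(f"AtenTensorHandle {var} = {base_handle}.get();")
    return lines, f"&{var}"

lines, arg = optional_tensor_arg("wrap_with_raii_handle_if_needed(buf0)")
# lines -> ["RAIIAtenTensorHandle var_0 = wrap_with_raii_handle_if_needed(buf0);",
#           "AtenTensorHandle var_1 = var_0.get();"]
# arg   -> "&var_1"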
torch/_inductor/codegen/cpp_wrapper_cuda.py

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ class CppWrapperCuda(CppWrapperCpu):
     """
 
     def __init__(self):
+        self.device = "cuda"
         super().__init__()
         self.grid_id = count()
         self.cuda = True

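This one-line change works together with the hasattr guard added to CppWrapperCpu.__init__: the CUDA wrapper sets self.device before calling the base constructor, and the base constructor only defaults to "cpu" when no device was set. Reduced to the relevant lines (a sketch, not the full classes):

class CppWrapperCpu:
    def __init__(self):
        if not hasattr(self, "device"):
            self.device = "cpu"  # default when the CPU wrapper is used directly

class CppWrapperCuda(CppWrapperCpu):
    def __init__(self):
        self.device = "cuda"     # set before the base __init__ runs
        super().__init__()

assert CppWrapperCpu().device == "cpu"
assert CppWrapperCuda().device == "cuda"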
torch/_inductor/config.py

Lines changed: 4 additions & 0 deletions
@@ -41,6 +41,10 @@ def enable_autotune_remote_cache():
     os.environ.get("TORCHINDUCTOR_ABI_COMPATIBLE", "1" if is_fbcode() else "0") == "1"
 )
 
+c_shim_version = os.environ.get(
+    "TORCHINDUCTOR_C_SHIM_VERSION", "1" if is_fbcode() else "2"
+)
+
 # dead code elimination
 dce = False

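The default keeps fbcode on v1 (avoiding the FC breakage mentioned in the summary) while OSS builds get v2. The knob can be flipped explicitly, for example (a usage sketch; the environment variable must be set before torch._inductor.config is imported, or the module attribute can be assigned afterwards):

import os
os.environ["TORCHINDUCTOR_C_SHIM_VERSION"] = "1"  # force the v1 C shim

import torch._inductor.config as inductor_config
assert inductor_config.c_shim_version == "1"

# alternatively, override the module-level setting after import
inductor_config.c_shim_version = "2"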
torch/_inductor/ir.py

Lines changed: 12 additions & 2 deletions
@@ -4855,7 +4855,10 @@ def is_not_write(arg):
         self.init_args_default_value(kernel._schema)
 
     def is_legacy_abi_kernel(self):
-        return "_scaled_dot_product_flash_attention" in str(self.python_kernel_name)
+        return (
+            config.c_shim_version == "1"
+            and "_scaled_dot_product_flash_attention" in str(self.python_kernel_name)
+        )
 
     def init_args_default_value(self, schema):
         self.args_default_value = [
@@ -4908,6 +4911,7 @@ def __repr__(self):
         self.abi_compatible_kernel = (
             f"{self.cpp_kernel_name}_v2"
             if self.cpp_kernel_name in {"at::_scaled_dot_product_flash_attention"}
+            and config.c_shim_version == "1"
             else self.cpp_kernel_name
         )
 
@@ -5065,7 +5069,13 @@ def codegen(self, wrapper):
         # Aten Fallback Ops
         assert isinstance(kernel, torch._ops.OpOverload)
         if V.graph.cpp_wrapper:
-            if config.is_fbcode() and kernel not in has_c_shim:
+            if (
+                config.is_fbcode()
+                and kernel not in has_c_shim
+                # C shim v2 is torchgen-ed, which should cover all aten ops.
+                # If you do hit a missed op, please update gen_aoti_c_shim.py.
+                and config.c_shim_version == "1"
+            ):
                 log.warning(
                     "%s is missing a c-shim implementation, using proxy executor as fallback",
                     kernel,

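Taken together, the ir.py edits make the v1-only special cases conditional on the new flag: the _scaled_dot_product_flash_attention legacy-ABI path, the _v2 kernel-name rewrite, and the fbcode proxy-executor fallback for ops without a C shim apply only when c_shim_version == "1", since the v2 shim is torchgen-ed and is expected to cover all aten ops. A condensed sketch of that gating (free functions standing in for the real methods):

def is_legacy_abi_kernel(python_kernel_name: str, c_shim_version: str) -> bool:
    # the legacy-ABI path for scaled_dot_product_flash_attention is only
    # relevant for the v1 shim
    return (
        c_shim_version == "1"
        and "_scaled_dot_product_flash_attention" in python_kernel_name
    )

def needs_proxy_executor_fallback(is_fbcode: bool, has_shim: bool, c_shim_version: str) -> bool:
    # with the torchgen-ed v2 shim, a missing entry is a bug in gen_aoti_c_shim.py
    # rather than a reason to fall back
    return is_fbcode and not has_shim and c_shim_version == "1"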