Commit 29b51ee

[AOTI] Update cpp wrapper codegen to use v2 C shim
Summary: To use the torchgen-ed v2 C shim interface, the cpp wrapper codegen needs updated rules for generating the right parameters and function calls. Because changing the emitted code would cause a forward-compatibility (FC) breakage, we add a config flag to control the behavior.

ghstack-source-id: 0bcec4e
Pull Request resolved: #120714
1 parent 10c096b commit 29b51ee
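
The core of the change is the naming rule used when emitting extern C shim calls: v1 keeps the existing device-agnostic aoti_torch_* names, while v2 selects the torchgen-ed, per-device aoti_torch_{device}_* names (see the cpp_wrapper_cpu.py hunks below). A minimal sketch of that rule, not the actual codegen helper, with an illustrative kernel suffix:

def shim_fn_name(kernel_suffix: str, device: str, c_shim_version: str) -> str:
    # v1: device-agnostic, hand-written shim functions
    if c_shim_version == "1":
        return f"aoti_torch_{kernel_suffix}"
    # v2: torchgen-ed, per-device shim functions
    return f"aoti_torch_{device}_{kernel_suffix}"

# illustrative suffix only; real suffixes come from the fallback kernel name
assert shim_fn_name("convolution", "cpu", "1") == "aoti_torch_convolution"
assert shim_fn_name("convolution", "cuda", "2") == "aoti_torch_cuda_convolution"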

File tree

4 files changed: +38 -8 lines changed

torch/_inductor/codegen/cpp_wrapper_cpu.py
Lines changed: 21 additions & 6 deletions

@@ -24,8 +24,9 @@ class CppWrapperCpu(WrapperCodeGen):
     """
 
     def __init__(self):
+        if not hasattr(self, "device"):
+            self.device = "cpu"
         super().__init__()
-
         self.declare = "auto "
         self.declare_maybe_reference = "decltype(auto) "
         self.ending = ";"

@@ -148,7 +149,12 @@ def write_header(self):
         )
 
         if config.abi_compatible:
-            self.header.splice("#include <torch/csrc/inductor/aoti_torch/c/shim.h>")
+            if config.c_shim_version == "1":
+                self.header.splice("#include <torch/csrc/inductor/aoti_torch/c/shim.h>")
+            else:
+                self.header.splice(
+                    f"#include <torch/csrc/inductor/aoti_torch/generated/c_shim_{self.device}.h>"
+                )
         else:
             if not V.graph.aot_mode:
                 self.header.splice("#include <pybind11/pybind11.h>")

@@ -915,7 +921,11 @@ def generate_c_shim_extern_kernel_call(self, kernel, args):
         kernel_suffix = kernel_tokens[-1]
         if kernel_suffix == "call":
             kernel_suffix = kernel_tokens[-2]
-        shim_fn = f"aoti_torch_{kernel_suffix}"
+        if config.c_shim_version == "1":
+            shim_fn = f"aoti_torch_{kernel_suffix}"
+        else:
+            shim_fn = f"aoti_torch_{self.device}_{kernel_suffix}"
+
         # HACK: val_to_arg_str jams multiple arguments together using a comma. If that
         # ever breaks, it needs to be reworked to be able to return multiple arguments,
         # and the split-on-comma code here needs to be removed.

@@ -1664,12 +1674,17 @@ def val_to_cpp_arg_str(self, type_, val, is_legacy_abi) -> str:
         ):
             if val is None:
                 return "0"  # nullptr is not available in C
-            if isinstance(val, (bool, int, str, float)):
+            if not isinstance(type_.getElementType(), torch.TensorType):
                 var_name = f"var_{next(self.arg_var_id)}"
                 self.writeline(f"auto {var_name} = {self.val_to_arg_str(val)};")
                 return f"&{var_name}"
-            if not isinstance(type_.getElementType(), torch.TensorType):
-                return f"&{self.val_to_arg_str(val)}"
+            elif config.c_shim_version == "2":
+                # Similar to other data type, use pointer to denote optional tensor arg in v2 C shim
+                var_name = f"var_{next(self.arg_var_id)}"
+                self.writeline(
+                    f"AtenTensorHandle {var_name} = {self.val_to_arg_str(val)}.get();"
+                )
+                return f"&{var_name}"
 
         return self.val_to_arg_str(val)

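For optional arguments, the v2 shim passes everything by pointer, including optional tensors. A rough sketch of the lowering decision in the last hunk above, with an assumed helper name and simplified signature rather than the real val_to_cpp_arg_str:

def lower_optional_arg(val_expr: str, is_tensor: bool, c_shim_version: str, var_id: int):
    """Return (declaration lines, argument expression) for one optional arg."""
    var_name = f"var_{var_id}"
    if not is_tensor:
        # non-tensor optionals: materialize a local and pass its address
        return [f"auto {var_name} = {val_expr};"], f"&{var_name}"
    if c_shim_version == "2":
        # v2: an optional tensor is likewise passed as a pointer to an AtenTensorHandle
        return [f"AtenTensorHandle {var_name} = {val_expr}.get();"], f"&{var_name}"
    # v1: optional tensors fall through to the generic handling elsewhere in the codegen
    return [], val_expr

lines, arg = lower_optional_arg("buf2", is_tensor=True, c_shim_version="2", var_id=0)
# lines == ["AtenTensorHandle var_0 = buf2.get();"], arg == "&var_0"
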
torch/_inductor/codegen/cpp_wrapper_cuda.py
Lines changed: 1 addition & 0 deletions

@@ -43,6 +43,7 @@ class CppWrapperCuda(CppWrapperCpu):
     """
 
     def __init__(self):
+        self.device = "cuda"
         super().__init__()
        self.grid_id = count()
        self.cuda = True

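The two __init__ changes above form a simple device-defaulting pattern: the CUDA wrapper picks its device before delegating to the base constructor, and the base constructor only falls back to "cpu" when no device was chosen. A minimal, self-contained sketch of the pattern, with illustrative class names rather than the real wrapper classes:

class BaseWrapper:
    def __init__(self):
        # only default to "cpu" when a subclass has not already chosen a device
        if not hasattr(self, "device"):
            self.device = "cpu"

class CudaWrapper(BaseWrapper):
    def __init__(self):
        self.device = "cuda"  # set before super().__init__() so the default is skipped
        super().__init__()

assert BaseWrapper().device == "cpu"
assert CudaWrapper().device == "cuda"
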
torch/_inductor/config.py
Lines changed: 4 additions & 0 deletions

@@ -32,6 +32,10 @@ def is_fbcode():
     os.environ.get("TORCHINDUCTOR_ABI_COMPATIBLE", "1" if is_fbcode() else "0") == "1"
 )
 
+c_shim_version = os.environ.get(
+    "TORCHINDUCTOR_C_SHIM_VERSION", "1" if is_fbcode() else "2"
+)
+
 # dead code elimination
 dce = False

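The new config.c_shim_version knob defaults to the v1 shim inside fbcode and to the v2 shim elsewhere, and can be overridden through the TORCHINDUCTOR_C_SHIM_VERSION environment variable. One way to pin the old behavior, assuming the variable is set before torch._inductor.config is first imported (the default is read at import time):

import os

# keep emitting v1 C shim calls (e.g. to avoid the FC breakage mentioned in the summary)
os.environ["TORCHINDUCTOR_C_SHIM_VERSION"] = "1"

from torch._inductor import config
assert config.c_shim_version == "1"
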
torch/_inductor/ir.py
Lines changed: 12 additions & 2 deletions

@@ -4812,7 +4812,10 @@ def is_not_write(arg):
         self.init_args_default_value(kernel._schema)
 
     def is_legacy_abi_kernel(self):
-        return "_scaled_dot_product_flash_attention" in str(self.python_kernel_name)
+        return (
+            config.c_shim_version == "1"
+            and "_scaled_dot_product_flash_attention" in str(self.python_kernel_name)
+        )
 
     def init_args_default_value(self, schema):
         self.args_default_value = [

@@ -4865,6 +4868,7 @@ def __repr__(self):
         self.abi_compatible_kernel = (
             f"{self.cpp_kernel_name}_v2"
             if self.cpp_kernel_name in {"at::_scaled_dot_product_flash_attention"}
+            and config.c_shim_version == "1"
             else self.cpp_kernel_name
         )
 

@@ -5022,7 +5026,13 @@ def codegen(self, wrapper):
             # Aten Fallback Ops
             assert isinstance(kernel, torch._ops.OpOverload)
             if V.graph.cpp_wrapper:
-                if config.is_fbcode() and kernel not in has_c_shim:
+                if (
+                    config.is_fbcode()
+                    and kernel not in has_c_shim
+                    # C shim v2 is torchgen-ed, which should cover all aten ops.
+                    # If you do hit a missed op, please update gen_aoti_c_shim.py.
+                    and config.c_shim_version == "1"
+                ):
                     log.warning(
                         "%s is missing a c-shim implementation, using proxy executor as fallback",
                         kernel,