pytorch
diff --git a/‎benchmarks/dynamo/common.py‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/dynamo/common.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎test/inductor/aot/cpp/CMakeLists.txt‎
Lines changed: 23 additions & 0 deletions b/‎test/inductor/aot/cpp/CMakeLists.txt‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎test/inductor/aot/cpp/test.cpp‎
Lines changed: 41 additions & 0 deletions b/‎test/inductor/aot/cpp/test.cpp‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎test/inductor/aot/cpp/test.py‎
Lines changed: 22 additions & 0 deletions b/‎test/inductor/aot/cpp/test.py‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎test/inductor/aot/cpp/test.sh‎
Lines changed: 8 additions & 0 deletions b/‎test/inductor/aot/cpp/test.sh‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎torch/_inductor/__init__.py‎
Lines changed: 21 additions & 0 deletions b/‎torch/_inductor/__init__.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎torch/_inductor/codecache.py‎
Lines changed: 46 additions & 0 deletions b/‎torch/_inductor/codecache.py‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎torch/_inductor/codegen/cpp.py‎
Lines changed: 12 additions & 4 deletions b/‎torch/_inductor/codegen/cpp.py‎
Lines changed: 12 additions & 4 deletions
diff --git a/‎torch/_inductor/codegen/cpp_prefix.h‎
Lines changed: 1 addition & 0 deletions b/‎torch/_inductor/codegen/cpp_prefix.h‎
Lines changed: 1 addition & 0 deletions
@@ -175,6 +175,7 @@ class CI(NamedTuple):
     # TIMM
     "cait_m36_384",  # Accuracy
     "pnasnet5large",  # OOM
+    "xcit_large_24_p8_224",  # OOM https://github.com/pytorch/pytorch/issues/95984
 ]
 
 CI_SKIP[CI("inductor", training=True)] = [
 
@@ -0,0 +1,23 @@
+# Build script for the AOT-Inductor C++ test: runs test.py to generate
+# aot_inductor_output.h, then builds test.cpp and links it against Torch and
+# the imported aot_inductor_output shared library.
+cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
+project(test)
+
+# Relative path to the Torch CMake package config.
+# NOTE(review): presumably this resolves to the in-tree torch/ directory of the
+# checkout -- confirm which directory the relative path is evaluated against.
+set(Torch_DIR "../../../../torch/share/cmake/Torch")
+find_package(Torch REQUIRED)
+
+# The generated header is listed as a source so that building `test` depends
+# on the custom command below that produces it.
+add_executable(test test.cpp ${CMAKE_BINARY_DIR}/aot_inductor_output.h)
+
+# Generate the header by running test.py; rerun whenever test.py changes.
+# NOTE(review): no WORKING_DIRECTORY is given; the outputs are expected in
+# ${CMAKE_BINARY_DIR} -- confirm the command runs there.
+add_custom_command(
+        OUTPUT ${CMAKE_BINARY_DIR}/aot_inductor_output.h
+        COMMAND python ${CMAKE_SOURCE_DIR}/test.py
+        DEPENDS ${CMAKE_SOURCE_DIR}/test.py
+)
+# Part of the default build (ALL) so the header is always generated.
+add_custom_target(generate_header ALL
+    DEPENDS ${CMAKE_BINARY_DIR}/aot_inductor_output.h)
+
+# The shared library is not built by CMake; it is imported from the path where
+# it is expected to appear next to the generated header.
+add_library(aot_inductor_output SHARED IMPORTED)
+set_property(TARGET aot_inductor_output PROPERTY
+             IMPORTED_LOCATION ${CMAKE_BINARY_DIR}/aot_inductor_output.so)
+
+target_link_libraries(test "${TORCH_LIBRARIES}" aot_inductor_output)
+
+set_property(TARGET test PROPERTY CXX_STANDARD 17)
@@ -0,0 +1,41 @@
+//#include <gtest/gtest.h>
+#include <iostream>
+
+#include "build/aot_inductor_output.h"
+
+/*
+class Net(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.weight = torch.ones(32, 64)
+
+    def forward(self, x):
+        x = torch.relu(x + self.weight)
+        return x
+*/
+// C++ reference model: adds a registered 32x64 all-ones parameter named
+// "weight" to the input and applies ReLU to the sum.
+struct Net : torch::nn::Module {
+  torch::Tensor weight;
+
+  Net() : weight(register_parameter("weight", torch::ones({32, 64}))) {}
+
+  torch::Tensor forward(torch::Tensor input) {
+    torch::Tensor shifted = input + weight;
+    return torch::relu(shifted);
+  }
+};
+
+// Runs the eager C++ reference model and the AOT-compiled entry point on the
+// same random 32x64 input and checks that both produce matching results.
+// Returns 0 and prints "PASS" on success; returns 1 on a mismatch.
+int main() {
+    torch::Tensor x = at::randn({32, 64});
+    Net net;
+    torch::Tensor results_ref = net.forward(x);
+
+    // TODO: we need to provide an API to concatenate args and weights
+    std::vector<torch::Tensor> inputs = {x};
+    for (const auto& pair : net.named_parameters()) {
+      inputs.push_back(pair.value());
+    }
+    torch::Tensor results_opt = aot_inductor_entry(inputs);
+
+    // Check explicitly rather than with assert(): assert() expands to nothing
+    // when NDEBUG is defined, which would let the test pass unconditionally
+    // in an optimized build.
+    if (!torch::allclose(results_ref, results_opt)) {
+        std::cerr << "FAIL: AOT-Inductor output does not match the reference\n";
+        return 1;
+    }
+    std::cout << "PASS\n";
+    return 0;
+}
@@ -0,0 +1,22 @@
+import torch
+import torch._dynamo
+import torch._inductor
+import torch._inductor.config
+
+# Fixed stem for the generated artifacts: the AOT code cache uses this prefix
+# to name the .so/.h it writes in the current working directory, and
+# CMakeLists.txt refers to aot_inductor_output.h / aot_inductor_output.so.
+torch._inductor.config.aot_codegen_output_prefix = "aot_inductor_output"
+
+
+class Net(torch.nn.Module):
+    """Toy module: add a fixed 32x64 all-ones tensor to the input, then ReLU."""
+
+    def __init__(self):
+        super().__init__()
+        self.weight = torch.ones(32, 64)
+
+    def forward(self, x):
+        shifted = x + self.weight
+        return torch.relu(shifted)
+
+
+# Export the module with dynamo on a random CPU example input, AOT-compile the
+# exported graph, and print the value returned by aot_compile.
+example_input = torch.randn((32, 64), device="cpu")
+exported_module, _ = torch._dynamo.export(Net(), example_input)
+print(torch._inductor.aot_compile(exported_module, [example_input]))
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Configure, build and run the AOT-Inductor C++ test inside ./build.
+# -e: exit on the first failing command, -u: treat unset variables as errors,
+# -x: trace each command, -o pipefail: a pipeline fails if any stage fails.
+set -euxo pipefail
+
+mkdir -p build
+cd build
+cmake ..
+make
+# Run the built executable; a non-zero exit status fails the script (set -e).
+./test
@@ -25,3 +25,24 @@ def compile(
     from .compile_fx import compile_fx
 
     return compile_fx(gm, example_inputs, config_patches=options)
+
+
+def aot_compile(
+    gm: torch.fx.GraphModule,
+    example_inputs: List[torch.Tensor],
+    options: Optional[Dict[str, Any]] = None,
+) -> str:
+    """
+    Compile an FX graph ahead of time with TorchInductor into a shared library.
+
+    Args:
+        gm: FX graph module to compile.
+        example_inputs: Example tensor inputs for the graph.
+        options: Optional config overrides.  See `torch._inductor.config`.
+
+    Returns:
+        Path to the shared library that was generated.
+    """
+    from .compile_fx import compile_fx
+
+    # With aot_mode=True the object returned by compile_fx is called with no
+    # arguments, and that call's result is what this function returns.
+    build_library = compile_fx(
+        gm, example_inputs, config_patches=options, aot_mode=True
+    )
+    return build_library()
@@ -534,6 +534,52 @@ def cpp_compile_command(
     ).strip()
 
 
+class AotCodeCache:
+    """
+    In-process cache of ahead-of-time compiled shared libraries, keyed by the
+    key that `write` returns for the generated C++ source.
+    """
+
+    cache = dict()
+    clear = staticmethod(cache.clear)
+
+    @classmethod
+    def compile(cls, source_code):
+        """
+        Compile `source_code` into a shared library and write a matching header.
+
+        The library goes to `<cwd>/<config.aot_codegen_output_prefix>.so` when
+        the prefix is set, otherwise next to the written `.cpp` file with the
+        same stem. A header with the same stem, containing a torch include
+        and `CppWrapperCodeGen.decl_str`, is written alongside it.
+
+        Args:
+            source_code: C++ source text to compile.
+
+        Returns:
+            Path of the shared library.
+
+        Raises:
+            exc.CppCompileError: if the compile command exits with an error.
+        """
+        from .codegen.wrapper import CppWrapperCodeGen
+
+        # TODO: update cpp_compile_command for different platforms
+        picked_vec_isa = pick_vec_isa()
+        key, input_path = write(
+            source_code,
+            "cpp",
+            code_hash(repr(cpp_compile_command("i", "o", vec_isa=picked_vec_isa))),
+        )
+        if key not in cls.cache:
+            from filelock import FileLock
+
+            lock_dir = get_lock_dir()
+            lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
+            with lock:
+                # Derive both artifact names from one extension-less stem.
+                # Slicing three characters off "<name>.cpp" kept the trailing
+                # dot and produced "<name>..so" / "<name>..h".
+                output_stem = (
+                    os.path.join(os.getcwd(), config.aot_codegen_output_prefix)
+                    if config.aot_codegen_output_prefix
+                    else os.path.splitext(input_path)[0]
+                )
+                output_so = f"{output_stem}.so"
+                output_header = f"{output_stem}.h"
+
+                with open(output_header, "w") as header_file:
+                    header_file.write("#include <torch/torch.h>\n\n")
+                    header_file.write(f"{CppWrapperCodeGen.decl_str};\n")
+
+                log.info("AOT-Inductor compiles code into: %s", output_so)
+                # NOTE(review): an existing file at output_so is reused without
+                # checking that it was built from this source -- confirm this
+                # is intended when a fixed output prefix is configured.
+                if not os.path.exists(output_so):
+                    cmd = cpp_compile_command(
+                        input=input_path, output=output_so, vec_isa=picked_vec_isa
+                    ).split(" ")
+                    try:
+                        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
+                    except subprocess.CalledProcessError as e:
+                        raise exc.CppCompileError(cmd, e.output) from e
+
+                cls.cache[key] = output_so
+        return cls.cache[key]
+
+
 class CppCodeCache:
     cache = dict()
     clear = staticmethod(cache.clear)
 
@@ -2040,7 +2040,12 @@ def codegen_define_and_call(self, wrapper):
         )
         if enable_kernel_profile:
             code.writelines(["#include <ATen/record_function.h>"])
-        code.writelines([cpp_prefix(), "" f'extern "C" void kernel({arg_defs})'])
+        kernel_decl_name = kernel_name if V.graph.aot_mode else "kernel"
+
+        if not V.graph.aot_mode or self.count == 1:
+            code.writeline(cpp_prefix())
+
+        code.writeline(f'extern "C" void {kernel_decl_name}({arg_defs})')
         with code.indent():
             if enable_kernel_profile:
                 graph_id = V.graph.graph_id
@@ -2055,9 +2060,12 @@ def codegen_define_and_call(self, wrapper):
             code.splice(self.loops_code)
 
         codecache_def = IndentedBuffer()
-        codecache_def.writeline("async_compile.cpp('''")
-        codecache_def.splice(code)
-        codecache_def.writeline("''')")
+        if V.graph.aot_mode:
+            codecache_def.splice(code)
+        else:
+            codecache_def.writeline("async_compile.cpp('''")
+            codecache_def.splice(code)
+            codecache_def.writeline("''')")
 
         codecache_str = codecache_def.getvalue()
         # TODO(voz): Ostensibly, we should not need this. But there are cases where C++ codegen does
 
@@ -5,6 +5,7 @@
 #include <limits>
 #include <omp.h>
 
+#include <ATen/ATen.h>
 #include <ATen/core/PhiloxRNGEngine.h>
 #if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
 #include <ATen/cpu/vec/functional.h>
Original file line number	Diff line number	Diff line change
`@@ -175,6 +175,7 @@ class CI(NamedTuple):`
`175`	`175`	`# TIMM`
`176`	`176`	`"cait_m36_384", # Accuracy`
`177`	`177`	`"pnasnet5large", # OOM`
	`178`	`+ "xcit_large_24_p8_224", # OOM https://github.com/pytorch/pytorch/issues/95984`
`178`	`179`	`]`
`179`	`180`
`180`	`181`	`CI_SKIP[CI("inductor", training=True)] = [`