Skip to content

Commit 9ae78a5

Browse files
jansel authored and pytorchmergebot committed
[halide-backend] Support manual schedules (#129321)
Currently using this for some by-hand hacking, but might need to implement our own scheduler later. Pull Request resolved: #129321 Approved by: https://github.com/shunting314 ghstack dependencies: #126417, #129025, #129026, #127506, #129036, #129320
1 parent a18eb65 commit 9ae78a5

File tree

4 files changed

+131
-41
lines changed

4 files changed

+131
-41
lines changed

test/inductor/test_halide.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,80 @@ def generate(g):
120120
fn(a, b, c)
121121
self.assertEqual(c, a + b)
122122

123+
def test_manual_schedule(self):
124+
fn = HalideCodeCache.generate_halide(
125+
HalideMeta(
126+
argtypes=[
127+
HalideInputSpec(
128+
ctype="float*",
129+
name="in_ptr0",
130+
shape=["1024L"],
131+
stride=["1L"],
132+
offset="0",
133+
),
134+
HalideInputSpec(
135+
ctype="float*",
136+
name="in_ptr1",
137+
shape=["1024L"],
138+
stride=["1L"],
139+
offset="0",
140+
),
141+
HalideInputSpec(
142+
ctype="float*",
143+
name="out_ptr0",
144+
shape=["1024L"],
145+
stride=["1L"],
146+
offset="0",
147+
),
148+
],
149+
target="host-no_runtime",
150+
scheduler=None,
151+
),
152+
textwrap.dedent(
153+
"""
154+
import halide as hl
155+
156+
@hl.generator(name="kernel")
157+
class Kernel:
158+
in_ptr0 = hl.InputBuffer(hl.Float(32), 1)
159+
in_ptr1 = hl.InputBuffer(hl.Float(32), 1)
160+
out_ptr0 = hl.OutputBuffer(hl.Float(32), 1)
161+
162+
def generate(g):
163+
in_ptr0 = g.in_ptr0
164+
in_ptr1 = g.in_ptr1
165+
out_ptr0 = g.out_ptr0
166+
xindex = hl.Var('xindex')
167+
x0 = xindex
168+
tmp0 = hl.Func()
169+
tmp0[xindex] = in_ptr0[x0]
170+
tmp1 = hl.Func()
171+
tmp1[xindex] = in_ptr1[x0]
172+
tmp2 = hl.Func()
173+
tmp2[xindex] = tmp0[xindex] + tmp1[xindex]
174+
out_ptr0[x0] = tmp2[xindex]
175+
176+
assert not g.using_autoscheduler()
177+
i = hl.Var()
178+
j = hl.Var()
179+
out_ptr0.compute_root()
180+
out_ptr0.split(xindex, i, j, 32)
181+
out_ptr0.parallel(i)
182+
out_ptr0.vectorize(j)
183+
tmp2.compute_at(out_ptr0, i)
184+
tmp2.store_at(out_ptr0, i)
185+
tmp1.compute_inline()
186+
187+
__name__ == '__main__' and hl.main()
188+
"""
189+
),
190+
)
191+
a = torch.randn(1024)
192+
b = torch.randn(1024)
193+
c = torch.randn(1024)
194+
fn(a, b, c)
195+
self.assertEqual(c, a + b)
196+
123197

124198
if test_torchinductor.HAS_CPU and HAS_HALIDE:
125199
SweepInputsCpuHalideTest = make_halide(test_torchinductor.SweepInputsCpuTest)

torch/_inductor/codecache.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2814,26 +2814,23 @@ def generate_halide_async(cls, meta: HalideMeta, source_code: str, submit_fn=Non
28142814
jobs = []
28152815
if need_compile:
28162816
write_atomic(genfile, source_code)
2817-
jobs.append(
2818-
functools.partial(
2819-
subprocess.check_call,
2820-
[
2821-
sys.executable,
2822-
genfile,
2823-
"-g",
2824-
"kernel",
2825-
"-o",
2826-
f"{dirpath}",
2827-
"-f",
2828-
"halide_kernel",
2829-
"-e",
2830-
"static_library,h,schedule,conceptual_stmt",
2831-
"-p",
2832-
cls.find_libautoschedule(meta.scheduler),
2833-
*meta.args(),
2834-
],
2835-
)
2836-
)
2817+
cmd = [
2818+
sys.executable,
2819+
genfile,
2820+
"-g",
2821+
"kernel",
2822+
"-o",
2823+
f"{dirpath}",
2824+
"-f",
2825+
"halide_kernel",
2826+
"-e",
2827+
"static_library,h,schedule",
2828+
]
2829+
if meta.scheduler:
2830+
cmd.extend(["-p", cls.find_libautoschedule(meta.scheduler)])
2831+
cmd.extend(meta.args())
2832+
jobs.append(functools.partial(subprocess.check_call, cmd))
2833+
28372834
binding_types = [
28382835
arg.bindings_type() for arg in meta.argtypes if arg.alias_of is None
28392836
]

torch/_inductor/codegen/halide.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1474,25 +1474,40 @@ def update_index(m):
14741474

14751475
code.do_unindent(2)
14761476
code.splice(
1477-
f"""
1477+
"""
14781478
if __name__ == "__main__":
14791479
hl.main()
1480-
else:
1481-
hl.load_plugin({HalideCodeCache.find_libautoschedule(meta.scheduler)!r})
1482-
target = hl.Target({meta.target!r})
1483-
autoscheduler = hl.AutoschedulerParams({meta.scheduler!r}, {meta.scheduler_flags!r})
1484-
with hl.GeneratorContext(target, autoscheduler):
1485-
gen = Kernel()
1486-
pipeline = gen._build_pipeline()
1487-
# gen.compile_to_callable() does not run the autoscheduler
1488-
pipeline.apply_autoscheduler(target, autoscheduler)
1489-
kernel = pipeline.compile_to_callable([
1490-
gen._get_input_parameter(a.name)._to_argument()
1491-
for a in gen._get_arginfos()
1492-
if a.dir == hl.ArgInfoDirection.Input
1493-
], target)
1494-
"""
1480+
""".rstrip(),
14951481
)
1482+
if meta.scheduler:
1483+
code.splice(
1484+
f"""
1485+
else:
1486+
hl.load_plugin({HalideCodeCache.find_libautoschedule(meta.scheduler)!r})
1487+
target = hl.Target({meta.target!r})
1488+
autoscheduler = hl.AutoschedulerParams({meta.scheduler!r}, {meta.scheduler_flags!r})
1489+
with hl.GeneratorContext(target, autoscheduler):
1490+
gen = Kernel()
1491+
pipeline = gen._build_pipeline()
1492+
# gen.compile_to_callable() does not run the autoscheduler
1493+
pipeline.apply_autoscheduler(target, autoscheduler)
1494+
kernel = pipeline.compile_to_callable([
1495+
gen._get_input_parameter(a.name)._to_argument()
1496+
for a in gen._get_arginfos()
1497+
if a.dir == hl.ArgInfoDirection.Input
1498+
], target)
1499+
""",
1500+
strip=True,
1501+
)
1502+
else:
1503+
code.splice(
1504+
f"""
1505+
else:
1506+
with hl.GeneratorContext(hl.Target({meta.target!r})):
1507+
kernel = Kernel().compile_to_callable()
1508+
""",
1509+
strip=True,
1510+
)
14961511
return code.getvalue()
14971512

14981513
@staticmethod

torch/_inductor/runtime/hints.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -160,15 +160,19 @@ def is_buffer(self):
160160
class HalideMeta(typing.NamedTuple):
161161
argtypes: List[HalideInputSpec]
162162
target: str
163-
scheduler: str
164-
scheduler_flags: Dict[str, Union[int, str]]
163+
scheduler: Optional[str] = None
164+
scheduler_flags: Optional[Dict[str, Union[int, str]]] = None
165165
cuda_device: Optional[int] = None
166166

167167
def args(self):
168168
"""Command line args to pass to halide generator"""
169-
args = [f"target={self.target}", f"autoscheduler={self.scheduler}"]
170-
for k, v in self.scheduler_flags.items():
171-
args.append(f"autoscheduler.{k}={v}")
169+
args = [f"target={self.target}"]
170+
if self.scheduler:
171+
args.append(f"autoscheduler={self.scheduler}")
172+
if self.scheduler_flags:
173+
assert self.scheduler
174+
for k, v in self.scheduler_flags.items():
175+
args.append(f"autoscheduler.{k}={v}")
172176
return args
173177

174178
def is_cuda(self):

0 commit comments

Comments (0)