
Commit ae6bb58

Revert "[cutlass backend] Forward fix for less aligned gemm shapes (#148521)"
This reverts commit ad49cfc. Reverted #148521 on behalf of https://github.com/davidberard98 due to broke lint: [GH job link](https://github.com/pytorch/pytorch/actions/runs/13690720601/job/38283359447) [HUD commit link](https://hud.pytorch.org/pytorch/pytorch/commit/ad49cfc9f0a8a4d8881b3734edd8c33a087c8b97) ([comment](#148521 (comment)))
1 parent 4dc956a commit ae6bb58

2 files changed (+42, -102 lines)

test/inductor/test_cutlass_backend.py (0 additions, 75 deletions)
```diff
@@ -9,7 +9,6 @@
 from pathlib import Path
 from typing import Callable, Optional
 
-from torch._inductor.exc import InductorError
 from torch._inductor.utils import clear_inductor_caches
 from torch.export import Dim
 from torch.testing._internal.logging_utils import log_settings
@@ -962,80 +961,6 @@ def select_no_algorithm(*args, **kwargs):
                 cuda_template_count += 1
             assert cuda_template_count > 0, "No CUDATemplateCaller choices"
 
-    @unittest.skipIf(not SM90OrLater, "need sm_90")
-    @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
-    def test_cutlass_backend_shape_coverage_mm(
-        self,
-    ):
-        """
-        Checks if cutlass backend produces some ops for a variety of shapes.
-
-        This test doesn't compile and check the correctness of the ops.
-
-        NOTE: K has to be even.
-        """
-
-        inputs = [
-            (torch.randn(128, 500).cuda().half(), torch.randn(500, 576).cuda().half()),
-            (
-                torch.randn(500, 128).cuda().half(),
-                torch.randn(128, 576).cuda().half(),
-            ),
-            (torch.randn(128, 250).cuda().half(), torch.randn(250, 576).cuda().half()),
-            (
-                torch.randn(250, 128).cuda().half(),
-                torch.randn(128, 576).cuda().half(),
-            ),
-            (
-                torch.randn(125, 128).cuda().half(),
-                torch.randn(128, 576).cuda().half(),
-            ),
-        ]
-
-        def select_no_algorithm(*args, **kwargs):
-            raise NoValidChoicesError
-
-        with fresh_inductor_cache(), config.patch(
-            {
-                "max_autotune": True,
-                "max_autotune_gemm_backends": "CUTLASS",
-                "cuda.cutlass_max_profiling_configs": 2,
-                "autotune_fallback_to_aten": False,
-            }
-        ), mock.patch(
-            "torch._inductor.kernel.mm.autotune_select_algorithm",
-            wraps=select_no_algorithm,
-        ) as sa:
-            for input in inputs:
-                A, B = input
-                M, K = A.shape
-                _, N = B.shape
-
-                with self.assertRaises(InductorError, r".*NoValidChoicesError.*"):
-                    torch.compile(torch.mm, dynamic=False)(*input)
-
-                self.assertTrue(
-                    sa.called,
-                    f"autotune_select_algorithm was not called with shape M={M}, N={N}, K={K}",
-                )
-                args, _ = sa.call_args
-                op_name, choices, _, __ = args
-                assert op_name == "mm"
-                cuda_template_count = 0
-                for choice in choices:
-                    if isinstance(choice, CUDATemplateCaller):
-                        choice_info = choice.info_dict()
-                        op_conf_name = choice_info.get("op_conf_name", "")
-                        assert isinstance(op_conf_name, str)
-                        cuda_template_count += 1
-
-                self.assertGreater(
-                    cuda_template_count,
-                    0,
-                    "No CUDATemplateCaller choices found for matmul with shape "
-                    f"M={M}, N={N}, K={K}",
-                )
-
     @unittest.skipIf(not SM80OrLater, "need sm_80")
     @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
     def test_get_max_alignment(self):
```
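For context, the deleted test forced Inductor to autotune over CUTLASS GEMM templates only and intercepted `autotune_select_algorithm` to count `CUDATemplateCaller` choices for each shape. A minimal sketch of the same configuration, using only config keys that appear in the diff above (assumes a CUDA build with the CUTLASS backend available and, per the test's skip condition, an sm_90 GPU):

```python
# Sketch only: reproduces the config the deleted test used, without the
# mock-based choice counting. All config keys come from the diff above.
import torch
from torch._inductor import config

a = torch.randn(125, 128, device="cuda", dtype=torch.half)  # a "less aligned" shape
b = torch.randn(128, 576, device="cuda", dtype=torch.half)

with config.patch(
    {
        "max_autotune": True,
        "max_autotune_gemm_backends": "CUTLASS",  # CUTLASS only, no ATen/Triton
        "cuda.cutlass_max_profiling_configs": 2,
    }
):
    compiled_mm = torch.compile(torch.mm, dynamic=False)
    out = compiled_mm(a, b)  # autotunes over CUTLASS gemm templates only
```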

torch/_inductor/codegen/cuda/gemm_template.py (42 additions, 27 deletions)
```diff
@@ -841,16 +841,6 @@ def filter_op(
         # Set epilogue.
         # TODO: update epilogue functor according to epilogues.
         op.element_epilogue = op.accumulator_type()
-
-        # Set bias layout and alignment.
-        status = self._set_bias_layout_and_alignment(op)
-        if not status:
-            log.debug(
-                "Skipping due to bias layout and alignment setting failure. op: %s", op
-            )
-            return None
-
-        # Apply regex filters at the end when configuration name doesn't change anymore
         if inductor_cuda_config.cutlass_op_allowlist_regex is not None:
             if not re.search(
                 inductor_cuda_config.cutlass_op_allowlist_regex, op.configuration_name()
@@ -862,6 +852,14 @@ def filter_op(
             ):
                 return None
 
+        # Set bias layout and alignment.
+        status = self._set_bias_layout_and_alignment(op)
+        if not status:
+            log.debug(
+                "Skipping due to bias layout and alignment setting failure. op: %s", op
+            )
+            return None
+
         return op
 
     def gen_ops(self) -> "list[tuple[str, cutlass_gemm_op.GemmOperation]]":  # type: ignore[name-defined]  # noqa: F821
```
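These two hunks restore the original ordering inside `filter_op`: after the revert, the regex allowlist/denylist filters run first and the bias layout/alignment check runs afterwards, undoing the forward fix's "regex filters at the end" reordering. A standalone sketch of the restored control flow; the function and parameter names are illustrative stand-ins, not the real `CUTLASSGemmTemplate` API:

```python
# Illustrative sketch of the post-revert filtering order. `set_bias_layout`
# stands in for CUTLASSGemmTemplate._set_bias_layout_and_alignment.
import re
from typing import Callable, Optional

def filter_op_sketch(
    config_name: str,
    allowlist_regex: Optional[str],
    denylist_regex: Optional[str],
    set_bias_layout: Callable[[str], bool],
) -> Optional[str]:
    # 1. Regex filters on the op's configuration name run first.
    if allowlist_regex is not None and not re.search(allowlist_regex, config_name):
        return None
    if denylist_regex is not None and re.search(denylist_regex, config_name):
        return None
    # 2. Then try to set bias layout/alignment; drop the op on failure.
    if not set_bias_layout(config_name):
        return None
    return config_name

# Example: an op that passes the regex filters but fails alignment is dropped.
assert filter_op_sketch("gemm_f16_128x128", None, None, lambda _: False) is None
```

The diff continues in the same file with `_set_bias_layout_and_alignment` and the re-added `_dtype_match` override: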
```diff
@@ -1214,29 +1212,46 @@ def _set_bias_layout_and_alignment(
         self,
         op: "cutlass_library.gemm_op.GemmOperation",  # type: ignore[name-defined]  # noqa: F821
     ) -> bool:
-        import cutlass_library.library as cutlass_lib
-
         has_bias = len(self.input_nodes) >= 3 and self.input_nodes[2] is not None
         if has_bias:
-            Bias = self.input_nodes[2]
-            # bias dtype
-            op.C.element = cutlass_utils.torch_dtype_to_cutlass_type(
-                Bias.get_layout().dtype
-            )
-            assert op.C.element == op.D.element, (
-                f"Expect C and D to have the same dtype, found {op.C.element} and {op.D.element}"
-            )
-
-            # Bias layout
-            bias_layout = CUTLASSGemmTemplate.cutlass_layout(Bias.get_layout())
+            bias = self.input_nodes[2]
+            bias_layout = CUTLASSGemmTemplate.cutlass_layout(bias.get_layout())
             op.C.layout = bias_layout
-
-            # Bias alignment
-            status = self.set_alignment(Bias.get_layout(), op.C)
+            status = self.set_alignment(bias.get_layout(), op.C)
             if not status:
                 return False
+        return True
+
+    def _dtype_match(
+        self,
+        op: "cutlass_library.gemm_op.GemmOperation",  # type: ignore[name-defined]  # noqa: F821
+    ) -> bool:
+        """
+        Checking dtypes of C (i.e. bias) here, since that is the one not checked in the base class.
+        """
+
+        if not super()._dtype_match(op):
+            return False
+
+        assert cutlass_utils.try_import_cutlass()
+        from cutlass_library.library import DataType  # type: ignore[import]
+
+        has_bias = len(self.input_nodes) >= 3 and self.input_nodes[2] is not None
+
+        if op.C.element == DataType.void:
+            if has_bias:
+                # op expects no bias, but bias exists
+                return False
         else:
-            op.C.element = cutlass_lib.DataType.void
+            # op expects bias. Needs to check if bias exists and is of the right dtype
+            if not (
+                has_bias
+                and cutlass_utils.dtype_match(
+                    self.input_nodes[2].get_dtype(), op.C.element
+                )
+            ):
+                return False
+
         return True
 
     def _define_gemm_instance(
```
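The re-added `_dtype_match` override encodes a simple matching rule for the C operand (the bias): `DataType.void` means the op takes no bias, so it must only match bias-free calls; any other C element type requires a bias whose dtype matches exactly. A self-contained sketch of that rule, with a stand-in enum instead of the real `cutlass_library` types:

```python
# Hedged sketch of the restored C-operand (bias) matching rule. DataType here
# is a stand-in for cutlass_library.library.DataType.
from enum import Enum
from typing import Optional

class DataType(Enum):
    void = 0  # op takes no bias
    f16 = 1
    f32 = 2

def bias_dtype_matches(op_c_element: DataType, bias_dtype: Optional[DataType]) -> bool:
    if op_c_element is DataType.void:
        # Op expects no bias: reject if the graph supplies one.
        return bias_dtype is None
    # Op expects a bias: it must exist and match C's element type exactly.
    return bias_dtype == op_c_element

assert bias_dtype_matches(DataType.void, None)               # no-bias op, no bias: ok
assert not bias_dtype_matches(DataType.void, DataType.f16)   # no-bias op, bias given: reject
assert bias_dtype_matches(DataType.f16, DataType.f16)        # dtype agrees: ok
assert not bias_dtype_matches(DataType.f16, DataType.f32)    # dtype mismatch: reject
```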
