use 4 warps for small block config in mm (#95383)

ngimel · web-flow · commit a90b4f09ace6 · 2023-02-24T09:12:36.000-05:00
* use 4 warps for small block config in mm

* Update test/inductor/test_select_algorithm.py

* Update test/inductor/test_select_algorithm.py
diff --git a/test/inductor/test_select_algorithm.py b/test/inductor/test_select_algorithm.py
@@ -62,11 +62,30 @@ def test_addmm(self):
         def foo(input, weight, bias):
             return torch.addmm(bias, input, weight)
 
-        foo(
+        inps = (
             torch.randn(20, 33, device="cuda"),
             torch.randn(33, 16, device="cuda"),
             torch.randn(20, 16, device="cuda"),
         )
+
+        foo(*inps)
+        # Autotuning checks correctness of each version
+        self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
+
+    @patch.object(select_algorithm, "VERIFY", dict(atol=5e-2, rtol=5e-2))
+    @patches
+    def test_addmm_fp16(self):
+        @torch.compile
+        def foo(input, weight, bias):
+            return torch.addmm(bias, input, weight)
+
+        inps = (
+            torch.randn(2, 320, device="cuda", dtype=torch.half),
+            torch.randn(320, 320, device="cuda", dtype=torch.half).t(),
+            torch.empty(320, device="cuda", dtype=torch.half),
+        )
+
+        foo(*inps)
         # Autotuning checks correctness of each version
         self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)
 
diff --git a/torch/_inductor/kernel/mm_common.py b/torch/_inductor/kernel/mm_common.py
@@ -45,7 +45,7 @@ def mm_configs():
             {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64}, num_stages=3, num_warps=8
         ),
         triton.Config(
-            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 128}, num_stages=2, num_warps=8
+            {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 128}, num_stages=2, num_warps=4
         ),
         triton.Config(
             {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16}, num_stages=2, num_warps=4