
Commit 2180f34

[SDPA] Fix bug in parsing scaled_dot_product_attention arguments (#95311) (#95397)
Fixes #95266
Pull Request resolved: #95311
Approved by: https://github.com/cpuhrsch
1 parent a90b4f0 commit 2180f34
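
For context: before this change, Dynamo's special-case handling of scaled_dot_product_attention read query, key, and value from fixed positional slots (args[0] through args[2], as visible in the removed lines of torch/_dynamo/variables/torch.py below) and only consulted kwargs or args[4] for dropout_p, so a call that passes these arguments by keyword could be mis-parsed or fail during tracing. A minimal sketch of such a keyword-style call, assuming a CUDA device with fused SDPA support and mirroring the shapes used in the new test below:

import torch
import torch.nn.functional as F

def fn(query, key, value):
    # All arguments are passed by keyword, so the traced call sees an
    # empty positional args tuple.
    return F.scaled_dot_product_attention(
        query=query, key=key, value=value, attn_mask=None, dropout_p=0, is_causal=True
    )

# (batch, heads, seq_len, head_dim), matching the shapes in the new test
q = k = v = torch.ones(1, 8, 1, 8, device="cuda", dtype=torch.float16)
opt_fn = torch._dynamo.optimize("inductor")(fn)
opt_fn(q, k, v)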

3 files changed: +82 -10 lines changed

test/dynamo/test_dynamic_shapes.py

Lines changed: 5 additions & 0 deletions
@@ -60,6 +60,11 @@ def make_dynamic_cls(cls):
     # Cannot call sizes() on tensor with symbolic sizes/strides
 )
 
+unittest.expectedFailure(
+    DynamicShapesMiscTests.test_parsing_sdpa_dynamic_shapes
+    # Cannot call sizes() on tensor with symbolic sizes/strides
+)
+
 
 # DynamicShapesSubGraphTests
 unittest.expectedFailure(

test/dynamo/test_misc.py

Lines changed: 47 additions & 0 deletions
@@ -3145,6 +3145,53 @@ def forward(self, query, key, value):
         self.assertEqual(compiled.device.index, 0)
         self.assertEqual(compiled.dtype, torch.float16)
 
+    @unittest.skipIf(
+        not PLATFORM_SUPPORTS_FUSED_SDPA or not SM80OrLater,
+        "Can't run fused SDPA on this platform",
+    )
+    def test_parsing_sdpa(self):
+        class MyModule(torch.nn.Module):
+            def forward(self, query, key, value):
+                out = F.scaled_dot_product_attention(query, key, value, None, 0, True)
+                out = F.scaled_dot_product_attention(
+                    query=query,
+                    key=key,
+                    value=value,
+                    attn_mask=None,
+                    dropout_p=0,
+                    is_causal=True,
+                )
+                out = F.scaled_dot_product_attention(
+                    query,
+                    key=key,
+                    value=value,
+                    attn_mask=None,
+                    dropout_p=0,
+                    is_causal=True,
+                )
+                out = F.scaled_dot_product_attention(
+                    query, key, value, None, dropout_p=0, is_causal=True
+                )
+                return out
+
+        device = "cuda"
+        dtype = torch.float16
+        seq_len_q = 1
+        seq_len_k = 1
+        head_dim = 8
+        query = torch.ones(
+            1, 8, seq_len_q, head_dim, device=device, dtype=dtype, requires_grad=True
+        )
+        key = torch.ones(
+            1, 8, seq_len_k, head_dim, device=device, dtype=dtype, requires_grad=True
+        )
+        value = torch.ones(
+            1, 8, seq_len_k, head_dim, device=device, dtype=dtype, requires_grad=True
+        )
+        module = MyModule()
+        opt_mod = torch._dynamo.optimize("inductor")(module)
+        opt_mod(query, key, value)
+
     def test_autocast_cpu(self):
         class MyModule(torch.nn.Module):
             def forward(self, x):
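
To exercise the new test in isolation, an invocation along these lines should work (assuming a CUDA build with fused SDPA support; the -k filter is plain pytest behaviour, not anything added by this PR):

pytest test/dynamo/test_misc.py -k test_parsing_sdpa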

torch/_dynamo/variables/torch.py

Lines changed: 30 additions & 10 deletions
@@ -481,9 +481,34 @@ def get_state_from_generator():
         if self.value == torch._C._nn.scaled_dot_product_attention:
             # See:[Note] SDPA_flash's meta function returns incorrect Philox seed and offset
             # in pytorch/torch/_meta_registrations.py
-            fake_query = args[0].as_proxy().node.meta["example_value"]
-            fake_key = args[1].as_proxy().node.meta["example_value"]
-            fake_value = args[2].as_proxy().node.meta["example_value"]
+            all_kwargs = kwargs.copy()
+            all_kwargs.update(
+                dict(
+                    zip(
+                        (
+                            "query",
+                            "key",
+                            "value",
+                            "attn_mask",
+                            "dropout_p",
+                            "is_causal",
+                        ),
+                        args,
+                    )
+                )
+            )
+            fake_query = all_kwargs["query"].as_proxy().node.meta["example_value"]
+            fake_key = all_kwargs["key"].as_proxy().node.meta["example_value"]
+            fake_value = all_kwargs["value"].as_proxy().node.meta["example_value"]
+            fake_mask = all_kwargs.get("attn_mask")
+            if isinstance(fake_mask, TensorVariable):
+                fake_mask = fake_mask.as_proxy().node.meta["example_value"]
+            else:
+                fake_mask = None
+            dropout_p = kwargs.get("dropout_p")
+            dropout_p = dropout_p.value if dropout_p is not None else 0.0
+            is_causal = kwargs.get("is_causal")
+            is_causal = is_causal.value if is_causal is not None else False
             # We look through the stack to find a cuda autocast context
             # If we do we will convert the fake tensors to torch.float16
             is_cuda_autocast_context = False
@@ -502,15 +527,10 @@ def get_state_from_generator():
                 fake_value = fake_value.clone().to(amp_dtype)
 
             backend_choice = torch._fused_sdp_choice(
-                fake_query, fake_key, fake_value
+                fake_query, fake_key, fake_value, fake_mask, dropout_p, is_causal
             )
             if backend_choice == torch.backends.cuda.SDPBackend.FLASH_ATTENTION:
-                dropout_p = kwargs.get("dropout_p")
-                # Lets see if they passed it in as not an arg
-                if len(args) >= 5:
-                    dropout_p = args[4]
-
-                if dropout_p is not None and dropout_p.value != 0.0:
+                if dropout_p is not None and dropout_p != 0.0:
                     unimplemented(
                         "FlashAttention with dropout is not supported in cuda graphs"
                     )
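
The core of the change is the positional-to-keyword normalization at the top of the SDPA branch: positional arguments are zipped against the parameter names of scaled_dot_product_attention and merged with the caller-supplied kwargs, so everything downstream can be looked up by name no matter how the call was spelled. A self-contained sketch of that pattern with plain Python values instead of Dynamo VariableTrackers (the function and constant names here are illustrative, not part of the PR):

# Illustrative only: the same zip-and-merge idea, outside of Dynamo.
SDPA_PARAM_NAMES = ("query", "key", "value", "attn_mask", "dropout_p", "is_causal")

def normalize_sdpa_call(args, kwargs):
    # Map whatever arrived positionally onto its parameter name, then fold in
    # the explicit keyword arguments (Python forbids passing the same
    # parameter both ways, so the two dicts never conflict).
    all_kwargs = dict(kwargs)
    all_kwargs.update(dict(zip(SDPA_PARAM_NAMES, args)))
    return all_kwargs

# These spellings, mirroring the calls in the new test, normalize identically:
a = normalize_sdpa_call(("q", "k", "v", None, 0.0, True), {})
b = normalize_sdpa_call(
    ("q",),
    {"key": "k", "value": "v", "attn_mask": None, "dropout_p": 0.0, "is_causal": True},
)
c = normalize_sdpa_call(
    (),
    {"query": "q", "key": "k", "value": "v", "attn_mask": None,
     "dropout_p": 0.0, "is_causal": True},
)
assert a == b == c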
