
Commit 1b2ee4d

soulitzer authored and pytorchmergebot committed
Update functorch supported autograd.Function to allow mark_dirty (#91222)
Fixes #90225. Uses what was originally in #89860.

Pull Request resolved: #91222
Approved by: https://github.com/zou3519
1 parent ca39c5b commit 1b2ee4d
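
To ground the change, here is a minimal sketch (illustrative names, not code from this commit) of the kind of autograd.Function the commit now supports under functorch transforms: a new-style Function whose forward mutates an input in-place, declares this via ctx.mark_dirty inside setup_context(ctx, inputs, output), and is then differentiated through functorch's grad. Before this commit, the functorch path replaced ctx.mark_dirty with an NYI error (removed in the torch/_functorch/autograd_function.py diff below).

```python
import torch
from functorch import grad  # functorch transform; torch.func.grad in newer releases

class InplaceExp(torch.autograd.Function):
    # Illustrative only: mutates its input in-place, so it must call ctx.mark_dirty.
    @staticmethod
    def forward(x):
        x.exp_()              # in-place: the returned tensor is the same object as the input
        return x

    @staticmethod
    def setup_context(ctx, inputs, output):
        # New-style signature used throughout this commit: (ctx, inputs, output).
        x, = inputs
        ctx.mark_dirty(x)     # required because x was modified in-place
        ctx.save_for_backward(output)

    @staticmethod
    def backward(ctx, grad_output):
        result, = ctx.saved_tensors
        return grad_output * result   # d/dx exp(x) = exp(x)

# With this commit, differentiating the in-place Function under a functorch
# transform is expected to work (clone() keeps the original leaf unmodified):
x = torch.randn(3)
gx = grad(lambda t: InplaceExp.apply(t.clone()).sum())(x)
```
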

File tree

7 files changed: +102, -80 lines


test/functorch/test_eager_transforms.py

Lines changed: 11 additions & 11 deletions
@@ -989,7 +989,7 @@ def forward(x, y):
         return x, y
 
     @staticmethod
-    def setup_context(ctx, inputs, outputs):
+    def setup_context(ctx, inputs, output):
         ctx.set_materialize_grads(False)
 
     @staticmethod
@@ -1016,7 +1016,7 @@ def forward(x, y):
         return x * y
 
     @staticmethod
-    def setup_context(ctx, inputs, outputs):
+    def setup_context(ctx, inputs, output):
         return
 
     @staticmethod
@@ -1039,8 +1039,8 @@ def forward(input):
         return torch.tensor(input_np ** 3, device=input.device), input_np
 
     @staticmethod
-    def setup_context(ctx, inputs, outputs):
-        ctx.input_np = outputs[1]
+    def setup_context(ctx, inputs, output):
+        ctx.input_np = output[1]
         ctx.device = inputs[0].device
 
     @staticmethod
@@ -1097,7 +1097,7 @@ def forward(x):
         return x.clone()
 
     @staticmethod
-    def setup_context(ctx, inputs, outputs):
+    def setup_context(ctx, inputs, output):
         return
 
     @staticmethod
@@ -1125,8 +1125,8 @@ def forward(input):
         return torch.tensor(input_np ** 3, device=input.device), dinput
 
     @staticmethod
-    def setup_context(ctx, outputs, input):
-        ctx.save_for_backward(input, outputs[1])
+    def setup_context(ctx, inputs, output):
+        ctx.save_for_backward(inputs, output[1])
 
     @staticmethod
     def backward(ctx, grad_output, grad_saved):
@@ -1173,7 +1173,7 @@ def forward(input):
         pass
 
     @staticmethod
-    def setup_context(ctx, outputs, input):
+    def setup_context(ctx, inputs, output):
         pass
 
     @staticmethod
@@ -1199,7 +1199,7 @@ def forward(input):
         pass
 
     @staticmethod
-    def setup_context(ctx, outputs, input):
+    def setup_context(ctx, inputs, output):
         pass
 
     @staticmethod
@@ -1224,7 +1224,7 @@ def forward(input):
         pass
 
     @staticmethod
-    def setup_context(ctx, outputs, x, y):
+    def setup_context(ctx, inputs, output):
         pass
 
     @staticmethod
@@ -1249,7 +1249,7 @@ def forward(input):
         return input
 
     @staticmethod
-    def setup_context(ctx, outputs, input):
+    def setup_context(ctx, inputs, output):
         pass
 
     @staticmethod

test/functorch/test_ops.py

Lines changed: 3 additions & 11 deletions
@@ -319,7 +319,6 @@ def is_inplace(op, variant):
 
 vjp_fail = {
     xfail('tensor_split'),  # data_ptr composite compliance
-    xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
 }
 
 aliasing_ops = {
@@ -462,7 +461,7 @@ def wrapped_fn(*args, **kwargs):
     xfail('nn.functional._scaled_dot_product_attention', device_type='cuda'),
 
     xfail('nn.functional.rrelu'),  # in-place test errors out with no formula implemented
-    xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
+    xfail('NumpyExpMarkDirtyAutogradFunction'),  # TODO: https://github.com/pytorch/pytorch/issues/91280
 
     # --- Non-Contiguous Failures! ---
     # This is expected to fail as the operator
@@ -966,6 +965,7 @@ def test_vmapvjp(self, device, dtype, op):
     # skip because this is flaky depending on what the max_norm is!
     skip('nn.functional.embedding', ''),
     skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
+    xfail('NumpyExpMarkDirtyAutogradFunction'),  # vmap: inplace into a regular tensor
     # ----------------------------------------------------------------------
 
     # ---------------------------- BUGS ------------------------------------
@@ -1003,7 +1003,6 @@ def test_vmapvjp(self, device, dtype, op):
     xfail("_native_batch_norm_legit"),
 
     xfail('nn.functional.prelu'),
-    xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
     # ----------------------------------------------------------------------
 }
 
@@ -1475,6 +1474,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
 
     # Not actually a problem
     xfail('NumpyCubeNotComposableAutogradFunction'),  # not composable
+    xfail('NumpyExpMarkDirtyAutogradFunction'),  # vmap: inplace into a regular tensor
 
     # Potential bugs/errors
     xfail('as_strided'),  # AssertionError: Tensor-likes are not close!
@@ -1948,7 +1948,6 @@ def f(x):
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps('TestOperators', 'test_vmapvjpvmap', {
         xfail('NumpyCubeNotComposableAutogradFunction'),  # Not composable
-        xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
     })
     def test_vmapvjpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
@@ -1993,7 +1992,6 @@ def inner(primals, cotangents):
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps('TestOperators', 'test_vjpvmapvmap', {
         xfail('NumpyCubeNotComposableAutogradFunction'),  # Not composable
-        xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
     })
     def test_vjpvmapvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
@@ -2032,7 +2030,6 @@ def test_vjpvmapvmap(self, device, dtype, op):
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps('TestOperators', 'test_vjpvjpvmap', {
         xfail('NumpyCubeNotComposableAutogradFunction'),  # Not composable
-        xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
     })
     def test_vjpvjpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
@@ -2063,7 +2060,6 @@ def test_vjpvjpvmap(self, device, dtype, op):
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps('TestOperators', 'test_jvpvmap', {
         xfail('NumpyCubeNotComposableAutogradFunction'),  # Not composable
-        xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
     })
     def test_jvpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
@@ -2092,7 +2088,6 @@ def test_jvpvmap(self, device, dtype, op):
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps('TestOperators', 'test_jvpvmapvmap', {
         xfail('NumpyCubeNotComposableAutogradFunction'),  # Not composable
-        xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
     })
     def test_jvpvmapvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
@@ -2127,7 +2122,6 @@ def test_jvpvmapvmap(self, device, dtype, op):
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps('TestOperators', 'test_vmapjvpvmap', {
         xfail('NumpyCubeNotComposableAutogradFunction'),  # Not composable
-        xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
     })
     def test_vmapjvpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
@@ -2163,7 +2157,6 @@ def test_vmapjvpvmap(self, device, dtype, op):
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps('TestOperators', 'test_jvpjvpvmap', {
         xfail('NumpyCubeNotComposableAutogradFunction'),  # Not composable
-        xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
     })
     def test_jvpjvpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
@@ -2193,7 +2186,6 @@ def test_jvpjvpvmap(self, device, dtype, op):
     @ops(autograd_function_db, allowed_dtypes=(torch.float32,))
     @skipOps('TestOperators', 'test_jvpvjpvmap', {
         xfail('NumpyCubeNotComposableAutogradFunction'),  # Not composable
-        xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
     })
     def test_jvpvjpvmap(self, device, dtype, op):
         samples = op.sample_inputs(device, dtype, requires_grad=True)
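
The new xfail entries above are annotated "vmap: inplace into a regular tensor". As a hedged illustration of that vmap limitation (example names and shapes are mine, not from this PR): vmap refuses to perform an in-place write of a per-sample, batched value into an ordinary unbatched tensor, which is the situation the mark-dirty Function runs into under these vmap compositions.

```python
import torch
from functorch import vmap

buf = torch.zeros(3)           # ordinary tensor shared across the whole batch

def f(xi):
    buf.copy_(xi)              # in-place write of a batched value into an unbatched tensor
    return buf * 2

x = torch.randn(5, 3)
# vmap(f)(x)   # expected to raise a RuntimeError: vmap cannot perform this in-place copy_
```
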

test/test_autograd.py

Lines changed: 4 additions & 4 deletions
@@ -552,7 +552,7 @@ def forward(x):
         return x ** 2
 
     @staticmethod
-    def setup_context(ctx, inputs, outputs):
+    def setup_context(ctx, inputs, output):
         x, = inputs
         ctx.save_for_backward(x)
 
@@ -576,9 +576,9 @@ def forward(x):
         return x ** 2, two_x
 
     @staticmethod
-    def setup_context(ctx, inputs, outputs):
+    def setup_context(ctx, inputs, output):
         x, = inputs
-        _, two_x = outputs
+        _, two_x = output
         ctx.two_x = two_x
 
     @staticmethod
@@ -599,7 +599,7 @@ def forward(x, shape, scale_forward, scale_backward):
         return x.reshape(shape) * scale_forward
 
     @staticmethod
-    def setup_context(ctx, inputs, outputs):
+    def setup_context(ctx, inputs, output):
         x, shape, scale_forward, scale_backward = inputs
         ctx.scale_backward = scale_backward
         ctx.x_shape = x.shape

torch/_C/_functorch.pyi

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ def _unwrap_for_grad(tensor: Tensor, level: int) -> Tensor: ...
 def _wrap_for_grad(tensor: Tensor, level: int) -> Tensor: ...
 def _unwrap_batched(tensor: Tensor, level: int) -> Tuple[Tensor, Optional[int]]: ...
 def current_level() -> int: ...
+def _add_batch_dim(tensor: Tensor, bdim: int, level: int) -> Tensor: ...
 
 def set_autograd_function_allowed(allowed: bool) -> None: ...
 def get_autograd_function_allowed() -> bool: ...

torch/_functorch/autograd_function.py

Lines changed: 58 additions & 29 deletions
@@ -13,7 +13,9 @@
     unwrap_batched,
     vmap,
     restore_vmap,
+    _add_batch_dim,
 )
+from torch._functorch.vmap import _broadcast_to_and_flatten
 from torch.autograd.forward_ad import _set_fwd_grad_enabled
 from typing import Any, NamedTuple, Tuple
 
@@ -101,16 +103,20 @@ def forward(*operands):
         # the transform. _SingleLevelFunction will turn off both fwd and bwd
         # gradient computation and we need to turn it back on here.
         with torch.enable_grad(), _set_fwd_grad_enabled(True), interpreter.lower():
-            output = custom_function_call(autograd_function, *unwrapped_operands)
+            unwrapped_output = custom_function_call(autograd_function, *unwrapped_operands)
 
-        return pytree.tree_map_only(
-            torch.Tensor,
-            lambda x: _wrap_for_grad(x, level),
-            output)
+        # See NOTE [mark_dirty object identity check]
+        def wrap_fn(output):
+            return _wrap_for_grad(output, level)
+
+        return wrap_outputs_maintaining_identity(
+            unwrapped_output,
+            unwrapped_operands,
+            operands,
+            wrap_fn)
 
-    def setup_context(ctx, outputs, *operands):
-        ctx.mark_dirty = mark_dirty_error
-        return autograd_function.setup_context(ctx, outputs, *operands)
+    def setup_context(ctx, inputs, output):
+        return autograd_function.setup_context(ctx, inputs, output)
 
     # backward is only used if the transform is TransformType.Grad
     def backward(ctx, *grads):
@@ -139,24 +145,39 @@ def jvp(ctx, *tangents):
     )
     return Generated
 
+# NOTE [mark_dirty object identity check]
+# autograd.Function's ctx.mark_dirty expect a returned input
+# to have the same object identity as the input.
+# Mode-only functorch will greatly simplify this logic.
+def wrap_outputs_maintaining_identity(outputs, unwrapped_inputs, orig_inputs, wrap_fn, out_dims=None):
+    flat_unwrapped_inputs, _ = pytree.tree_flatten(unwrapped_inputs)
+    flat_orig_inputs, _ = pytree.tree_flatten(orig_inputs)
+
+    unwrapped_input_to_orig_input = {
+        id(unwrapped): orig
+        for unwrapped, orig in zip(flat_unwrapped_inputs, flat_orig_inputs)
+    }
+
+    flat_outputs, spec = pytree.tree_flatten(outputs)
+    result = []
+
+    if out_dims is not None:
+        flat_out_dims = _broadcast_to_and_flatten(out_dims, spec)
+
+    for i, output in enumerate(flat_outputs):
+        if not isinstance(output, torch.Tensor):
+            result.append(output)
+            continue
+        if id(output) in unwrapped_input_to_orig_input:
+            result.append(unwrapped_input_to_orig_input[id(output)])
+            continue
+        if out_dims is not None:
+            assert flat_out_dims is not None
+            result.append(wrap_fn(output, flat_out_dims[i]))
+        else:
+            result.append(wrap_fn(output))
 
-# https://github.com/pytorch/pytorch/issues/90225
-# If an input was marked as dirty, and the autograd.Function returns the input
-# from the forward, then the grad rule for custom_function_call must also
-# return the corresponding input from the forward() of the Generated autograd.Function
-#
-# We haven't figured out how to do this yet. One possibility is to rely
-# on if the return from the redispatched custom_function_call in Generated.forward
-# has the same object id as one of the inputs,
-# but https://github.com/pytorch/pytorch/issues/90209 means we cannot rely on
-# that property.
-def mark_dirty_error(*args, **kwargs):
-    raise RuntimeError(
-        'NYI: we do not yet support ctx.mark_dirty with functorch transforms. '
-        'Please try to avoid modifying inputs to the autograd.Function in-place '
-        'by using out-of-place operations or by cloning the inputs. '
-        'Please see https://github.com/pytorch/pytorch/issues/90209 for more details'
-    )
+    return pytree.tree_unflatten(result, spec)
 
 
 # NOTE: [functorch vjp and autograd interaction]
@@ -172,8 +193,8 @@ def mark_dirty_error(*args, **kwargs):
 #     return x.exp()
 #
 #   @staticmethod
-#   def setup_context(ctx, outputs, x):
-#     y = outputs
+#   def setup_context(ctx, inputs, output):
+#     y = output
 #     ctx.save_for_backward(y)
 #
 #   @staticmethod
@@ -244,12 +265,20 @@ def custom_function_call_vmap(interpreter, autograd_function, *operands):
     with interpreter.lower():
         unwrapped_output, out_dims = autograd_function.vmap(info, in_dims, *unwrapped_operands)
 
+    # See NOTE [mark_dirty object identity check]
+    def wrap_fn(output, out_dim):
+        return output if out_dim is None else _add_batch_dim(output, out_dim, current_level)
+
     # TODO: raise better error message to the user when they don't follow the API.
     # Should probably mimic the logic of _process_batched_inputs,
    # but that one is hyperspecialized on error messages.
     # https://github.com/pytorch/pytorch/issues/90224
-    output = wrap_batched(unwrapped_output, out_dims, current_level)
-    return output
+    return wrap_outputs_maintaining_identity(
+        unwrapped_output,
+        unwrapped_operands,
+        operands,
+        wrap_fn,
+        out_dims=out_dims)
 
 
 def custom_function_call_vmap_generate_rule(interpreter, autograd_function, *operands):
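
To make the identity-preserving behavior of wrap_outputs_maintaining_identity concrete, here is a minimal sketch that exercises the helper directly (internal API; toy tensors stand in for the wrapped/unwrapped values functorch would actually pass, and it assumes a PyTorch build that contains this change). Outputs that are literally one of the unwrapped inputs are mapped back to the corresponding original input instead of being re-wrapped, which is what lets ctx.mark_dirty's object-identity requirement survive the unwrap/re-wrap round trip.

```python
import torch
from torch._functorch.autograd_function import wrap_outputs_maintaining_identity

orig_a = torch.randn(3)        # stands in for the original (wrapped) input
unwrapped_a = orig_a.detach()  # stands in for its unwrapped value

def wrap_fn(t):
    return t * 1.0             # toy "re-wrap": just produces a fresh tensor

# First output aliases the unwrapped input, second is a genuinely new tensor.
outputs = (unwrapped_a, unwrapped_a + 1)
wrapped = wrap_outputs_maintaining_identity(outputs, (unwrapped_a,), (orig_a,), wrap_fn)

assert wrapped[0] is orig_a        # identity preserved: mapped back to the original input
assert wrapped[1] is not orig_a    # fresh output went through wrap_fn
```
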

0 commit comments
