Commit ad637a4

ani300 authored and pytorchmergebot committed

Add support for index_put_ in NT (#135722)

Pull Request resolved: #135722
Approved by: https://github.com/jbschlosser

1 parent f14f245 · commit ad637a4

3 files changed (+221, -1 lines)
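
For context, here is a minimal sketch of the behavior this commit adds, adapted from the new test and sample inputs below (run on CPU for illustration, and assuming a build that includes this change):

import torch

# Build a jagged-layout NestedTensor (NJT) over a flat (7, 3) values buffer.
offsets = torch.tensor([0, 2, 5, 7])
lengths = torch.tensor([2, 2, 2])
nt = torch.nested.nested_tensor_from_jagged(torch.zeros(7, 3), offsets, lengths)

# Advanced-indexing assignment now routes to the jagged index_put_ kernel:
# component 0 gets position (0, 0) set, component 1 gets (1, 0), component 2 gets (1, 0).
indices = [
    torch.tensor([0, 1, 2]),  # batch dimension
    torch.tensor([0, 1, 1]),  # ragged dimension (must stay within each component's length)
    torch.tensor([0, 0, 0]),  # trailing dense dimension
]
nt[indices] = 1.0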

test/test_nestedtensor.py

Lines changed: 64 additions & 0 deletions
@@ -6194,6 +6194,38 @@ def test_copy_(self, device):
         ):
             a.copy_(b)
 
+    # This can't happen in the opinfo tests due to subprocess creation
+    @unittest.skipIf(
+        TEST_WITH_ROCM,
+        "In ROCm, kernel asserts are disabled due to performance overhead",
+    )
+    def test_index_put_error(self, device):
+        import subprocess
+
+        with self.subTest():
+            r = subprocess.call(
+                [
+                    sys.executable,
+                    "-c",
+                    """\
+import torch
+offsets = torch.tensor([0, 2, 5, 7], device='cuda')
+lengths = torch.tensor([2, 2, 2], device='cuda')
+indices = [
+    torch.tensor([0, 1, 2], device='cuda'),
+    torch.tensor([0, 2, 1], device='cuda'),
+    torch.tensor([0, 0, 0], device='cuda'),
+]
+a = torch.nested.nested_tensor_from_jagged(
+    torch.zeros(7, 3, device='cuda'), offsets, lengths
+)
+a[indices] = 1.0
+torch.cuda.synchronize()
+""",
+                ]
+            )
+            self.assertTrue(r != 0)
+
     @skipIfTorchDynamo("Dynamo doesn't know how to trace prof.events()")
     def test_profiler_sequence_nr(self):
         with torch.profiler.profile() as prof:

@@ -7915,6 +7947,12 @@ def test_forward(self, device, dtype, op):
             out_ref = op.ref(op, sample)
             self.assertEqualIgnoringNestedInts(out, out_ref)
 
+            # TODO: Revisit once https://github.com/pytorch/pytorch/pull/138369 lands
+            # TODO: Add xfails for other inplace ops instead of hardcoding
+            if op.inplace_variant and "index_put" in op.full_name:
+                op.inplace_variant(sample.input, *sample.args, **sample.kwargs)
+                self.assertEqualIgnoringNestedInts(sample.input, out_ref)
+
     @withXFails(BACKWARD_FAILURES)
     @ops(
         [op for op in njt_op_db if op.supports_njt and op.supports_autograd],

@@ -7970,6 +8008,32 @@ def f(*args, **kwargs):
             else:
                 self.assertEqual(out_compile, out_ref)
 
+            # TODO: Revisit once https://github.com/pytorch/pytorch/pull/138369 lands
+            # TODO: Add xfails for other inplace ops instead of hardcoding
+            if op.inplace_variant and "index_put" in op.full_name:
+                op_fn = op.inplace_variant
+
+                def in_f(*args, **kwargs):
+                    return op_fn(*args, **kwargs)
+
+                compiled_in_f = torch.compile(
+                    in_f, fullgraph=True, backend="aot_eager_decomp_partition"
+                )
+
+                if sample.input.is_contiguous():
+                    compiled_in_f(sample.input, *sample.args, **sample.kwargs)
+                    if op.full_name in COMPARE_TENSOR_COMPONENT_EQUALITY:
+                        self.assertEqualIgnoringNestedInts(sample.input, out_ref)
+                    else:
+                        self.assertEqual(sample.input, out_ref)
+                else:
+                    # see https://github.com/pytorch/pytorch/issues/106456
+                    with self.assertRaisesRegex(
+                        RuntimeError,
+                        "Mutations on non-contiguous inputs are currently not allowed on tensor subclasses",
+                    ):
+                        compiled_in_f(sample.input, *sample.args, **sample.kwargs)
+
     @withXFails(COMPILE_BACKWARD_FAILURES)
     @ops(
         [op for op in njt_op_db if op.supports_njt and op.supports_autograd],

torch/nested/_internal/ops.py

Lines changed: 93 additions & 0 deletions
@@ -1558,6 +1558,99 @@ def slice_tensor(func, *args, **kwargs):
     return NestedTensor(func(inp._values, **new_kwargs), **extract_kwargs(inp))
 
 
+@register_jagged_func(
+    torch.ops.aten.index_put.default,
+    "input: jt_all, indices: any, values: t, accumulate: any?",
+)
+@register_jagged_func(
+    torch.ops.aten.index_put_.default,
+    "input: jt_all, indices: any, values: t, accumulate: any?",
+)
+def index_put_(func, *args, **kwargs):
+    _, new_kwargs = normalize_function(  # type: ignore[misc]
+        func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True
+    )
+
+    inp: NestedTensor = new_kwargs.pop("input")
+
+    # For index_put_ to work, we add together the indices of the ragged dimension
+    # and the batch dimension, adding the offsets of each ragged dimension to its
+    # indices
+
+    indices = new_kwargs.pop("indices")
+
+    assert len(indices) <= inp.dim()
+
+    if len(indices) < inp._ragged_idx + 1:
+        if not inp.is_contiguous():
+            raise RuntimeError(
+                "index_put(): If ragged dimension is not part of indices, this only works on contiguous NJTs"
+            )
+        # Ragged dim is NOT part of indices, we need to pad the nested tensor to apply func
+        from .nested_tensor import nested_from_padded
+
+        min_seqlen = inp._maybe_min_seqlen
+        max_seqlen = inp._maybe_max_seqlen
+        padded_max_S = max_seqlen
+        total_L = inp._values.shape[inp._ragged_idx - 1]
+        if padded_max_S is None:
+            # use upper bound on max seqlen if it's not present
+            padded_max_S = total_L
+
+        padded_shape = (
+            *inp.shape[: inp._ragged_idx],
+            padded_max_S,
+            *inp.shape[inp._ragged_idx + 1 :],
+        )
+        padded_inp = inp.to_padded_tensor(0.0, output_size=padded_shape)
+        new_njt = nested_from_padded(
+            func(padded_inp, indices, **new_kwargs),
+            offsets=inp._offsets,
+            ragged_idx=inp._ragged_idx,
+            sum_S=total_L,
+            min_seqlen=min_seqlen,
+            max_seqlen=max_seqlen,
+        )
+
+        if func == torch.ops.aten.index_put_.default:
+            inp._values.copy_(new_njt.values())
+            return inp
+        return new_njt
+
+    # We can run on the underlying values directly
+
+    # Validate indices
+    if inp.lengths() is None:
+        lengths = inp.offsets().diff()
+    else:
+        lengths = inp.lengths()
+    torch._assert_async(
+        torch.all(indices[inp._ragged_idx] < lengths),
+        "Some indices in the ragged dimension are out of bounds!",
+    )
+
+    # Recompute indices for _values
+    ragged_indices = inp.offsets()[indices[0]] + indices[inp._ragged_idx]
+    func_indices = (
+        # before ragged dim
+        indices[1 : inp._ragged_idx]
+        # ragged dim (combined with batch)
+        + [ragged_indices]
+        # after ragged dim
+        + indices[inp._ragged_idx + 1 :]
+    )
+
+    if func == torch.ops.aten.index_put_.default:
+        inp._values = func(inp._values, func_indices, **new_kwargs)
+        return inp
+
+    return NestedTensor(
+        func(inp._values, func_indices, **new_kwargs),
+        **extract_kwargs(inp),
+        lengths=inp.lengths(),
+    )
+
+
 @register_jagged_func(
     torch.ops.aten.convolution.default,
     "input: jt, weight: t, bias: t?, stride: any, padding: any, "

torch/testing/_internal/opinfo/definitions/nested.py

Lines changed: 64 additions & 1 deletion
@@ -106,6 +106,29 @@ def _slice_input(t, i=i, inp=nt_inp):
         args = tree_map(_slice_input, sample.args)
         kwargs = tree_map(_slice_input, sample.kwargs)
 
+        # Handle indices in index_put
+        if "index_put" in op.full_name and "indices" in kwargs:
+            if len(kwargs["indices"]) > 1:
+                # If after unrolling we still have indices left, use them
+                kwargs["indices"] = [t[i] for t in kwargs["indices"][1:]]
+            else:
+                # If no indices are left, create them so they match the NJT implementation
+                sequence_put = kwargs["indices"][0].tolist()
+                if i in sequence_put:
+                    kwargs["indices"] = [
+                        torch.tensor(
+                            list(range(inp.shape[0])),
+                            dtype=torch.int32,
+                            device=kwargs["indices"][0].device,
+                        )
+                    ]
+                else:
+                    kwargs["indices"] = [
+                        torch.tensor(
+                            [], dtype=torch.int32, device=kwargs["indices"][0].device
+                        )
+                    ]
+
         from torch._prims_common import canonicalize_dims
 
         # Need to adjust dim to apply on NJT component

@@ -115,7 +138,6 @@ def _slice_input(t, i=i, inp=nt_inp):
 
         # TODO: handle this
         assert "dims" not in kwargs
-
         out_ref_component = op.op(inp, *args, **kwargs)
 
         # TODO: handle list / tuple / non-NJT outputs

@@ -449,6 +471,46 @@ def sample_inputs_nn_functional_embedding(
     )
 
 
+def sample_inputs_index_put(
+    op_info, device, dtype, requires_grad, op_kwargs=None, **kwargs
+):
+    for njt in _sample_njts(
+        device=device, dtype=dtype, requires_grad=requires_grad, dims=[2, 3, 4]
+    ):
+        for dim in range(njt.dim()):
+            indices = [
+                torch.tensor(list(range(njt.size(0))), device=njt.device),
+                *[
+                    torch.tensor([0] * njt.size(0), device=njt.device)
+                    for _ in range(dim - 1)
+                ],
+            ]
+            yield SampleInput(
+                njt.clone().detach(),
+                kwargs={
+                    "indices": indices,
+                    "values": torch.tensor(1.0, device=njt.device),
+                },
+            )
+
+    # Non-cont NJT for completeness
+    offsets = torch.tensor([0, 2, 5, 7], device=device)
+    lengths = torch.tensor([2, 2, 2], device=device)
+    indices = [
+        torch.tensor([0, 1, 2], device=device),
+        torch.tensor([0, 1, 1], device=device),
+        torch.tensor([0, 0, 0], device=device),
+    ]
+    a = torch.nested.nested_tensor_from_jagged(
+        torch.zeros(7, 3, device=device), offsets, lengths
+    )
+
+    yield SampleInput(
+        a.clone().detach(),
+        kwargs={"indices": indices, "values": torch.tensor(1.0, device=a.device)},
+    )
+
+
 def sample_inputs_nn_functional_embedding_bag(
     op_info, device, dtype, requires_grad, **kwargs
 ):

@@ -591,6 +653,7 @@ def sample_inputs_nn_functional_rms_norm(
     "to": sample_inputs_to,
     "matmul": sample_inputs_matmul,
     "masked_select": sample_inputs_masked_select,
+    "index_put": sample_inputs_index_put,
 }
 
 njt_references = {
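
As a rough illustration (hypothetical values, not from the patch) of how the reference path above maps whole-NJT indices to per-component indices when unbinding, the batch index tensor is dropped and entry i of each remaining index tensor is kept:

import torch

indices = [
    torch.tensor([0, 1, 2]),  # batch dimension
    torch.tensor([0, 1, 1]),  # ragged dimension
    torch.tensor([0, 0, 0]),  # trailing dense dimension
]

# Mirrors `[t[i] for t in kwargs["indices"][1:]]` for each unbound component i.
for i in range(3):
    print(i, [t[i] for t in indices[1:]])
# 0 [tensor(0), tensor(0)]  -> component 0: element (0, 0) gets the value
# 1 [tensor(1), tensor(0)]  -> component 1: element (1, 0)
# 2 [tensor(1), tensor(0)]  -> component 2: element (1, 0)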
