Update on "Add prepend argument to nn.Module hooks"

mrshenli · mrshenli · commit 510d0aaba47b · 2022-10-25T15:00:39.000Z
cc ezyang gchanan

[ghstack-poisoned]
diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-0d7807d59520289b2065b4db4a138b7fba2f61fd
+9c112935abe400222cca8f9fbc2d8386e0f25e80
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -93,24 +93,6 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
                   { return mps::trunc_tensor(mpsGraph, inputTensor); });
 }
 
-TORCH_IMPL_FUNC(signbit_out_mps) (const Tensor& self, const Tensor& output) {
-  mps::unary_op(self, output, "signbit_out_mps",
-                ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
-                    MPSGraphTensor* output;
-                    // signbit is not implemented for int64 type.
-                    // workaround for `Function signbitOp_i64 was not found in the library`
-                    if ([inputTensor dataType] == MPSDataTypeInt64) {
-                      MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0 dataType:inputTensor.dataType];
-                      output = [mpsGraph lessThanWithPrimaryTensor:inputTensor
-                                                   secondaryTensor:zeroTensor
-                                                              name:nil];
-                    } else {
-                      output = [mpsGraph signbitWithTensor: inputTensor name: nil];
-                    }
-                    return mps::castMPSTensor(mpsGraph, output, ScalarType::Bool);
-                 });
-}
-
 TORCH_IMPL_FUNC(sign_out_mps) (const Tensor& self, const Tensor& output) {
   mps::unary_op(self, output, "sign_out_mps",
                 ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
@@ -8533,7 +8533,6 @@
   dispatch:
     CPU: signbit_out
     CUDA: signbit_out
-    MPS: signbit_out_mps
     SparseCPU, SparseCUDA: signbit_sparse_out
     SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr_out
 
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -234,18 +234,17 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_nested(
         return std::make_tuple(Tensor(), Tensor());
     }
 }
-namespace{
+
 
 /**
  * This function is used to calculate two pieces of metadata that are needed
  * for use with flash-attention and efficient_attention kernels. They are the
  * cumulative sequence_length over a batch of sequences and the maximum sequence
  * length.
  *
- * @return A tuple of cumulative sequence lengths and the maximum sequence length,
- * and the last element in the cumulative_sequence_lengths
+ * @return A tuple of cumulative sequence lengths and the maximum sequence length
  */
-std::tuple<Tensor, int64_t, int64_t> cumulative_and_max_seq_len(Tensor qkv) {
+std::tuple<Tensor, int64_t> cumulative_and_max_seq_len(Tensor qkv) {
   TORCH_CHECK(
       qkv.is_nested(),
       "QKV must be nested for flash cumulative_seq_len calculation.")
@@ -275,7 +274,7 @@ std::tuple<Tensor, int64_t, int64_t> cumulative_and_max_seq_len(Tensor qkv) {
   // Send to GPU, this is pretty light weight calc for normal batch size
   // but maybe this needs to be on gpu
   cumulative_seqlen = cumulative_seqlen.to(TensorOptions().device(at::kCUDA));
-  return std::tuple<Tensor, int64_t, int64_t>{cumulative_seqlen, max_seqlen, sum};
+  return std::tuple<Tensor, int64_t>{cumulative_seqlen, max_seqlen};
 }
 
 /**
@@ -338,7 +337,6 @@ bool is_safe_to_get_storage_as_tensor(const NestedTensorImpl* tensor) {
   return true;
 }
 
-} // namespace
 std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
     const Tensor& query,
     const Tensor& key,
@@ -356,19 +354,19 @@ std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
   Tensor k_t = key.transpose(1, 2);
   Tensor v_t = value.transpose(1, 2);
 
-  auto cumulative_and_max_q_and_nnz_q = cumulative_and_max_seq_len(q_t);
-  auto cumulative_and_max_k_and_nnz_k = cumulative_and_max_seq_len(k_t);
+  auto cumulative_and_max_q = cumulative_and_max_seq_len(q_t);
+  auto cumulative_and_max_k = cumulative_and_max_seq_len(k_t);
 
   // K and V have to have the same Nnz, should probably torch_check
   // assume in order to not iterate over v
 
-  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q_and_nnz_q);
-  Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k_and_nnz_k);
+  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q);
+  Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k);
 
-  const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q_and_nnz_q);
+  const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q);
 
-  const int64_t Nnz_q = std::get<2>(cumulative_and_max_q_and_nnz_q);
-  const int64_t Nnz_kv = std::get<2>(cumulative_and_max_k_and_nnz_k);
+  const int64_t Nnz_q = cumulative_sequence_length_q[-1].item<int64_t>();
+  const int64_t Nnz_kv = cumulative_sequence_length_k[-1].item<int64_t>();
 
   Tensor query_buffer_reshaped;
   Tensor key_buffer_reshaped;
@@ -462,15 +460,15 @@ Tensor flash_attention_helper(
   int64_t head_dim{query.size(-1)};
   int64_t num_heads{query.size(-2)};
 
-  auto cumulative_and_max_q_and_nnz_q = cumulative_and_max_seq_len(query);
-  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q_and_nnz_q);
-  int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q_and_nnz_q);
+  auto cumulative_and_max_q = cumulative_and_max_seq_len(query);
+  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q);
+  int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q);
 
   TORCH_CHECK(
       key.is_same(key) && query.is_same(value),
       "Key and Value must be the same tensor");
 
-  int64_t Nnz_q = std::get<2>(cumulative_and_max_q_and_nnz_q);
+  int64_t Nnz_q{cumulative_sequence_length_q[-1].item<int64_t>()};
 
   // For the packed case we need to set the output size for dim 2 to 1
   auto atten_size = get_nested_size_tensor(query).clone();
diff --git a/test/inductor/test_smoke.py b/test/inductor/test_smoke.py
diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
@@ -329,6 +329,7 @@ def fn(
                         self.assertTrue(isinstance(ten, FakeTensor))
                     self.assertEqual(ten.device.type, 'cuda')
 
+    @skipIfRocm
     @unittest.skipIf(not RUN_CUDA, "requires cuda")
     def test_fallback_memory_prop(self):
         m = nn.Conv2d(16, 33, 3, stride=2, device="cuda", dtype=torch.half)
diff --git a/test/test_mps.py b/test/test_mps.py
@@ -4175,20 +4175,6 @@ def helper(shape):
 
         helper((2, 8, 4, 5))
 
-    def test_signbit(self):
-        def helper(shape, dtype):
-            cpu_x = torch.randn(shape, device='cpu').to(dtype)
-            x = cpu_x.clone().to('mps')
-
-            signbit_result = torch.signbit(x)
-            signbit_result_cpu = torch.signbit(cpu_x)
-
-            self.assertEqual(signbit_result, signbit_result_cpu)
-
-        helper((2, 8, 4, 5), torch.int)
-        helper((2, 8, 4, 5), torch.float)
-        helper((2, 8, 4, 5), torch.int64)
-
     # Test neg
     def test_neg(self):
         def helper(shape):
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
@@ -527,8 +527,8 @@ class Value:
 
 # Defined in torch/csrc/jit/ir/ir.h
 class Block:
-    def inputs(self) -> Iterator[Value]: ...
-    def outputs(self) -> Iterator[Value]: ...
+    def inputs(self) -> List[Value]: ...
+    def outputs(self) -> List[Value]: ...
     def nodes(self) -> Iterator[Node]: ...
     def paramNode(self) -> Node: ...
     def returnNode(self) -> Node: ...
@@ -542,11 +542,11 @@ class Node:
     def __getitem__(self, key: str) -> Any: ...
     def schema(self) -> str: ...
     def input(self) -> Value: ...
-    def inputs(self) -> Iterator[Value]: ...
+    def inputs(self) -> List[Value]: ...
     def inputsAt(self, idx: _int) -> Value: ...
     def inputsSize(self) -> _int: ...
     def output(self) -> Value: ...
-    def outputs(self) -> Iterator[Value]: ...
+    def outputs(self) -> List[Value]: ...
     def outputsAt(self, idx: _int) -> Value: ...
     def outputsSize(self) -> _int: ...
     def hasMultipleOutputs(self) -> _bool: ...
@@ -622,8 +622,8 @@ class Node:
 
 # Defined in torch/torch/csrc/jit/ir/ir.h
 class Graph:
-    def inputs(self) -> Iterator[Value]: ...
-    def outputs(self) -> Iterator[Value]: ...
+    def inputs(self) -> List[Value]: ...
+    def outputs(self) -> List[Value]: ...
     def nodes(self) -> Iterator[Node]: ...
     def param_node(self) -> Node: ...
     def return_node(self) -> Node: ...

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-0d7807d59520289b2065b4db4a138b7fba2f61fd` |
|  | `1` | `+9c112935abe400222cca8f9fbc2d8386e0f25e80` |