Skip to content

Commit beaa5c5

Browse files
kulinseth, razarmehr, qqaatw, DenisVieriu97
authored
[MPS] View fixes (#95323)
* [MPS] Fix the uint8 type issue with View ops kernels (#95145) This should fix the problem in Resnet model with image artifacts due to saturation on int8 type and also the incorrect class recognition reported in #86954. Fixes #86954 Pull Request resolved: #95145 Approved by: https://github.com/kulinseth, https://github.com/DenisVieriu97 * [MPS] Fix tensor with non-zero storage offset graph gathering (#91071) Previously, the "can slice" flag in Placeholder constructor in `OperationUtils.mm` is conditioned on whether the numbers of dimensions of base shape and view shape are the same. This doesn't consider the situation that a view tensor could be the base tensor's sliced and then unsqueezed version, resulting in different num of dims. For example, if we want to stack `y_mps` and `x_mps` on the last dim: ``` t_mps = torch.tensor([1, 2, 3, 4], device="mps") x_mps = t_mps[2:] # [3, 4] y_mps = t_mps[:2] # [1, 2] res_mps = torch.stack((y_mps, x_mps), dim=-1) ``` the kernel will unsqueeze both of them on the last dim and then concatenate them, which is equivalent to: ``` res_mps = torch.cat((y_mps.unsqueeze(-1), x_mps.unsqueeze(-1)), dim=-1) ``` `x_mps.unsqueeze(-1)` is an unsqueezed and contiguous tensor with a storage offset, this kind of tensors should be sliceable without cloning its storage. Fixes #87856 Fixes #91065 Pull Request resolved: #91071 Approved by: https://github.com/kulinseth * [MPS] Fix fill_ where input tensor has a storage offset (#95113) Fixes #94390 Apart from fixing the issue above, this PR also fixes a bug that when an input tensor can be sliced, a sliced array view is created. This array view seems to be not writable or have a different storage from the original tensor, causing incorrect results with the in-place `fill`. 
Pull Request resolved: #95113 Approved by: https://github.com/kulinseth * [MPS] Fix view op slicing for 2nd dim in case of 0 offset (#95381) * Fix view op slicing for 2nd dim in case of 0 offset Pull Request resolved: #95381 Approved by: https://github.com/razarmehr --------- Co-authored-by: Ramin Azarmehr <razarmehr@apple.com> Co-authored-by: Li-Huai (Allan) Lin <qqaatw@gmail.com> Co-authored-by: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
1 parent 4bd5c1e commit beaa5c5

File tree

4 files changed

+173
-31
lines changed

4 files changed

+173
-31
lines changed

aten/src/ATen/native/mps/OperationUtils.mm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ void printTensorNDArray(const Tensor& t) {
289289
} else {
290290
if (!mpsShape) {
291291
mpsShape = getMPSShape(_tensor);
292-
}
292+
}
293293

294294
_value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf
295295
shape:mpsShape

aten/src/ATen/native/mps/operations/ConstantOps.mm

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
}
1313
Tensor output = self;
1414
bool needsCopyToOutput = false;
15-
if (!self.is_contiguous()) {
15+
if (!self.is_contiguous() || self.storage_offset()) {
1616
output = empty_mps(self.sizes(), self.scalar_type(), c10::nullopt, kMPS);
1717
needsCopyToOutput = true;
1818
}
@@ -89,7 +89,7 @@ bool fill_mps_tensor_(Tensor& self, uint8_t value) {
8989
if (self.is_contiguous()) {
9090
MPSStream* stream = getCurrentMPSStream();
9191
auto storage_byte_offset = self.storage_offset() * self.itemsize();
92-
stream->fill(mps::getMTLBufferStorage(self), 0, self.nbytes(), storage_byte_offset);
92+
stream->fill(mps::getMTLBufferStorage(self), 0, self.storage().nbytes(), storage_byte_offset);
9393
return true;
9494
}
9595
return false;

aten/src/ATen/native/mps/operations/View.mm

Lines changed: 83 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -424,38 +424,76 @@
424424
}
425425

426426
static
427-
std::vector<int64_t> getViewShape(const Tensor& src, MPSShape *mpsShape) {
427+
std::vector<int64_t> getViewShape(const Tensor& src, MPSShape *mpsShape, const bool squeeze) {
428428
bool hasMPSShape = (mpsShape != nil);
429429
std::vector<int64_t> src_view_shape;
430430
if (hasMPSShape) {
431431
int src_ndim_view = [mpsShape count];
432-
src_view_shape.resize(src_ndim_view);
433-
for (const auto i : c10::irange(src_ndim_view)) {
434-
src_view_shape[i] = [mpsShape[i] intValue];
432+
if (squeeze) {
433+
for (const auto i : c10::irange(src_ndim_view)) {
434+
if ([mpsShape[i] intValue] == 1)
435+
continue;
436+
src_view_shape.emplace_back([mpsShape[i] intValue]);
437+
}
438+
} else {
439+
src_view_shape.resize(src_ndim_view);
440+
for (const auto i : c10::irange(src_ndim_view)) {
441+
src_view_shape[i] = [mpsShape[i] intValue];
442+
}
435443
}
444+
436445
} else {
437-
src_view_shape = src.sizes().vec();
446+
if (squeeze) {
447+
IntArrayRef src_shape = src.sizes();
448+
size_t src_ndim_view = src_shape.size();
449+
for (const auto i : c10::irange(src_ndim_view)) {
450+
if (src_shape[i] == 1)
451+
continue;
452+
src_view_shape.emplace_back(src_shape[i]);
453+
}
454+
} else {
455+
src_view_shape = src.sizes().vec();
456+
}
438457
}
439458

440459
return src_view_shape;
441460
}
442461

462+
463+
std::vector<int64_t> getSqueezedBaseShape(const Tensor& src, IntArrayRef shape) {
464+
std::vector<int64_t> src_base_shape;
465+
for (const auto i : c10::irange(shape.size())) {
466+
if (shape[i] == 1)
467+
continue;
468+
src_base_shape.emplace_back(shape[i]);
469+
}
470+
471+
return src_base_shape;
472+
}
473+
474+
443475
bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
444476
if (!src.is_contiguous()) {
445477
return false;
446478
}
447479

448480
IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
481+
std::vector<int64_t> src_base_squeezed_shape = getSqueezedBaseShape(src, src_base_shape);
449482
size_t src_ndim_base = src_base_shape.size();
450-
std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
451-
size_t src_ndim_view = src_view_shape.size();
483+
size_t src_squeezed_ndim_base = src_base_squeezed_shape.size();
484+
std::vector<int64_t> src_view_squeezed_shape = getViewShape(src, mpsShape, true);
485+
size_t src_ndim_view = getViewShape(src, mpsShape, false).size();
486+
size_t src_squeezed_ndim_view = src_view_squeezed_shape.size();
487+
452488
if (src_ndim_base != src_ndim_view) {
453489
return false;
454490
}
455491

456-
for (const auto i: c10::irange(src_ndim_base)) {
457-
if (src_view_shape[i] > src_base_shape[i]) {
458-
return false;
492+
if (src_squeezed_ndim_base == src_squeezed_ndim_view) {
493+
for (const auto i: c10::irange(src_squeezed_ndim_base)) {
494+
if (src_view_squeezed_shape[i] > src_base_squeezed_shape[i]) {
495+
return false;
496+
}
459497
}
460498
}
461499

@@ -464,40 +502,63 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
464502

465503
MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType) {
466504
IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
467-
int src_ndim_base = src_base_shape.size();
468-
std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
469-
int src_ndim_view = src_view_shape.size();
470-
471-
TORCH_CHECK(src_ndim_base == src_ndim_view);
505+
size_t src_ndim_base = src_base_shape.size();
506+
std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape, false);
507+
size_t src_ndim_view = src_view_shape.size();
472508

473509
MPSNDArray *srcTensorNDArrayView = nil;
474510
MPSNDArrayDescriptor *srcTensorNDArrayDesc = nil;
475511
MPSNDArray *srcTensorNDArray = nil;
476512
id<MTLCommandBuffer> commandBuffer = getCurrentMPSStream()->commandBuffer();
477513

514+
int64_t base_idx = 0;
515+
516+
std::vector<int64_t> src_base_shape_vec;
517+
518+
if (src_ndim_view != src_ndim_base) {
519+
src_base_shape_vec.reserve(src_ndim_view);
520+
for (const auto i : c10::irange(src_ndim_view)) {
521+
if (src_view_shape[i] == 1 && src_base_shape[base_idx] != 1) {
522+
src_base_shape_vec.emplace_back(1);
523+
} else {
524+
src_base_shape_vec.emplace_back(src_base_shape[base_idx]);
525+
if (base_idx < src_ndim_base - 1)
526+
base_idx += 1;
527+
}
528+
}
529+
src_base_shape = IntArrayRef(src_base_shape_vec);
530+
src_ndim_base = src_base_shape.size();
531+
}
532+
478533
srcTensorNDArray = ndArrayFromTensor(src, getMPSShape(src_base_shape), mpsDataType);
479534
srcTensorNDArrayDesc = srcTensorNDArray.descriptor;
480535

481-
int firstDimToSlice = 0;
536+
size_t firstDimToSlice = 0;
482537
while (src_base_shape[firstDimToSlice] == src_view_shape[firstDimToSlice]) {
483538
firstDimToSlice++;
484539
}
485540

486-
int view_numel = 1;
541+
int64_t view_numel = 1;
487542
for (const auto i : c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
488543
view_numel *= src_base_shape[i];
489544
}
490545

491-
int sliceOffset = src.storage_offset() / view_numel;
546+
int64_t sliceOffset = src.storage_offset() / view_numel;
492547
// There are cases where both dimensions of a view can shrink
493548
// E.g: x = torch.randn((3,6))[1, 1:3]
494-
int nextSliceOffset = src.storage_offset() % view_numel;
549+
int64_t nextSliceOffset = 0;
550+
bool sliceNextDim = (firstDimToSlice < (src_base_shape.size() - 1)) &&
551+
(src_view_shape[firstDimToSlice + 1] != src_base_shape[firstDimToSlice + 1]);
495552

496553
[srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice])}];
497-
if (nextSliceOffset) {
554+
if (sliceNextDim) {
555+
if (firstDimToSlice + 1 == src_base_shape.size() - 1) {
556+
nextSliceOffset = src.storage_offset() % src_base_shape[src_base_shape.size() - 1];
557+
} else {
558+
nextSliceOffset = (src.storage_offset() % view_numel) / (view_numel / src_base_shape[firstDimToSlice + 1]);
559+
}
498560
[srcTensorNDArrayDesc sliceDimension:src_ndim_base - 2 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(nextSliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice+1])}];
499561
}
500-
501562
srcTensorNDArrayView = [srcTensorNDArray arrayViewWithCommandBuffer:commandBuffer
502563
descriptor:srcTensorNDArrayDesc
503564
aliasing:MPSAliasingStrategyShallAlias];
@@ -696,7 +757,7 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
696757
{c10::ScalarType::Int, "int"},
697758
{c10::ScalarType::Short, "short"},
698759
{c10::ScalarType::Char, "char"},
699-
{c10::ScalarType::Byte, "char"},
760+
{c10::ScalarType::Byte, "uchar"},
700761
{c10::ScalarType::Bool, "bool"},
701762
};
702763

test/test_mps.py

Lines changed: 87 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,27 @@ def helper(val, shape):
435435
helper(0, [1024])
436436
helper(0.2, [2, 3])
437437

438+
def test_fill_storage_offset(self):
439+
shape = [2, 10]
440+
val = 0.2
441+
tensor = torch.ones(shape, device="mps")
442+
tensor_mps = tensor[:][1].fill_(val)
443+
tensor_0 = torch.ones(shape, device="cpu")
444+
tensor_cpu = tensor_0[:][1].fill_(val)
445+
446+
self.assertEqual(tensor_mps, tensor_cpu)
447+
448+
shape = [1, 10]
449+
val = 0.0
450+
tensor = torch.ones(shape, device="mps")
451+
val_tensor_mps = torch.tensor(val, device="mps")
452+
tensor_mps = tensor[:, 9].fill_(val_tensor_mps)
453+
tensor_0 = torch.ones(shape, device="cpu")
454+
val_tensor_cpu = torch.tensor(val, device="cpu")
455+
tensor_cpu = tensor_0[:, 9].fill_(val_tensor_cpu)
456+
457+
self.assertEqual(tensor_mps, tensor_cpu)
458+
438459
def test_cdist_large(self, device="mps"):
439460
for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
440461
x = torch.randn(100, 10, device=device)
@@ -1806,6 +1827,63 @@ def test_slice_reshape_contg_view(self):
18061827

18071828
self.assertEqual(r_mps, r_cpu)
18081829

1830+
def test_contiguous_slice_2d(self):
1831+
def helper(shape):
1832+
for i in range(0, shape[0]):
1833+
for j in range(0, shape[1]):
1834+
t_mps = torch.randn(shape, device="mps")
1835+
t_cpu = t_mps.detach().clone().cpu()
1836+
1837+
y_mps = t_mps[i:, :j]
1838+
y_cpu = t_cpu[i:, :j]
1839+
self.assertEqual(y_mps + 1, y_cpu + 1)
1840+
1841+
y_mps = t_mps[i:, j]
1842+
y_cpu = t_cpu[i:, j]
1843+
self.assertEqual(y_mps + 1, y_cpu + 1)
1844+
1845+
y_mps = t_mps[i, :j]
1846+
y_cpu = t_cpu[i, :j]
1847+
self.assertEqual(y_mps + 1, y_cpu + 1)
1848+
1849+
y_mps = t_mps[:i, :j]
1850+
y_cpu = t_cpu[:i, :j]
1851+
self.assertEqual(y_mps + 1, y_cpu + 1)
1852+
1853+
y_mps = t_mps[:i, j]
1854+
y_cpu = t_cpu[:i, j]
1855+
self.assertEqual(y_mps + 1, y_cpu + 1)
1856+
1857+
y_mps = t_mps[:i, j:]
1858+
y_cpu = t_cpu[:i, j:]
1859+
self.assertEqual(y_mps + 1, y_cpu + 1)
1860+
1861+
l = []
1862+
for N in range(1, 3):
1863+
l.append(N)
1864+
for C in range(1, 3):
1865+
l.append(C)
1866+
helper(l)
1867+
for D in range(1, 3):
1868+
l.append(D)
1869+
helper(l)
1870+
for H in range(1, 3):
1871+
l.append(H)
1872+
helper(l)
1873+
for W in range(1, 3):
1874+
l.append(W)
1875+
helper(l)
1876+
l.pop()
1877+
l.pop()
1878+
l.pop()
1879+
l.pop()
1880+
l.pop()
1881+
1882+
helper([9, 15, 4])
1883+
helper([9, 3, 2])
1884+
helper([3, 4, 18, 22])
1885+
helper([3, 4, 18, 22, 150])
1886+
18091887
def test_view_slice(self):
18101888
# https://github.com/pytorch/pytorch/issues/83995
18111889
NUM_SAMPLES = 60
@@ -1899,25 +1977,28 @@ def helper(operator):
18991977
if operator == "<=":
19001978
res_mps = x_mps <= y_mps
19011979
res_cpu = x_cpu <= y_cpu
1902-
if operator == "<":
1980+
elif operator == "<":
19031981
res_mps = x_mps < y_mps
19041982
res_cpu = x_cpu < y_cpu
1905-
if operator == ">=":
1983+
elif operator == ">=":
19061984
res_mps = x_mps >= y_mps
19071985
res_cpu = x_cpu >= y_cpu
1908-
if operator == ">":
1986+
elif operator == ">":
19091987
res_mps = x_mps >= y_mps
19101988
res_cpu = x_cpu >= y_cpu
1911-
if operator == "==":
1989+
elif operator == "==":
19121990
res_mps = x_mps == y_mps
19131991
res_cpu = x_cpu == y_cpu
1914-
if operator == "!=":
1992+
elif operator == "!=":
19151993
res_mps = x_mps != y_mps
19161994
res_cpu = x_cpu != y_cpu
1995+
elif operator == "stack":
1996+
res_mps = torch.stack((y_mps, x_mps), dim=-1)
1997+
res_cpu = torch.stack((y_cpu, x_cpu), dim=-1)
19171998

19181999
self.assertEqual(res_mps, res_cpu)
19192000

1920-
for op in ["<=", "<", ">=", ">", "==", "!="]:
2001+
for op in ["<=", "<", ">=", ">", "==", "!=", "stack"]:
19212002
helper(op)
19222003

19232004
def test_slice_of_slice(self):

0 commit comments

Comments (0)