Commit b9e68e0

IvanYashchuk authored and facebook-github-bot committed
Fix the bug in THCTensor_(baddbmm) and ATen's addmm_cuda for strided views input (#42425)
Summary: Fixes #42418. The problem was that non-contiguous batched matrices were passed to `gemmStridedBatched`. The following code fails on master and works with the proposed patch:

```python
import torch

x = torch.tensor([[1., 2, 3], [4., 5, 6]], device='cuda:0')
c = torch.as_strided(x, size=[2, 2, 2], stride=[3, 1, 1])
torch.einsum('...ab,...bc->...ac', c, c)
```

Pull Request resolved: #42425

Reviewed By: glaringlee

Differential Revision: D22925266

Pulled By: ngimel

fbshipit-source-id: a72d56d26c7381b7793a047d76bcc5bd45a9602c
1 parent 317b9d3 commit b9e68e0
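The root cause is easiest to see on the strides themselves. Below is a minimal illustration (not code from the patch) of why the old stride check accepted this overlapping view while the new one rejects it; it runs on CPU since only sizes and strides matter here:

```python
import torch

# The repro's view: each 2x2 matrix is cut from x with stride [1, 1],
# so its two "rows" overlap in memory.
x = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
c = torch.as_strided(x, size=[2, 2, 2], stride=[3, 1, 1])

mat = c[0]  # sizes (2, 2), strides (1, 1)
(rows, cols), (s0, s1) = mat.size(), mat.stride()

# Old check: any nonzero companion stride counted as a valid cuBLAS
# layout, so this view went straight to gemmStridedBatched.
old_ok = (s0 == 1 and s1 != 0) or (s1 == 1 and s0 != 0)        # True

# New check: the leading dimension must be at least the length of the
# unit-stride dimension, otherwise consecutive columns/rows alias.
new_ok = (s0 == 1 and s1 >= max(1, rows)) or \
         (s1 == 1 and s0 >= max(1, cols))                      # False

print(old_ok, new_ok)  # True False -> the patched code copies to contiguous
```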

File tree: 3 files changed, +32 −8 lines changed


aten/src/ATen/native/cuda/LinearAlgebra.cu

Lines changed: 3 additions & 2 deletions

```diff
@@ -34,11 +34,12 @@ Tensor bmm_cuda(const Tensor& self, const Tensor& mat2) {
 Tensor prepare_matrix_for_cublas(Tensor& tensor, bool& transpose_tensor) {
   Tensor tensor_;
   IntArrayRef tensor_strides = tensor.strides();
+  IntArrayRef tensor_sizes = tensor.sizes();
 
-  if ((tensor_strides[0] == 1) && (tensor_strides[1] != 0)) {
+  if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max<int64_t>(1, tensor_sizes[0]))) {
     tensor_ = tensor;
     transpose_tensor = false;
-  } else if ((tensor_strides[1] == 1) && (tensor_strides[0] != 0)) {
+  } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max<int64_t>(1, tensor_sizes[1]))) {
     tensor_ = tensor;
     transpose_tensor = true;
   } else {
```
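In cuBLAS terms the patched condition says: a matrix may be passed as-is only if one dimension has stride 1 and the other stride is a valid leading dimension, i.e. at least the length of the unit-stride dimension. A hedged Python mirror of that check follows; `prepare_matrix_for_cublas_py` is a hypothetical name, and the copy branch is abbreviated (the hunk above does not show what the real `else` branch does, so treat that detail as an assumption):

```python
import torch

def prepare_matrix_for_cublas_py(tensor):
    # Python mirror of the patched C++ check, for illustration only.
    (r, c), (s0, s1) = tensor.size(), tensor.stride()
    if s0 == 1 and s1 >= max(1, r):
        return tensor, False   # usable column-major, ld = s1
    if s1 == 1 and s0 >= max(1, c):
        return tensor, True    # usable row-major (pass transposed), ld = s0
    # Neither layout is safe (e.g. overlapping strides): copy first.
    # Assumption: the real else branch clones to a contiguous tensor.
    return tensor.contiguous(), True

x = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
overlapping = torch.as_strided(x, size=[2, 2], stride=[1, 1])
fixed, transpose = prepare_matrix_for_cublas_py(overlapping)
assert fixed.is_contiguous()   # the fix routes this view through a copy
```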

aten/src/THC/generic/THCTensorMathBlas.cu

Lines changed: 12 additions & 6 deletions

```diff
@@ -51,13 +51,15 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t,
   char transpose_batch1, transpose_batch2;
   int64_t lda, ldb, ldc;
   THCTensor *result_, *batch1_, *batch2_;
-  if (result->stride(1) == 1)
+  if (result->stride(1) == 1 &&
+     (result->size(2) == 1 || result->stride(2) >= std::max<int64_t>(1, result->size(1))))
   {
     transpose_result = false;
     result_ = result;
     ldc = result_->stride(2);
   }
-  else if (result->stride(2) == 1)
+  else if (result->stride(2) == 1 &&
+          (result->size(1) == 1 || result->stride(1) >= std::max<int64_t>(1, result->size(2))))
   {
     transpose_result = true;
@@ -80,15 +82,19 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t,
     ldc = result_->stride(2);
   }
 
+  const int64_t m = result->size(transpose_result ? 2 : 1);
+  const int64_t n = result->size(transpose_result ? 1 : 2);
+  const int64_t k = batch1->size(transpose_result ? 1 : 2);
+
   if (batch1->stride(transpose_result ? 2 : 1) == 1 &&
-      batch1->stride(transpose_result ? 1 : 2) != 0)
+      batch1->stride(transpose_result ? 1 : 2) >= std::max<int64_t>(1, m))
   {
     transpose_batch1 = 'n';
     batch1_ = batch1;
     lda = batch1_->stride(transpose_result ? 1 : 2);
   }
   else if (batch1->stride(transpose_result ? 1 : 2) == 1 &&
-           batch1->stride(transpose_result ? 2 : 1) != 0)
+           batch1->stride(transpose_result ? 2 : 1) >= std::max<int64_t>(1, k))
   {
     transpose_batch1 = 't';
     batch1_ = batch1;
@@ -107,14 +113,14 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t,
   }
 
   if (batch2->stride(transpose_result ? 2 : 1) == 1 &&
-      batch2->stride(transpose_result ? 1 : 2) != 0)
+      batch2->stride(transpose_result ? 1 : 2) >= std::max<int64_t>(1, k))
   {
     transpose_batch2 = 'n';
     batch2_ = batch2;
     ldb = batch2_->stride(transpose_result ? 1 : 2);
   }
   else if (batch2->stride(transpose_result ? 1 : 2) == 1 &&
-           batch2->stride(transpose_result ? 2 : 1) != 0)
+           batch2->stride(transpose_result ? 2 : 1) >= std::max<int64_t>(1, n))
   {
     transpose_batch2 = 't';
     batch2_ = batch2;
```
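The same rule applies per operand of the batched gemm: an `'n'` operand's leading dimension must span all m rows it claims to hold, a `'t'` operand's all k. Here is a sketch of the batch1 branch in Python, with hypothetical names, assuming the fall-through case copies the batch to contiguous as the surrounding THC code does:

```python
import torch

def pick_batch1_layout(batch1, transpose_result, m, k):
    # Mirrors the patched checks. Which dim holds the m rows vs. the k
    # columns swaps when the result itself is handled transposed.
    row_dim = 2 if transpose_result else 1
    col_dim = 1 if transpose_result else 2
    if batch1.stride(row_dim) == 1 and batch1.stride(col_dim) >= max(1, m):
        return 'n', batch1.stride(col_dim)      # usable as-is, lda
    if batch1.stride(col_dim) == 1 and batch1.stride(row_dim) >= max(1, k):
        return 't', batch1.stride(row_dim)      # usable transposed, lda
    return None, None  # neither layout is safe; the C code copies instead

x = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
overlapping = torch.as_strided(x, size=[2, 2, 2], stride=[3, 1, 1])
print(pick_batch1_layout(overlapping, False, m=2, k=2))  # (None, None)
```

Under the old `!= 0` test, the overlapping view above would have been classified `'n'` with lda = 1, which is what corrupted the `gemmStridedBatched` result.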

test/test_torch.py

Lines changed: 17 additions & 0 deletions

```diff
@@ -17083,6 +17083,23 @@ def genf_float(x, y):
 
         _test_mm(n, m, p, dtype, genf)
 
+    @onlyOnCPUAndCUDA
+    @dtypes(torch.float32, torch.float64)
+    @unittest.skipIf(not TEST_NUMPY, "NumPy not found")
+    def test_strided_mm_bmm(self, device, dtype):
+        # Tests strided view case with stride smaller than corresponding dimension size
+        x = torch.tensor([[1., 2., 3.], [4., 5., 6.]], dtype=dtype, device=device)
+        new_shape = [2, 2, 2]
+        new_stride = [3, 1, 1]
+        sx = torch.as_strided(x, size=new_shape, stride=new_stride)
+
+        torch_fn = lambda x: torch.bmm(x, x)  # noqa: E731
+        np_fn = lambda x: np.matmul(x, x)  # noqa: E731
+        self.compare_with_numpy(torch_fn, np_fn, sx)
+
+        torch_fn = lambda x: torch.mm(x, x)  # noqa: E731
+        self.compare_with_numpy(torch_fn, np_fn, sx[0])
+
     @onlyCPU
     @dtypes(torch.float)
     def test_bmm(self, device, dtype):
```
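For reference, a standalone version of what the new test asserts, runnable outside the test harness (shown on CPU for portability, though the regression itself was CUDA-specific):

```python
import numpy as np
import torch

x = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
sx = torch.as_strided(x, size=[2, 2, 2], stride=[3, 1, 1])

# bmm on the overlapping batched view must match NumPy's matmul...
assert np.allclose(torch.bmm(sx, sx).numpy(),
                   np.matmul(sx.numpy(), sx.numpy()))
# ...and mm on a single overlapping matrix must as well.
assert np.allclose(torch.mm(sx[0], sx[0]).numpy(),
                   np.matmul(sx[0].numpy(), sx[0].numpy()))
```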
