
Commit f5df685

xwang233 authored and pytorchmergebot committed
Enable channels_last_3d on SyncBatchNorm (#88401)
This PR enables the use of fast channels_last kernels on SyncBatchNorm with the channels_last_3d memory format. With a small benchmark script (see #88021 (comment)), on V100 I got:

master:
```
DDP channels_last=False, run_forward_backward, time: 0.8945400714874268 sec
DDP channels_last=True, run_forward_backward, time: 1.4736433029174805 sec
```

This PR:
```
DDP channels_last=False, run_forward_backward, time: 0.8927242755889893 sec
DDP channels_last=True, run_forward_backward, time: 0.48697471618652344 sec
```

This PR is a follow-up of #46906.

Close #88021

Pull Request resolved: #88401
Approved by: https://github.com/ngimel
1 parent 8023c9d commit f5df685
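
For context, a minimal single-process sketch of the kind of forward/backward timing quoted above. The toy model, input shapes, port, and iteration count are assumptions for illustration only; this is not the benchmark script from #88021.

```python
import time
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def run_forward_backward(channels_last: bool, iters: int = 20) -> float:
    # channels_last_3d is the 5-D (NCDHW) analogue of channels_last; this PR
    # lets SyncBatchNorm keep that layout instead of falling back to NCDHW.
    memory_format = torch.channels_last_3d if channels_last else torch.contiguous_format
    model = torch.nn.Sequential(
        torch.nn.Conv3d(4, 8, kernel_size=3, padding=1),  # toy 3-D model (assumption)
        torch.nn.SyncBatchNorm(8),
    ).cuda().to(memory_format=memory_format)
    model = DDP(model, device_ids=[0])
    x = torch.randn(2, 4, 16, 32, 32, device="cuda").to(memory_format=memory_format)

    torch.cuda.synchronize()
    start = time.time()
    for _ in range(iters):
        model(x).sum().backward()
    torch.cuda.synchronize()
    return time.time() - start


if __name__ == "__main__":
    # Single-process process group so SyncBatchNorm/DDP can run on one GPU.
    dist.init_process_group(
        "nccl", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1
    )
    for cl in (False, True):
        t = run_forward_backward(cl)
        print(f"DDP channels_last={cl}, run_forward_backward, time: {t} sec")
    dist.destroy_process_group()
```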

File tree

4 files changed, +26 -12 lines


aten/src/ATen/native/cuda/Normalization.cu

Lines changed: 5 additions & 2 deletions
@@ -48,8 +48,11 @@ bool is_mixed_type(const Tensor& input, const Args&... parameters) {
 }
 
 inline bool batch_norm_use_channels_last_kernels(const at::Tensor& self) {
-  return (self.is_contiguous(at::MemoryFormat::ChannelsLast) ||
-          (self.is_contiguous() && self.strides()[1] == 1));
+  return (
+      self.is_contiguous(at::MemoryFormat::ChannelsLast) ||
+      self.is_contiguous(at::MemoryFormat::ChannelsLast3d) ||
+      (self.is_contiguous() && self.strides()[1] == 1)
+  );
 }
 
 enum class Impl {
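
For illustration, a small Python sketch of the memory formats the updated kernel-selection check now recognizes; the shapes are arbitrary examples.

```python
import torch

# 4-D NCHW tensor in channels_last layout: already covered by the old check.
x4d = torch.randn(2, 8, 16, 16).to(memory_format=torch.channels_last)
print(x4d.is_contiguous(memory_format=torch.channels_last))     # True

# 5-D NCDHW tensor in channels_last_3d layout: the case this PR adds.
x5d = torch.randn(2, 8, 4, 16, 16).to(memory_format=torch.channels_last_3d)
print(x5d.is_contiguous(memory_format=torch.channels_last_3d))  # True
print(x5d.stride())  # (8192, 1, 2048, 128, 8): the channel stride is 1
```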

test/test_nn.py

Lines changed: 5 additions & 5 deletions
@@ -10283,16 +10283,16 @@ def test_sync_batchnorm_accuracy_cuda(self):
         # fwd: torch.batch_norm_stats, torch.batch_norm_gather_stats_with_counts, torch.batch_norm_elemt
         # bwd: torch.batch_norm_backward_reduce, torch.batch_norm_backward_elemt
 
-        def _batch_norm_stats(data):
+        def _batch_norm_stats(data, memory_format, mean_axes):
             mean1, _ = torch.batch_norm_stats(data, 1e-5)
-            mean2, _ = torch.batch_norm_stats(data.to(memory_format=torch.channels_last), 1e-5)
-            mean_ref = torch.mean(data, (0, 2, 3), keepdim=False)
+            mean2, _ = torch.batch_norm_stats(data.to(memory_format=memory_format), 1e-5)
+            mean_ref = torch.mean(data, mean_axes, keepdim=False)
 
             self.assertEqual(mean_ref, mean1)
             self.assertEqual(mean_ref, mean2)
 
-        data = torch.randn(1, 96, 112, 112, dtype=torch.float, device='cuda')
-        _batch_norm_stats(data)
+        _batch_norm_stats(torch.randn(1, 96, 112, 112, dtype=torch.float, device='cuda'), torch.channels_last, (0, 2, 3))
+        _batch_norm_stats(torch.randn(1, 96, 112, 112, 112, dtype=torch.float, device='cuda'), torch.channels_last_3d, (0, 2, 3, 4))
 
     def test_flatten(self):
         tensor_input = torch.randn(2, 1, 2, 3)

torch/nn/modules/_functions.py

Lines changed: 8 additions & 2 deletions
@@ -7,7 +7,10 @@ class SyncBatchNorm(Function):
 
     @staticmethod
     def forward(self, input, weight, bias, running_mean, running_var, eps, momentum, process_group, world_size):
-        if not input.is_contiguous(memory_format=torch.channels_last):
+        if not (
+            input.is_contiguous(memory_format=torch.channels_last) or
+            input.is_contiguous(memory_format=torch.channels_last_3d)
+        ):
             input = input.contiguous()
         if weight is not None:
             weight = weight.contiguous()
@@ -104,7 +107,10 @@ def forward(self, input, weight, bias, running_mean, running_var, eps, momentum,
 
     @staticmethod
    def backward(self, grad_output):
-        if not grad_output.is_contiguous(memory_format=torch.channels_last):
+        if not (
+            grad_output.is_contiguous(memory_format=torch.channels_last) or
+            grad_output.is_contiguous(memory_format=torch.channels_last_3d)
+        ):
             grad_output = grad_output.contiguous()
         saved_input, weight, mean, invstd, count_tensor = self.saved_tensors
         grad_input = grad_weight = grad_bias = None
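
For illustration, a short standalone sketch of why widening these checks matters: under the old condition a channels_last_3d tensor fails the channels_last test and is rewritten to the default NCDHW layout, dropping it off the fast channels_last kernel path, while the new condition leaves it untouched. The shape is an arbitrary example.

```python
import torch

x = torch.randn(2, 8, 4, 16, 16).to(memory_format=torch.channels_last_3d)

# Old check: only channels_last is recognized, so a channels_last_3d input
# is converted back to the default contiguous (NCDHW) layout.
y_old = x
if not y_old.is_contiguous(memory_format=torch.channels_last):
    y_old = y_old.contiguous()
print(y_old.is_contiguous(memory_format=torch.channels_last_3d))  # False

# New check: channels_last_3d inputs keep their layout.
y_new = x
if not (y_new.is_contiguous(memory_format=torch.channels_last) or
        y_new.is_contiguous(memory_format=torch.channels_last_3d)):
    y_new = y_new.contiguous()
print(y_new.is_contiguous(memory_format=torch.channels_last_3d))  # True
```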

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 8 additions & 3 deletions
@@ -5324,6 +5324,10 @@ def test_post_localSGD_optimizer_step_reload(self):
     )
     @skip_if_no_gpu
     def test_DistributedDataParallel_SyncBatchNorm_Channels_Last(self):
+        self._test_DistributedDataParallel_SyncBatchNorm_with_memory_format(torch.channels_last)
+        self._test_DistributedDataParallel_SyncBatchNorm_with_memory_format(torch.channels_last_3d)
+
+    def _test_DistributedDataParallel_SyncBatchNorm_with_memory_format(self, memory_format):
         group, group_id, rank = self._init_global_test()
         num_processes = dist.get_world_size()
         local_bs = 2
@@ -5336,14 +5340,15 @@ def test_DistributedDataParallel_SyncBatchNorm_Channels_Last(self):
             model_gpu, device_ids=[rank]
         )
 
-        memory_format = torch.channels_last
+        shapes = [global_bs, 2, 4, 4] + ([] if memory_format is torch.channels_last else [4])
+
         input_gpu = (
-            torch.randn(global_bs, 2, 4, 4, dtype=torch.float)
+            torch.randn(*shapes, dtype=torch.float)
             .cuda(rank)
             .to(memory_format=memory_format)
         )
         target_gpu = (
-            torch.randn(global_bs, 2, 4, 4, dtype=torch.float)
+            torch.randn(*shapes, dtype=torch.float)
             .cuda(rank)
             .to(memory_format=memory_format)
         )
