
Commit f5fdfa2

enable channels_last_3d on SyncBatchNorm
1 parent a8561c4 commit f5fdfa2

File tree

4 files changed: +60 −9 lines changed


aten/src/ATen/native/cuda/Normalization.cu

Lines changed: 5 additions & 2 deletions

@@ -48,8 +48,11 @@ bool is_mixed_type(const Tensor& input, const Args&... parameters) {
 }
 
 inline bool batch_norm_use_channels_last_kernels(const at::Tensor& self) {
-  return (self.is_contiguous(at::MemoryFormat::ChannelsLast) ||
-          (self.is_contiguous() && self.strides()[1] == 1));
+  return (
+      self.is_contiguous(at::MemoryFormat::ChannelsLast) ||
+      self.is_contiguous(at::MemoryFormat::ChannelsLast3d) ||
+      (self.is_contiguous() && self.strides()[1] == 1)
+  );
 }
 
 enum class Impl {
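The dispatch predicate above now routes NDHWC (channels_last_3d) tensors to the channels-last batch-norm kernels, in addition to NHWC and the degenerate-contiguous case. A minimal sketch of what the new branch matches, using only public PyTorch APIs (the shape is arbitrary, chosen for illustration):

import torch

# A 5-D (N, C, D, H, W) tensor converted to channels_last_3d stores the
# channel dimension with stride 1, which is the layout the channels-last
# CUDA kernels expect.
x = torch.randn(2, 8, 4, 4, 4).to(memory_format=torch.channels_last_3d)
print(x.is_contiguous(memory_format=torch.channels_last_3d))  # True
print(x.stride())                                             # (512, 1, 128, 32, 8)

# A plain contiguous NCDHW tensor of the same shape does not match.
y = torch.randn(2, 8, 4, 4, 4)
print(y.is_contiguous(memory_format=torch.channels_last_3d))  # False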

test/test_nn.py

Lines changed: 5 additions & 5 deletions

@@ -10269,16 +10269,16 @@ def test_sync_batchnorm_accuracy_cuda(self):
         # fwd: torch.batch_norm_stats, torch.batch_norm_gather_stats_with_counts, torch.batch_norm_elemt
         # bwd: torch.batch_norm_backward_reduce, torch.batch_norm_backward_elemt
 
-        def _batch_norm_stats(data):
+        def _batch_norm_stats(data, memory_format, mean_axes):
             mean1, _ = torch.batch_norm_stats(data, 1e-5)
-            mean2, _ = torch.batch_norm_stats(data.to(memory_format=torch.channels_last), 1e-5)
-            mean_ref = torch.mean(data, (0, 2, 3), keepdim=False)
+            mean2, _ = torch.batch_norm_stats(data.to(memory_format=memory_format), 1e-5)
+            mean_ref = torch.mean(data, mean_axes, keepdim=False)
 
             self.assertEqual(mean_ref, mean1)
             self.assertEqual(mean_ref, mean2)
 
-        data = torch.randn(1, 96, 112, 112, dtype=torch.float, device='cuda')
-        _batch_norm_stats(data)
+        _batch_norm_stats(torch.randn(1, 96, 112, 112, dtype=torch.float, device='cuda'), torch.channels_last, (0, 2, 3))
+        _batch_norm_stats(torch.randn(1, 96, 112, 112, 112, dtype=torch.float, device='cuda'), torch.channels_last_3d, (0, 2, 3, 4))
 
     def test_flatten(self):
         tensor_input = torch.randn(2, 1, 2, 3)
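The updated test folds the 2-D and new 3-D cases into one helper: per-channel statistics must agree regardless of memory format. Outside the test harness, roughly the same check can be reproduced directly; a small sketch, assuming a CUDA device is available (the shape here is arbitrary):

import torch

data = torch.randn(1, 16, 8, 8, 8, dtype=torch.float, device='cuda')  # NCDHW
mean1, invstd1 = torch.batch_norm_stats(data, 1e-5)
mean2, invstd2 = torch.batch_norm_stats(data.to(memory_format=torch.channels_last_3d), 1e-5)

# Per-channel means from both layouts should match a plain reduction over N, D, H, W.
mean_ref = torch.mean(data, (0, 2, 3, 4), keepdim=False)
print(torch.allclose(mean_ref, mean1, atol=1e-5), torch.allclose(mean_ref, mean2, atol=1e-5))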

torch/nn/modules/_functions.py

Lines changed: 2 additions & 2 deletions

@@ -7,7 +7,7 @@ class SyncBatchNorm(Function):
 
     @staticmethod
     def forward(self, input, weight, bias, running_mean, running_var, eps, momentum, process_group, world_size):
-        if not input.is_contiguous(memory_format=torch.channels_last):
+        if not (input.is_contiguous(memory_format=torch.channels_last) or input.is_contiguous(memory_format=torch.channels_last_3d)):
             input = input.contiguous()
         if weight is not None:
             weight = weight.contiguous()
@@ -104,7 +104,7 @@ def forward(self, input, weight, bias, running_mean, running_var, eps, momentum,
 
     @staticmethod
     def backward(self, grad_output):
-        if not grad_output.is_contiguous(memory_format=torch.channels_last):
+        if not (grad_output.is_contiguous(memory_format=torch.channels_last) or grad_output.is_contiguous(memory_format=torch.channels_last_3d)):
             grad_output = grad_output.contiguous()
         saved_input, weight, mean, invstd, count_tensor = self.saved_tensors
         grad_input = grad_weight = grad_bias = None
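torch.nn.modules._functions.SyncBatchNorm is the autograd Function behind nn.SyncBatchNorm, so the practical effect of this change is that 5-D channels_last_3d inputs (and gradients) are no longer forced back to a default-contiguous layout on entry to forward and backward. A rough end-to-end sketch of the user-facing path, assuming NCCL and one GPU; with a single-process group (world size 1) the layer falls back to local batch norm, and multiple ranks are needed to exercise the synchronized path:

import os
import torch
import torch.distributed as dist
import torch.nn as nn

# Minimal single-process group so SyncBatchNorm has a process group to query.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("nccl", rank=0, world_size=1)

# Convert the BatchNorm3d layer of a small 3-D conv model to SyncBatchNorm.
model = nn.Sequential(nn.Conv3d(3, 8, kernel_size=3, padding=1), nn.BatchNorm3d(8))
model = nn.SyncBatchNorm.convert_sync_batchnorm(model).cuda()

# NDHWC input; with this change the sync batch-norm path can keep this layout
# instead of calling .contiguous() on it.
x = torch.randn(2, 3, 8, 8, 8, device="cuda").to(memory_format=torch.channels_last_3d)
out = model(x)
out.sum().backward()

dist.destroy_process_group()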

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 48 additions & 0 deletions

@@ -5319,6 +5319,54 @@ def test_post_localSGD_optimizer_step_reload(self):
             tmp_file
         )
 
+    @sandcastle_skip_if(
+        BACKEND not in DistTestCases.backend_feature["ddp"],
+        f"The {BACKEND} backend does not support DistributedDataParallel"
+    )
+    @skip_if_no_gpu
+    def test_DistributedDataParallel_SyncBatchNorm_Channels_Last_3D(self):
+        group, group_id, rank = self._init_global_test()
+        num_processes = dist.get_world_size()
+        local_bs = 2
+        bs_offset = int(rank * 2)
+        global_bs = int(num_processes * 2)
+
+        model = ONLY_SBN_NET
+        model_gpu = copy.deepcopy(model).cuda(rank)
+        model_DDP = nn.parallel.DistributedDataParallel(
+            model_gpu, device_ids=[rank]
+        )
+
+        memory_format = torch.channels_last_3d
+        input_gpu = (
+            torch.randn(global_bs, 2, 4, 4, 4, dtype=torch.float)
+            .cuda(rank)
+            .to(memory_format=memory_format)
+        )
+        target_gpu = (
+            torch.randn(global_bs, 2, 4, 4, 4, dtype=torch.float)
+            .cuda(rank)
+            .to(memory_format=memory_format)
+        )
+        loss = nn.MSELoss()
+
+        # check two model parameters over 5 iterations
+        self._test_DDP_niter(
+            model_gpu,
+            model_DDP,
+            input_gpu,
+            target_gpu,
+            loss,
+            local_bs,
+            rank,
+            global_bs,
+            True,
+            bs_offset,
+            dist.get_world_size(),
+            memory_format=memory_format,
+        )
+        self._barrier()
+
     @sandcastle_skip_if(
         BACKEND not in DistTestCases.backend_feature["ddp"],
         f"The {BACKEND} backend does not support DistributedDataParallel"
