add depthwise conv support for mkldnn (#8782)

mingfeima · facebook-github-bot · commit e7f49d144433 · 2018-07-15T17:40:55.000-07:00
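Summary: Change-Id: I3836dacc63afc1b5e31b1d706bba6bb13699ba41 Removes the `groups == 1` restriction from the MKLDNN convolution path and passes `groups` through the MKLDNN forward and backward convolution functions. This is beneficial for depthwise convolution on CPU, as used in models such as MobileNet. Pull Request resolved: #8782 Reviewed By: SsnL Differential Revision: D8790869 Pulled By: ezyang fbshipit-source-id: 29f410763ce403c2438fc527aa354ff02e1829bf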
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
@@ -124,8 +124,7 @@ auto ConvParams::use_mkldnn(const at::Tensor& input) const -> bool {
          input.type().scalarType() == kFloat && // only on CPU Float Tensors
          !is_dilated() && // doesn't support dilation
          !transposed && // or transposed tensors
-         input.ndimension() == 4 && // must be in NCHW format
-         groups == 1;
+         input.ndimension() == 4; // must be in NCHW format
 #endif
   return false;
 }
@@ -369,7 +368,7 @@ at::Tensor _convolution(
       throw std::runtime_error(ss.str());
     }
 
-    output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation);
+    output = at::mkldnn_convolution(input, weight, bias, params.padding, params.stride, params.dilation, params.groups);
 #endif
   } else {
     if (params.groups == 1) {
diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp
@@ -8,25 +8,25 @@ namespace at { namespace native {
 
 at::Tensor mkldnn_convolution(
     const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
-    IntList padding, IntList stride, IntList dilation) {
+    IntList padding, IntList stride, IntList dilation, int64_t groups) {
   throw std::runtime_error("mkldnn_convolution_forward: ATen not compiled with MKLDNN support");
 }
 
 at::Tensor mkldnn_convolution_backward_input(
     IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight,
-    IntList padding, IntList stride, IntList dilation, bool bias_defined) {
+    IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) {
   throw std::runtime_error("mkldnn_convolution_backward_input: ATen not compiled with MKLDNN support");
 }
 
 std::tuple<at::Tensor,at::Tensor> mkldnn_convolution_backward_weights(
     IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input,
-    IntList padding, IntList stride, IntList dilation, bool bias_defined) {
+    IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) {
   throw std::runtime_error("mkldnn_convolution_backward_weights: ATen not compiled with MKLDNN support");
 }
 
 std::tuple<at::Tensor,at::Tensor,at::Tensor> mkldnn_convolution_backward(
     const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
-    IntList padding, IntList stride, IntList dilation, std::array<bool,3> output_mask) {
+    IntList padding, IntList stride, IntList dilation, int64_t groups, std::array<bool,3> output_mask) {
   throw std::runtime_error("mkldnn_convolution_backward: ATen not compiled with MKLDNN support");
 }
 
@@ -52,7 +52,7 @@ constexpr int max_dim = 3;
 
 std::vector<int64_t> conv_output_size(
     IntList input_size, IntList weight_size,
-    IntList padding, IntList stride, IntList dilation)
+    IntList padding, IntList stride, IntList dilation, int64_t groups)
 {
   auto dim = input_size.size();
   std::vector<int64_t> output_size(dim);
@@ -68,12 +68,14 @@ std::vector<int64_t> conv_output_size(
 
 at::Tensor mkldnn_convolution(
     const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
-    IntList padding, IntList stride, IntList dilation)
+    IntList padding, IntList stride, IntList dilation, int64_t groups)
 {
   auto output = input.type().tensor(conv_output_size(
-    input.sizes(), weight.sizes(), padding, stride, dilation));
+    input.sizes(), weight.sizes(), padding, stride, dilation, groups));
 
   auto cpu_engine = CpuEngine::Instance().get_engine();
+
+  int32_t g = groups;
   
   int32_t n = input.size(0);
   int32_t ic = input.size(1);
@@ -95,11 +97,11 @@ at::Tensor mkldnn_convolution(
   auto data_t = memory::data_type::f32;
   auto format_any = memory::format::any;
   auto format_nchw = memory::format::nchw;
-  auto format_oihw = memory::format::oihw;
+  auto format_weight = (g!= 1) ? memory::format::goihw : memory::format::oihw;
   auto format_x = memory::format::x;
 
   memory::dims input_tz = {n, ic, ih, iw};
-  memory::dims weight_tz = {oc, ic, kh, kw};
+  memory::dims weight_tz = (g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw};
   memory::dims bias_tz = {oc};
   memory::dims output_tz = {n, oc, oh, ow};
   memory::dims _stride = {sh, sw};
@@ -127,7 +129,7 @@ at::Tensor mkldnn_convolution(
 
   auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
     input.data_ptr());
-  auto weight_usr_memory = memory({{{weight_tz}, data_t,  format_oihw}, cpu_engine},
+  auto weight_usr_memory = memory({{{weight_tz}, data_t,  format_weight}, cpu_engine},
     weight.data_ptr());
   auto output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
     output.data_ptr());
@@ -178,12 +180,14 @@ at::Tensor mkldnn_convolution(
 
 Tensor mkldnn_convolution_backward_input(
     IntList input_size, const at::Tensor& grad_output, const at::Tensor& weight,
-    IntList padding, IntList stride, IntList dilation, bool bias_defined)
+    IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined)
 {
   auto grad_input = grad_output.type().tensor(input_size);
 
   auto cpu_engine = CpuEngine::Instance().get_engine();
 
+  int32_t g = groups;
+
   int32_t n = grad_input.size(0);
   int32_t ic = grad_input.size(1);
   int32_t ih = grad_input.size(2);
@@ -204,10 +208,10 @@ Tensor mkldnn_convolution_backward_input(
   auto data_t = memory::data_type::f32;
   auto format_any = memory::format::any;
   auto format_nchw = memory::format::nchw;
-  auto format_oihw = memory::format::oihw;
+  auto format_weight = (g!= 1) ? memory::format::goihw : memory::format::oihw;
 
   memory::dims input_tz = {n, ic, ih, iw};
-  memory::dims weight_tz = {oc, ic, kh, kw};
+  memory::dims weight_tz = (g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw};
   memory::dims bias_tz = {oc};
   memory::dims output_tz = {n, oc, oh, ow};
   memory::dims _stride = {sh, sw};
@@ -245,7 +249,7 @@ Tensor mkldnn_convolution_backward_input(
 
   auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
     grad_output.data_ptr());
-  auto weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
+  auto weight_usr_memory = memory({{{weight_tz}, data_t, format_weight}, cpu_engine},
     weight.data_ptr());
   auto grad_input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
     grad_input.data_ptr());
@@ -288,7 +292,7 @@ Tensor mkldnn_convolution_backward_input(
 
 std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
     IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input,
-    IntList padding, IntList stride, IntList dilation, bool bias_defined)
+    IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined)
 {
   auto grad_weight = grad_output.type().tensor(weight_size);
 
@@ -299,6 +303,8 @@ std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
 
   auto cpu_engine = CpuEngine::Instance().get_engine();
 
+  int32_t g = groups;
+
   int32_t n = input.size(0);
   int32_t ic = input.size(1);
   int32_t ih = input.size(2);
@@ -319,11 +325,11 @@ std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
   auto data_t = memory::data_type::f32;
   auto format_any = memory::format::any;
   auto format_nchw = memory::format::nchw;
-  auto format_oihw = memory::format::oihw;
+  auto format_weight = (g!= 1) ? memory::format::goihw : memory::format::oihw;
   auto format_x = memory::format::x;
 
   memory::dims input_tz = {n, ic, ih, iw};
-  memory::dims weight_tz = {oc, ic, kh, kw};
+  memory::dims weight_tz = (g!= 1) ? memory::dims{g, oc/g, ic/g, kh, kw} : memory::dims{oc, ic, kh, kw};
   memory::dims bias_tz = {oc};
   memory::dims output_tz = {n, oc, oh, ow};
   memory::dims _stride = {sh, sw};
@@ -369,7 +375,7 @@ std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
     input.data_ptr());
   auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
     grad_output.data_ptr());
-  auto grad_weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
+  auto grad_weight_usr_memory = memory({{{weight_tz}, data_t, format_weight}, cpu_engine},
     grad_weight.data_ptr());
   std::shared_ptr<memory> grad_bias_memory;
 
@@ -419,18 +425,18 @@ std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
 
 std::tuple<at::Tensor,at::Tensor,at::Tensor> mkldnn_convolution_backward(
     const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
-    IntList padding, IntList stride, IntList dilation, std::array<bool,3> output_mask)
+    IntList padding, IntList stride, IntList dilation, int64_t groups, std::array<bool,3> output_mask)
 {
   Tensor grad_output = grad_output_t.contiguous();
 
   Tensor grad_input, grad_weight, grad_bias;
   if (output_mask[0]) {
     grad_input = at::mkldnn_convolution_backward_input(
-      input.sizes(), grad_output, weight, padding, stride, dilation, output_mask[2]);
+      input.sizes(), grad_output, weight, padding, stride, dilation, groups, output_mask[2]);
   }
   if (output_mask[1] || output_mask[2]) {
     std::tie(grad_weight, grad_bias) = at::mkldnn_convolution_backward_weights(
-      weight.sizes(), grad_output, input, padding, stride, dilation, output_mask[2]);
+      weight.sizes(), grad_output, input, padding, stride, dilation, groups, output_mask[2]);
   }
 
   return std::tuple<Tensor, Tensor, Tensor>{grad_input, grad_weight, grad_bias};
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
@@ -913,16 +913,16 @@
 
 - func: min_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor
 
-- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList stride, IntList dilation) -> Tensor
+- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList stride, IntList dilation, int64_t groups) -> Tensor
   variants: function
 
-- func: mkldnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, bool bias_defined) -> Tensor
+- func: mkldnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) -> Tensor
   variants: function
 
-- func: mkldnn_convolution_backward_weights(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, bool bias_defined) -> (Tensor, Tensor)
+- func: mkldnn_convolution_backward_weights(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) -> (Tensor, Tensor)
   variants: function
 
-- func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, std::array<bool,3> output_mask) -> (Tensor, Tensor, Tensor)
+- func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, std::array<bool,3> output_mask) -> (Tensor, Tensor, Tensor)
   variants: function
 
 - func: mm(Tensor self, Tensor mat2) -> Tensor
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
@@ -1209,8 +1209,8 @@
   input, hx, cx, weight: "_cudnn_rnn_backward(input, weight, weight_stride0, result4, hx, cx, result0, grads[0], grads[1], grads[2], mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, retain_variables ? result3.clone() : result3, grad_input_mask)"
 
 # mkldnn
-- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor bias, IntList padding, IntList stride, IntList dilation)
-  self, weight, bias: mkldnn_convolution_backward(self, grad, weight, padding, stride, dilation, grad_input_mask)
+- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor bias, IntList padding, IntList stride, IntList dilation, int64_t groups)
+  self, weight, bias: mkldnn_convolution_backward(self, grad, weight, padding, stride, dilation, groups, grad_input_mask)
 
 # fft
 - name: _fft_with_size(Tensor self, int64_t signal_ndim, bool complex_input, bool complex_output, bool inverse, IntList checked_signal_sizes, bool normalized, bool onesided, IntList output_sizes)