MAINT: speed up istft by using col2im (the original python code used … (#42826)

mattip · facebook-github-bot · commit 77bd4d3426b5 · 2020-08-12T08:39:12.000-07:00
Summary: Fixes #42213. The [original Python code](https://github.com/pytorch/audio/blob/v0.5.0/torchaudio/functional.py#L178) from `torchaudio` was converted to a native function, but the native version used `eye` to allocate an identity Tensor as the kernel for `at::conv_transpose1d`, and was much slower. Using `at::col2im` (which is the equivalent of `torch.nn.functional.fold`) instead solved the slowdown. Pull Request resolved: #42826 Reviewed By: smessmer Differential Revision: D23043673 Pulled By: mthrok fbshipit-source-id: 3f5d0779a87379b002340ea19c9ae5042a43e94e
diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp
@@ -367,16 +367,22 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
   Tensor y_tmp = input * window_tmp.view({1, 1, n_fft});  // size: (channel, n_frames, n_fft)
   y_tmp = y_tmp.transpose(1, 2);  // size: (channel, n_fft, frame)
 
-  const Tensor eye = at::native::eye(n_fft, options).unsqueeze(1);
-  Tensor y = at::conv_transpose1d(y_tmp, eye,
-                                  /*bias*/ Tensor(),
-                                  /*stride*/ {hop_length,},
-                                  /*padding*/{0,});  // size: (channel, n_frames, n_fft)
+  Tensor y = at::col2im(y_tmp,
+                                  /*output_size*/ {1, (n_frames - 1) * hop_length + n_fft},
+                                  /*kernel_size*/ {1, n_fft},
+                                  /*dilation*/    {1, 1},
+                                  /*padding*/     {0, 0},
+                                  /*stride*/      {1, hop_length}
+                                 ).squeeze(2);
   window_tmp = window_tmp.pow(2).view({n_fft, 1}).repeat({1, n_frames}).unsqueeze(0);  // size: (1, n_fft, n_frames)
-  Tensor window_envelop = at::conv_transpose1d(window_tmp, eye,
-                                               /*bias*/ Tensor(),
-                                               /*stride*/ {hop_length, },
-                                               /*padding*/{0, });  // size: (1, 1, expected_output_signal_len)
+  Tensor window_envelop = at::col2im(window_tmp,
+                                  /*output_size*/ {1, (n_frames - 1) * hop_length + n_fft},
+                                  /*kernel_size*/ {1, n_fft},
+                                  /*dilation*/    {1, 1},
+                                  /*padding*/     {0, 0},
+                                  /*stride*/      {1, hop_length}
+                                 ).squeeze(2); // size: (1, 1, expected_output_signal_len)
+
   TORCH_INTERNAL_ASSERT(expected_output_signal_len == y.size(2));
   TORCH_INTERNAL_ASSERT(expected_output_signal_len == window_envelop.size(2));