
Commit 965b97f

pbelevich authored and facebook-github-bot committed
Bidirectional GRU and LSTM C++ API forward fix (#22850)
Summary: Fixing #17998

Pull Request resolved: #22850
Differential Revision: D16420854
Pulled By: pbelevich
fbshipit-source-id: 76f38be40d8479fb9cafba92939cea61d81fd336
1 parent e5797e9 commit 965b97f
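
For context: per the diff below, the default zero state for a bidirectional module was created with shape {layers, batch, hidden_size}, missing the num_directions factor, so a plain forward() call on a bidirectional GRU or LSTM did not work through the C++ API. Below is a minimal sketch of the now-working call; it is an illustration under the C++ frontend API of this commit's era (GRU, GRUOptions, and an RNNOutput struct with .output and .state, as used in the tests below), not code from the commit.

#include <iostream>
#include <torch/torch.h>

int main() {
  // Bidirectional single-layer GRU; input is (seq_len, batch, input_size).
  torch::nn::GRU gru(torch::nn::GRUOptions(1, 1).layers(1).bidirectional(true));
  auto input = torch::randn({5, 1, 1});
  // No explicit state: after this fix the module zero-initializes it with
  // shape {layers * num_directions, batch, hidden_size} = {2, 1, 1}.
  auto out = gru->forward(input);
  std::cout << out.output.sizes() << "\n";  // [5, 1, 2]: both directions concatenated
  std::cout << out.state.sizes() << "\n";   // [2, 1, 1]
  return 0;
}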

File tree

2 files changed (+244, -2 lines)


test/cpp/api/rnn.cpp

Lines changed: 227 additions & 0 deletions
@@ -5,6 +5,7 @@
 #include <torch/optim/adam.h>
 #include <torch/types.h>
 #include <torch/utils.h>
+#include <ATen/core/grad_mode.h>
 
 #include <test/cpp/api/support.h>

@@ -247,3 +248,229 @@ TEST_F(RNNTest, BidirectionalFlattenParameters) {
   GRU gru(GRUOptions(100, 256).layers(2).bidirectional(true));
   gru->flatten_parameters();
 }
+
+template <typename Impl>
+void copyParameters(torch::nn::ModuleHolder<Impl>& target, size_t t_i,
+                    const torch::nn::ModuleHolder<Impl>& source, size_t s_i) {
+  at::NoGradGuard guard;
+  target->w_ih[t_i].copy_(source->w_ih[s_i]);
+  target->w_hh[t_i].copy_(source->w_hh[s_i]);
+  target->b_ih[t_i].copy_(source->b_ih[s_i]);
+  target->b_hh[t_i].copy_(source->b_hh[s_i]);
+}
+
+// This test is a port of Python code introduced here:
+// https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66
+// Reverse forward of a bidirectional GRU should act
+// as regular forward of a unidirectional GRU.
+void BidirectionalGRUReverseForward(bool cuda) {
+  auto opt = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false)
+      .device(cuda ? torch::kCUDA : torch::kCPU);
+  auto input = torch::tensor({1, 2, 3, 4, 5}, opt).reshape({5, 1, 1});
+  auto input_reversed = torch::tensor({5, 4, 3, 2, 1}, opt).reshape({5, 1, 1});
+
+  auto gru_options = GRUOptions(1, 1).layers(1).batch_first(false);
+  GRU bi_grus {gru_options.bidirectional(true)};
+  GRU reverse_gru {gru_options.bidirectional(false)};
+
+  if (cuda) {
+    bi_grus->to(torch::kCUDA);
+    reverse_gru->to(torch::kCUDA);
+  }
+
+  // Now make sure the weights of the reverse GRU layer match
+  // those of the (reversed) bidirectional one:
+  copyParameters(reverse_gru, 0, bi_grus, 1);
+
+  auto bi_output = bi_grus->forward(input);
+  auto reverse_output = reverse_gru->forward(input_reversed);
+
+  if (cuda) {
+    bi_output.output = bi_output.output.to(torch::kCPU);
+    bi_output.state = bi_output.state.to(torch::kCPU);
+    reverse_output.output = reverse_output.output.to(torch::kCPU);
+    reverse_output.state = reverse_output.state.to(torch::kCPU);
+  }
+
+  ASSERT_EQ(bi_output.output.size(0), reverse_output.output.size(0));
+  auto size = bi_output.output.size(0);
+  for (int i = 0; i < size; i++) {
+    ASSERT_EQ(bi_output.output[i][0][1].item<float>(),
+              reverse_output.output[size - 1 - i][0][0].item<float>());
+  }
+  // The hidden state of the reversed GRU sits
+  // at the odd indices in the first dimension.
+  ASSERT_EQ(bi_output.state[1][0][0].item<float>(),
+            reverse_output.state[0][0][0].item<float>());
+}
+
+TEST_F(RNNTest, BidirectionalGRUReverseForward) {
+  BidirectionalGRUReverseForward(false);
+}
+
+TEST_F(RNNTest, BidirectionalGRUReverseForward_CUDA) {
+  BidirectionalGRUReverseForward(true);
+}
+
+// Reverse forward of a bidirectional LSTM should act
+// as regular forward of a unidirectional LSTM.
+void BidirectionalLSTMReverseForwardTest(bool cuda) {
+  auto opt = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false)
+      .device(cuda ? torch::kCUDA : torch::kCPU);
+  auto input = torch::tensor({1, 2, 3, 4, 5}, opt).reshape({5, 1, 1});
+  auto input_reversed = torch::tensor({5, 4, 3, 2, 1}, opt).reshape({5, 1, 1});
+
+  auto lstm_opt = LSTMOptions(1, 1).layers(1).batch_first(false);
+
+  LSTM bi_lstm {lstm_opt.bidirectional(true)};
+  LSTM reverse_lstm {lstm_opt.bidirectional(false)};
+
+  if (cuda) {
+    bi_lstm->to(torch::kCUDA);
+    reverse_lstm->to(torch::kCUDA);
+  }
+
+  // Now make sure the weights of the reverse LSTM layer match
+  // those of the (reversed) bidirectional one:
+  copyParameters(reverse_lstm, 0, bi_lstm, 1);
+
+  auto bi_output = bi_lstm->forward(input);
+  auto reverse_output = reverse_lstm->forward(input_reversed);
+
+  if (cuda) {
+    bi_output.output = bi_output.output.to(torch::kCPU);
+    bi_output.state = bi_output.state.to(torch::kCPU);
+    reverse_output.output = reverse_output.output.to(torch::kCPU);
+    reverse_output.state = reverse_output.state.to(torch::kCPU);
+  }
+
+  ASSERT_EQ(bi_output.output.size(0), reverse_output.output.size(0));
+  auto size = bi_output.output.size(0);
+  for (int i = 0; i < size; i++) {
+    ASSERT_EQ(bi_output.output[i][0][1].item<float>(),
+              reverse_output.output[size - 1 - i][0][0].item<float>());
+  }
+  // The hidden and cell states of the reversed LSTM sit
+  // at the odd indices in the second dimension.
+  ASSERT_EQ(bi_output.state[0][1][0][0].item<float>(),
+            reverse_output.state[0][0][0][0].item<float>());
+  ASSERT_EQ(bi_output.state[1][1][0][0].item<float>(),
+            reverse_output.state[1][0][0][0].item<float>());
+}
+
+TEST_F(RNNTest, BidirectionalLSTMReverseForward) {
+  BidirectionalLSTMReverseForwardTest(false);
+}
+
+TEST_F(RNNTest, BidirectionalLSTMReverseForward_CUDA) {
+  BidirectionalLSTMReverseForwardTest(true);
+}
+
+TEST_F(RNNTest, BidirectionalMultilayerGRU_CPU_vs_CUDA) {
+  // Create two GRUs with the same options
+  auto opt = GRUOptions(2, 4).layers(3).batch_first(false).bidirectional(true);
+  GRU gru_cpu {opt};
+  GRU gru_cuda {opt};
+
+  // Copy weights and biases from the CPU GRU to the CUDA GRU
+  {
+    at::NoGradGuard guard;
+    const auto num_directions = gru_cpu->options.bidirectional_ ? 2 : 1;
+    for (int64_t layer = 0; layer < gru_cpu->options.layers_; layer++) {
+      for (auto direction = 0; direction < num_directions; direction++) {
+        const auto layer_idx = (layer * num_directions) + direction;
+        copyParameters(gru_cuda, layer_idx, gru_cpu, layer_idx);
+      }
+    }
+  }
+
+  gru_cpu->flatten_parameters();
+  gru_cuda->flatten_parameters();
+
+  // Move one GRU to CUDA
+  gru_cuda->to(torch::kCUDA);
+
+  // Create the same inputs
+  auto input_opt = torch::TensorOptions()
+      .dtype(torch::kFloat32).requires_grad(false);
+  auto input_cpu = torch::tensor({1, 2, 3, 4, 5, 6}, input_opt)
+      .reshape({3, 1, 2});
+  auto input_cuda = torch::tensor({1, 2, 3, 4, 5, 6}, input_opt)
+      .reshape({3, 1, 2}).to(torch::kCUDA);
+
+  // Call forward on both GRUs
+  auto output_cpu = gru_cpu->forward(input_cpu);
+  auto output_cuda = gru_cuda->forward(input_cuda);
+
+  output_cpu.output = output_cpu.output.to(torch::kCPU);
+  output_cpu.state = output_cpu.state.to(torch::kCPU);
+
+  // Assert that the output and state are equal on CPU and CUDA
+  ASSERT_EQ(output_cpu.output.dim(), output_cuda.output.dim());
+  for (int i = 0; i < output_cpu.output.dim(); i++) {
+    ASSERT_EQ(output_cpu.output.size(i), output_cuda.output.size(i));
+  }
+  for (int i = 0; i < output_cpu.output.size(0); i++) {
+    for (int j = 0; j < output_cpu.output.size(1); j++) {
+      for (int k = 0; k < output_cpu.output.size(2); k++) {
+        ASSERT_NEAR(
+            output_cpu.output[i][j][k].item<float>(),
+            output_cuda.output[i][j][k].item<float>(), 1e-5);
+      }
+    }
+  }
+}
+
+TEST_F(RNNTest, BidirectionalMultilayerLSTM_CPU_vs_CUDA) {
+  // Create two LSTMs with the same options
+  auto opt = LSTMOptions(2, 4).layers(3).batch_first(false).bidirectional(true);
+  LSTM lstm_cpu {opt};
+  LSTM lstm_cuda {opt};
+
+  // Copy weights and biases from the CPU LSTM to the CUDA LSTM
+  {
+    at::NoGradGuard guard;
+    const auto num_directions = lstm_cpu->options.bidirectional_ ? 2 : 1;
+    for (int64_t layer = 0; layer < lstm_cpu->options.layers_; layer++) {
+      for (auto direction = 0; direction < num_directions; direction++) {
+        const auto layer_idx = (layer * num_directions) + direction;
+        copyParameters(lstm_cuda, layer_idx, lstm_cpu, layer_idx);
+      }
+    }
+  }
+
+  lstm_cpu->flatten_parameters();
+  lstm_cuda->flatten_parameters();
+
+  // Move one LSTM to CUDA
+  lstm_cuda->to(torch::kCUDA);
+
+  auto options = torch::TensorOptions()
+      .dtype(torch::kFloat32).requires_grad(false);
+  auto input_cpu = torch::tensor({1, 2, 3, 4, 5, 6}, options)
+      .reshape({3, 1, 2});
+  auto input_cuda = torch::tensor({1, 2, 3, 4, 5, 6}, options)
+      .reshape({3, 1, 2}).to(torch::kCUDA);
+
+  // Call forward on both LSTMs
+  auto output_cpu = lstm_cpu->forward(input_cpu);
+  auto output_cuda = lstm_cuda->forward(input_cuda);
+
+  output_cpu.output = output_cpu.output.to(torch::kCPU);
+  output_cpu.state = output_cpu.state.to(torch::kCPU);
+
+  // Assert that the output and state are equal on CPU and CUDA
+  ASSERT_EQ(output_cpu.output.dim(), output_cuda.output.dim());
+  for (int i = 0; i < output_cpu.output.dim(); i++) {
+    ASSERT_EQ(output_cpu.output.size(i), output_cuda.output.size(i));
+  }
+  for (int i = 0; i < output_cpu.output.size(0); i++) {
+    for (int j = 0; j < output_cpu.output.size(1); j++) {
+      for (int k = 0; k < output_cpu.output.size(2); k++) {
+        ASSERT_NEAR(
+            output_cpu.output[i][j][k].item<float>(),
+            output_cuda.output[i][j][k].item<float>(), 1e-5);
+      }
+    }
+  }
+}
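
An aside on the indexing convention the tests above rely on: a bidirectional output has shape (seq_len, batch, num_directions * hidden_size), with the reverse direction occupying the second half of the last dimension, which is why the loops compare bi_output.output[i][0][1] against the unidirectional output. A hypothetical helper (not part of this commit) that slices out that half:

#include <torch/torch.h>

// Slice the reverse-direction features out of a bidirectional RNN output.
// Assumes shape (seq_len, batch, 2 * hidden_size), as in the tests above.
torch::Tensor reverse_direction(const torch::Tensor& output, int64_t hidden_size) {
  return output.narrow(/*dim=*/2, /*start=*/hidden_size, /*length=*/hidden_size);
}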

torch/csrc/api/src/nn/modules/rnn.cpp

Lines changed: 17 additions & 2 deletions
@@ -102,6 +102,18 @@ void RNNImplBase<Derived>::to(torch::Dtype dtype, bool non_blocking) {
 template <typename Derived>
 void RNNImplBase<Derived>::to(torch::Device device, bool non_blocking) {
   nn::Module::to(device, non_blocking);
+  const auto num_directions = options.bidirectional_ ? 2 : 1;
+  for (int64_t layer = 0; layer < options.layers_; layer++) {
+    for (auto direction = 0; direction < num_directions; direction++) {
+      const auto layer_idx = (layer * num_directions) + direction;
+      w_ih[layer_idx] = w_ih[layer_idx].to(device, non_blocking);
+      w_hh[layer_idx] = w_hh[layer_idx].to(device, non_blocking);
+      if (options.with_bias_) {
+        b_ih[layer_idx] = b_ih[layer_idx].to(device, non_blocking);
+        b_hh[layer_idx] = b_hh[layer_idx].to(device, non_blocking);
+      }
+    }
+  }
   flatten_parameters();
 }

@@ -144,8 +156,10 @@ RNNOutput RNNImplBase<Derived>::generic_forward(
   if (!state.defined()) {
     // #layers, batch size, state size
    const auto batch_size = input.size(options.batch_first_ ? 0 : 1);
+    const auto num_directions = options.bidirectional_ ? 2 : 1;
     state = torch::zeros(
-        {options.layers_, batch_size, options.hidden_size_}, input.options());
+        {options.layers_ * num_directions, batch_size, options.hidden_size_},
+        input.options());
   }
   Tensor output, new_state;
   std::tie(output, new_state) = function(

@@ -269,8 +283,9 @@ RNNOutput LSTMImpl::forward(const Tensor& input, Tensor state) {
   if (!state.defined()) {
     // 2 for hidden state and cell state, then #layers, batch size, state size
     const auto batch_size = input.size(options.batch_first_ ? 0 : 1);
+    const auto num_directions = options.bidirectional_ ? 2 : 1;
     state = torch::zeros(
-        {2, options.layers_, batch_size, options.hidden_size_},
+        {2, options.layers_ * num_directions, batch_size, options.hidden_size_},
         input.options());
   }
   Tensor output, hidden_state, cell_state;
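
The substance of the fix is the shape arithmetic above. Below is a standalone sketch (not code from this commit) of the default-state shapes the patched code produces, using the geometry of the CPU-vs-CUDA tests (3 layers, bidirectional, batch 1, hidden size 4):

#include <iostream>
#include <torch/torch.h>

int main() {
  const int64_t layers = 3, batch_size = 1, hidden_size = 4;
  const int64_t num_directions = 2;  // bidirectional
  // GRU default state, per generic_forward() above:
  auto gru_state = torch::zeros({layers * num_directions, batch_size, hidden_size});
  // LSTM default state, per LSTMImpl::forward() above (hidden and cell stacked):
  auto lstm_state = torch::zeros({2, layers * num_directions, batch_size, hidden_size});
  std::cout << gru_state.sizes() << "\n";   // [6, 1, 4]
  std::cout << lstm_state.sizes() << "\n";  // [2, 6, 1, 4]
  return 0;
}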

0 commit comments
