
Commit 965b97f

pbelevich authored and facebook-github-bot committed
Bidirectional GRU and LSTM C++ API forward fix (#22850)
Summary: Fixing #17998

Pull Request resolved: #22850
Differential Revision: D16420854
Pulled By: pbelevich
fbshipit-source-id: 76f38be40d8479fb9cafba92939cea61d81fd336
1 parent e5797e9 commit 965b97f
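
For context: per the diff below, the default zero state for a bidirectional module was created with shape {layers, batch, hidden_size}, missing the num_directions factor, so a plain forward() call on a bidirectional GRU or LSTM did not work through the C++ API. Below is a minimal sketch of the now-working call; it is an illustration under the C++ frontend API of this commit's era (GRU, GRUOptions, and an RNNOutput struct with .output and .state, as used in the tests below), not code from the commit.

#include <iostream>
#include <torch/torch.h>

int main() {
  // Bidirectional single-layer GRU; input is (seq_len, batch, input_size).
  torch::nn::GRU gru(torch::nn::GRUOptions(1, 1).layers(1).bidirectional(true));
  auto input = torch::randn({5, 1, 1});
  // No explicit state: after this fix the module zero-initializes it with
  // shape {layers * num_directions, batch, hidden_size} = {2, 1, 1}.
  auto out = gru->forward(input);
  std::cout << out.output.sizes() << "\n";  // [5, 1, 2]: both directions concatenated
  std::cout << out.state.sizes() << "\n";   // [2, 1, 1]
  return 0;
}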

File tree

2 files changed (+244, -2 lines)


test/cpp/api/rnn.cpp

Lines changed: 227 additions & 0 deletions
@@ -5,6 +5,7 @@
 #include <torch/optim/adam.h>
 #include <torch/types.h>
 #include <torch/utils.h>
+#include <ATen/core/grad_mode.h>
 
 #include <test/cpp/api/support.h>

@@ -247,3 +248,229 @@ TEST_F(RNNTest, BidirectionalFlattenParameters) {
   GRU gru(GRUOptions(100, 256).layers(2).bidirectional(true));
   gru->flatten_parameters();
 }
+
+template <typename Impl>
+void copyParameters(torch::nn::ModuleHolder<Impl>& target, size_t t_i,
+                    const torch::nn::ModuleHolder<Impl>& source, size_t s_i) {
+  at::NoGradGuard guard;
+  target->w_ih[t_i].copy_(source->w_ih[s_i]);
+  target->w_hh[t_i].copy_(source->w_hh[s_i]);
+  target->b_ih[t_i].copy_(source->b_ih[s_i]);
+  target->b_hh[t_i].copy_(source->b_hh[s_i]);
+}
+
+// This test is a port of Python code introduced here:
+// https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66
+// Reverse forward of a bidirectional GRU should act
+// as regular forward of a unidirectional GRU.
+void BidirectionalGRUReverseForward(bool cuda) {
+  auto opt = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false)
+      .device(cuda ? torch::kCUDA : torch::kCPU);
+  auto input = torch::tensor({1, 2, 3, 4, 5}, opt).reshape({5, 1, 1});
+  auto input_reversed = torch::tensor({5, 4, 3, 2, 1}, opt).reshape({5, 1, 1});
+
+  auto gru_options = GRUOptions(1, 1).layers(1).batch_first(false);
+  GRU bi_grus {gru_options.bidirectional(true)};
+  GRU reverse_gru {gru_options.bidirectional(false)};
+
+  if (cuda) {
+    bi_grus->to(torch::kCUDA);
+    reverse_gru->to(torch::kCUDA);
+  }
+
+  // Now make sure the weights of the reverse GRU layer match
+  // those of the (reversed) bidirectional one:
+  copyParameters(reverse_gru, 0, bi_grus, 1);
+
+  auto bi_output = bi_grus->forward(input);
+  auto reverse_output = reverse_gru->forward(input_reversed);
+
+  if (cuda) {
+    bi_output.output = bi_output.output.to(torch::kCPU);
+    bi_output.state = bi_output.state.to(torch::kCPU);
+    reverse_output.output = reverse_output.output.to(torch::kCPU);
+    reverse_output.state = reverse_output.state.to(torch::kCPU);
+  }
+
+  ASSERT_EQ(bi_output.output.size(0), reverse_output.output.size(0));
+  auto size = bi_output.output.size(0);
+  for (int i = 0; i < size; i++) {
+    ASSERT_EQ(bi_output.output[i][0][1].item<float>(),
+              reverse_output.output[size - 1 - i][0][0].item<float>());
+  }
+  // The hidden state of the reversed GRU sits
+  // at the odd indices in the first dimension.
+  ASSERT_EQ(bi_output.state[1][0][0].item<float>(),
+            reverse_output.state[0][0][0].item<float>());
+}
+
+TEST_F(RNNTest, BidirectionalGRUReverseForward) {
+  BidirectionalGRUReverseForward(false);
+}
+
+TEST_F(RNNTest, BidirectionalGRUReverseForward_CUDA) {
+  BidirectionalGRUReverseForward(true);
+}
+
+// Reverse forward of a bidirectional LSTM should act
+// as regular forward of a unidirectional LSTM.
+void BidirectionalLSTMReverseForwardTest(bool cuda) {
+  auto opt = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false)
+      .device(cuda ? torch::kCUDA : torch::kCPU);
+  auto input = torch::tensor({1, 2, 3, 4, 5}, opt).reshape({5, 1, 1});
+  auto input_reversed = torch::tensor({5, 4, 3, 2, 1}, opt).reshape({5, 1, 1});
+
+  auto lstm_opt = LSTMOptions(1, 1).layers(1).batch_first(false);
+
+  LSTM bi_lstm {lstm_opt.bidirectional(true)};
+  LSTM reverse_lstm {lstm_opt.bidirectional(false)};
+
+  if (cuda) {
+    bi_lstm->to(torch::kCUDA);
+    reverse_lstm->to(torch::kCUDA);
+  }
+
+  // Now make sure the weights of the reverse LSTM layer match
+  // those of the (reversed) bidirectional one:
+  copyParameters(reverse_lstm, 0, bi_lstm, 1);
+
+  auto bi_output = bi_lstm->forward(input);
+  auto reverse_output = reverse_lstm->forward(input_reversed);
+
+  if (cuda) {
+    bi_output.output = bi_output.output.to(torch::kCPU);
+    bi_output.state = bi_output.state.to(torch::kCPU);
+    reverse_output.output = reverse_output.output.to(torch::kCPU);
+    reverse_output.state = reverse_output.state.to(torch::kCPU);
+  }
+
+  ASSERT_EQ(bi_output.output.size(0), reverse_output.output.size(0));
+  auto size = bi_output.output.size(0);
+  for (int i = 0; i < size; i++) {
+    ASSERT_EQ(bi_output.output[i][0][1].item<float>(),
+              reverse_output.output[size - 1 - i][0][0].item<float>());
+  }
+  // The hidden and cell states of the reversed LSTM sit
+  // at the odd indices in the second dimension.
+  ASSERT_EQ(bi_output.state[0][1][0][0].item<float>(),
+            reverse_output.state[0][0][0][0].item<float>());
+  ASSERT_EQ(bi_output.state[1][1][0][0].item<float>(),
+            reverse_output.state[1][0][0][0].item<float>());
+}
+
+TEST_F(RNNTest, BidirectionalLSTMReverseForward) {
+  BidirectionalLSTMReverseForwardTest(false);
+}
+
+TEST_F(RNNTest, BidirectionalLSTMReverseForward_CUDA) {
+  BidirectionalLSTMReverseForwardTest(true);
+}
+
+TEST_F(RNNTest, BidirectionalMultilayerGRU_CPU_vs_CUDA) {
+  // Create two GRUs with the same options
+  auto opt = GRUOptions(2, 4).layers(3).batch_first(false).bidirectional(true);
+  GRU gru_cpu {opt};
+  GRU gru_cuda {opt};
+
+  // Copy weights and biases from the CPU GRU to the CUDA GRU
+  {
+    at::NoGradGuard guard;
+    const auto num_directions = gru_cpu->options.bidirectional_ ? 2 : 1;
+    for (int64_t layer = 0; layer < gru_cpu->options.layers_; layer++) {
+      for (auto direction = 0; direction < num_directions; direction++) {
+        const auto layer_idx = (layer * num_directions) + direction;
+        copyParameters(gru_cuda, layer_idx, gru_cpu, layer_idx);
+      }
+    }
+  }
+
+  gru_cpu->flatten_parameters();
+  gru_cuda->flatten_parameters();
+
+  // Move one GRU to CUDA
+  gru_cuda->to(torch::kCUDA);
+
+  // Create the same inputs
+  auto input_opt = torch::TensorOptions()
+      .dtype(torch::kFloat32).requires_grad(false);
+  auto input_cpu = torch::tensor({1, 2, 3, 4, 5, 6}, input_opt)
+      .reshape({3, 1, 2});
+  auto input_cuda = torch::tensor({1, 2, 3, 4, 5, 6}, input_opt)
+      .reshape({3, 1, 2}).to(torch::kCUDA);
+
+  // Call forward on both GRUs
+  auto output_cpu = gru_cpu->forward(input_cpu);
+  auto output_cuda = gru_cuda->forward(input_cuda);
+
+  output_cpu.output = output_cpu.output.to(torch::kCPU);
+  output_cpu.state = output_cpu.state.to(torch::kCPU);
+
+  // Assert that the output and state are equal on CPU and CUDA
+  ASSERT_EQ(output_cpu.output.dim(), output_cuda.output.dim());
+  for (int i = 0; i < output_cpu.output.dim(); i++) {
+    ASSERT_EQ(output_cpu.output.size(i), output_cuda.output.size(i));
+  }
+  for (int i = 0; i < output_cpu.output.size(0); i++) {
+    for (int j = 0; j < output_cpu.output.size(1); j++) {
+      for (int k = 0; k < output_cpu.output.size(2); k++) {
+        ASSERT_NEAR(
+            output_cpu.output[i][j][k].item<float>(),
+            output_cuda.output[i][j][k].item<float>(), 1e-5);
+      }
+    }
+  }
+}
+
+TEST_F(RNNTest, BidirectionalMultilayerLSTM_CPU_vs_CUDA) {
+  // Create two LSTMs with the same options
+  auto opt = LSTMOptions(2, 4).layers(3).batch_first(false).bidirectional(true);
+  LSTM lstm_cpu {opt};
+  LSTM lstm_cuda {opt};
+
+  // Copy weights and biases from the CPU LSTM to the CUDA LSTM
+  {
+    at::NoGradGuard guard;
+    const auto num_directions = lstm_cpu->options.bidirectional_ ? 2 : 1;
+    for (int64_t layer = 0; layer < lstm_cpu->options.layers_; layer++) {
+      for (auto direction = 0; direction < num_directions; direction++) {
+        const auto layer_idx = (layer * num_directions) + direction;
+        copyParameters(lstm_cuda, layer_idx, lstm_cpu, layer_idx);
+      }
+    }
+  }
+
+  lstm_cpu->flatten_parameters();
+  lstm_cuda->flatten_parameters();
+
+  // Move one LSTM to CUDA
+  lstm_cuda->to(torch::kCUDA);
+
+  auto options = torch::TensorOptions()
+      .dtype(torch::kFloat32).requires_grad(false);
+  auto input_cpu = torch::tensor({1, 2, 3, 4, 5, 6}, options)
+      .reshape({3, 1, 2});
+  auto input_cuda = torch::tensor({1, 2, 3, 4, 5, 6}, options)
+      .reshape({3, 1, 2}).to(torch::kCUDA);
+
+  // Call forward on both LSTMs
+  auto output_cpu = lstm_cpu->forward(input_cpu);
+  auto output_cuda = lstm_cuda->forward(input_cuda);
+
+  output_cpu.output = output_cpu.output.to(torch::kCPU);
+  output_cpu.state = output_cpu.state.to(torch::kCPU);
+
+  // Assert that the output and state are equal on CPU and CUDA
+  ASSERT_EQ(output_cpu.output.dim(), output_cuda.output.dim());
+  for (int i = 0; i < output_cpu.output.dim(); i++) {
+    ASSERT_EQ(output_cpu.output.size(i), output_cuda.output.size(i));
+  }
+  for (int i = 0; i < output_cpu.output.size(0); i++) {
+    for (int j = 0; j < output_cpu.output.size(1); j++) {
+      for (int k = 0; k < output_cpu.output.size(2); k++) {
+        ASSERT_NEAR(
+            output_cpu.output[i][j][k].item<float>(),
+            output_cuda.output[i][j][k].item<float>(), 1e-5);
+      }
+    }
+  }
+}
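
An aside on the indexing convention the tests above rely on: a bidirectional output has shape (seq_len, batch, num_directions * hidden_size), with the reverse direction occupying the second half of the last dimension, which is why the loops compare bi_output.output[i][0][1] against the unidirectional output. A hypothetical helper (not part of this commit) that slices out that half:

#include <torch/torch.h>

// Slice the reverse-direction features out of a bidirectional RNN output.
// Assumes shape (seq_len, batch, 2 * hidden_size), as in the tests above.
torch::Tensor reverse_direction(const torch::Tensor& output, int64_t hidden_size) {
  return output.narrow(/*dim=*/2, /*start=*/hidden_size, /*length=*/hidden_size);
}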

torch/csrc/api/src/nn/modules/rnn.cpp

Lines changed: 17 additions & 2 deletions
@@ -102,6 +102,18 @@ void RNNImplBase<Derived>::to(torch::Dtype dtype, bool non_blocking) {
 template <typename Derived>
 void RNNImplBase<Derived>::to(torch::Device device, bool non_blocking) {
   nn::Module::to(device, non_blocking);
+  const auto num_directions = options.bidirectional_ ? 2 : 1;
+  for (int64_t layer = 0; layer < options.layers_; layer++) {
+    for (auto direction = 0; direction < num_directions; direction++) {
+      const auto layer_idx = (layer * num_directions) + direction;
+      w_ih[layer_idx] = w_ih[layer_idx].to(device, non_blocking);
+      w_hh[layer_idx] = w_hh[layer_idx].to(device, non_blocking);
+      if (options.with_bias_) {
+        b_ih[layer_idx] = b_ih[layer_idx].to(device, non_blocking);
+        b_hh[layer_idx] = b_hh[layer_idx].to(device, non_blocking);
+      }
+    }
+  }
   flatten_parameters();
 }

@@ -144,8 +156,10 @@ RNNOutput RNNImplBase<Derived>::generic_forward(
   if (!state.defined()) {
     // #layers, batch size, state size
    const auto batch_size = input.size(options.batch_first_ ? 0 : 1);
+    const auto num_directions = options.bidirectional_ ? 2 : 1;
     state = torch::zeros(
-        {options.layers_, batch_size, options.hidden_size_}, input.options());
+        {options.layers_ * num_directions, batch_size, options.hidden_size_},
+        input.options());
   }
   Tensor output, new_state;
   std::tie(output, new_state) = function(

@@ -269,8 +283,9 @@ RNNOutput LSTMImpl::forward(const Tensor& input, Tensor state) {
   if (!state.defined()) {
     // 2 for hidden state and cell state, then #layers, batch size, state size
     const auto batch_size = input.size(options.batch_first_ ? 0 : 1);
+    const auto num_directions = options.bidirectional_ ? 2 : 1;
     state = torch::zeros(
-        {2, options.layers_, batch_size, options.hidden_size_},
+        {2, options.layers_ * num_directions, batch_size, options.hidden_size_},
         input.options());
   }
   Tensor output, hidden_state, cell_state;
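
The substance of the fix is the shape arithmetic above. Below is a standalone sketch (not code from this commit) of the default-state shapes the patched code produces, using the geometry of the CPU-vs-CUDA tests (3 layers, bidirectional, batch 1, hidden size 4):

#include <iostream>
#include <torch/torch.h>

int main() {
  const int64_t layers = 3, batch_size = 1, hidden_size = 4;
  const int64_t num_directions = 2;  // bidirectional
  // GRU default state, per generic_forward() above:
  auto gru_state = torch::zeros({layers * num_directions, batch_size, hidden_size});
  // LSTM default state, per LSTMImpl::forward() above (hidden and cell stacked):
  auto lstm_state = torch::zeros({2, layers * num_directions, batch_size, hidden_size});
  std::cout << gru_state.sizes() << "\n";   // [6, 1, 4]
  std::cout << lstm_state.sizes() << "\n";  // [2, 6, 1, 4]
  return 0;
}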

0 commit comments
