pytorch · BowenBao · Apr 25, 2019
diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py
@@ -288,11 +288,13 @@ def _dispatch_rnn_test(self, name, *args, **kwargs):
 
     def _elman_rnn_test(self, layers, nonlinearity, bidirectional,
                         initial_state, packed_sequence, dropout):
+        batch_first = True if packed_sequence == 2 else False
         model = nn.RNN(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE,
                        layers,
                        nonlinearity=nonlinearity,
                        bidirectional=bidirectional,
-                       dropout=dropout)
+                       dropout=dropout,
+                       batch_first=batch_first)
 
         if packed_sequence == 1:
             model = RnnModelWithPackedSequence(model, False)
@@ -303,9 +305,7 @@ def make_input(batch_size):
             seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size)
             seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
             inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
-            inputs = rnn_utils.pad_sequence(inputs)
-            if packed_sequence == 2:
-                inputs = inputs.transpose(0, 1)
+            inputs = rnn_utils.pad_sequence(inputs, batch_first=batch_first)
             inputs = [inputs]
 
             directions = 2 if bidirectional else 1
@@ -331,9 +331,10 @@ def make_input(batch_size):
 
     def _lstm_test(self, layers, bidirectional, initial_state,
                    packed_sequence, dropout):
+        batch_first = True if packed_sequence == 2 else False
         model = LstmFlatteningResult(
             RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers,
-            bidirectional=bidirectional, dropout=dropout)
+            bidirectional=bidirectional, dropout=dropout, batch_first=batch_first)
         if packed_sequence == 1:
             model = RnnModelWithPackedSequence(model, False)
         if packed_sequence == 2:
@@ -343,9 +344,7 @@ def make_input(batch_size):
             seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size)
             seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
             inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
-            inputs = rnn_utils.pad_sequence(inputs)
-            if packed_sequence == 2:
-                inputs = inputs.transpose(0, 1)
+            inputs = rnn_utils.pad_sequence(inputs, batch_first=batch_first)
             inputs = [inputs]
 
             directions = 2 if bidirectional else 1
@@ -372,8 +371,9 @@ def make_input(batch_size):
 
     def _gru_test(self, layers, bidirectional, initial_state,
                   packed_sequence, dropout):
+        batch_first = True if packed_sequence == 2 else False
         model = nn.GRU(RNN_INPUT_SIZE, RNN_HIDDEN_SIZE, layers,
-                       bidirectional=bidirectional, dropout=dropout)
+                       bidirectional=bidirectional, dropout=dropout, batch_first=batch_first)
         if packed_sequence == 1:
             model = RnnModelWithPackedSequence(model, False)
         if packed_sequence == 2:
@@ -383,9 +383,7 @@ def make_input(batch_size):
             seq_lengths = np.random.randint(1, RNN_SEQUENCE_LENGTH + 1, size=batch_size)
             seq_lengths = list(reversed(sorted(map(int, seq_lengths))))
             inputs = [torch.randn(l, RNN_INPUT_SIZE) for l in seq_lengths]
-            inputs = rnn_utils.pad_sequence(inputs)
-            if packed_sequence == 2:
-                inputs = inputs.transpose(0, 1)
+            inputs = rnn_utils.pad_sequence(inputs, batch_first=batch_first)
             inputs = [inputs]
 
             directions = 2 if bidirectional else 1

diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py
@@ -1470,7 +1470,8 @@ def _generic_rnn(g, variant, input, initial_states, all_weights, has_biases,
     assert len(all_weights) == num_layers * weights_per_layer * (1 + bidirectional)
     layer_weights = [all_weights[i:i + weights_per_layer] for i in range(0, len(all_weights), weights_per_layer)]
     if batch_first:
-        return _unimplemented("RNN/GRU/LSTM", "batch_first")
+        # batch, seq, feat -> seq, batch, feat
+        input = g.op('Transpose', input, perm_i=[1, 0, 2])
     if dropout and train:
         return _unimplemented("RNN/GRU/LSTM", "dropout in training mode")
 
@@ -1572,6 +1573,9 @@ def retrieve_state(x, start, end):
         h_outs.append(h_out)
         if variant == 'LSTM':
             c_outs.append(c_out)
+    if batch_first:
+        # seq, batch, num_directions * hidden_size -> batch, seq, num_directions * hidden_size
+        prev_output = g.op('Transpose', prev_output, perm_i=[1, 0, 2])
     h_outs = h_out if num_layers == 1 else g.op('Concat', *h_outs, axis_i=0)
     if variant == 'RNN' or variant == 'GRU':
         return prev_output, h_outs