
Commit b2cfd96
anderspapitto authored and soumith committed
Handle sequence lengths correctly when exporting RNNs to ONNX (#4695)

* PackedSequence: store batch_sizes as a tensor rather than converting it to a list of Python integers. This maintains the invariant that a module's inputs/outputs are collections of Variables; in particular, the JIT no longer chokes when flattening and unflattening arguments.

* Handle sequence lengths correctly when exporting RNNs to ONNX:
  - When uniform sequence lengths are provided, omit the argument when constructing the ONNX graph, so that the graph is not fixed to the batch size.
  - Handle PackedSequences by floating them through the graph and eliminating them in an optimization pass. ONNX has no packed sequences; it operates on a representation equivalent to PaddedSequence, so the representation switching is hidden from ONNX.
  - As a preliminary step towards handling PackedSequences (not directly tied to ONNX export), change batch_sizes from an argument to the RNN operators into an argument to those operators' forward() functions. This more closely models the reality that batch_sizes are effectively part of the input sequences.
1 parent f796080 commit b2cfd96
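For context, a minimal sketch of the user-facing workflow this commit touches (the model, shapes, and lengths below are hypothetical, and the snippet assumes the torch.nn.utils.rnn API of this PyTorch version):

    import torch
    from torch.autograd import Variable
    import torch.nn as nn
    import torch.nn.utils.rnn as rnn_utils

    lstm = nn.LSTM(input_size=4, hidden_size=8)
    padded = Variable(torch.randn(7, 3, 4))   # (seq_len, batch, features)
    lengths = [7, 5, 2]                       # per-example lengths, sorted descending

    packed = rnn_utils.pack_padded_sequence(padded, lengths)
    # With this commit, packed.batch_sizes is kept as a tensor rather than a
    # list of Python ints, so the JIT can flatten/unflatten it like any other
    # Variable when tracing for ONNX export.
    output, (h_n, c_n) = lstm(packed)
    unpacked, unpacked_lengths = rnn_utils.pad_packed_sequence(output)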

11 files changed: +259 −113 lines


test/test_nn.py
Lines changed: 5 additions & 3 deletions

@@ -3054,9 +3054,11 @@ def compare_cpu_gpu(outputs_cpu, outputs_gpu):
         grad_hy = torch.randn(num_layers * num_directions, batch, hidden_size)
 
         if variable_len:
-            batch_sizes = [7, 5, 5, 2, 1, 1]
-            input_val = rnn_utils.pack_padded_sequence(input_val, batch_sizes, batch_first=batch_first)
-            grad_output = rnn_utils.pack_padded_sequence(grad_output, batch_sizes, batch_first=batch_first).data
+            lengths = [7, 5, 5, 2, 1, 1]
+            input_val = Variable(input_val)
+            grad_output = Variable(grad_output)
+            input_val = rnn_utils.pack_padded_sequence(input_val, lengths, batch_first=batch_first)
+            grad_output = rnn_utils.pack_padded_sequence(grad_output, lengths, batch_first=batch_first).data
 
         rnn = module(input_size,
                      hidden_size,
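The rename above is more than cosmetic: rnn_utils.pack_padded_sequence takes per-example lengths, while the per-timestep batch_sizes stored on the resulting PackedSequence are derived from them. A standalone illustration using the values from this test (plain Python, nothing PyTorch-specific):

    lengths = [7, 5, 5, 2, 1, 1]   # one entry per example, sorted descending
    batch_sizes = [sum(1 for l in lengths if l > t) for t in range(max(lengths))]
    print(batch_sizes)             # [6, 4, 3, 3, 3, 1, 1]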

torch/autograd/function.py
Lines changed: 5 additions & 2 deletions

@@ -295,8 +295,11 @@ def unflatten_helper(input, proto):
         if not isinstance(proto, (list, tuple)):
             return input[0], input[1:]
         for e in proto:
-            res_e, input = unflatten_helper(input, e)
-            res.append(res_e)
+            if e is None:
+                res.append(e)
+            else:
+                res_e, input = unflatten_helper(input, e)
+                res.append(res_e)
         return type(proto)(res), input
 
     return unflatten_helper(input, proto)[0]
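A self-contained sketch of the unflatten logic after this fix (the helper below is illustrative, not the exact torch.autograd internals): None entries in the prototype are passed through unchanged instead of consuming an element from the flat input, which lets optional values such as an absent batch_sizes survive flatten/unflatten round trips.

    def unflatten(flat, proto):
        def helper(flat, proto):
            if not isinstance(proto, (list, tuple)):
                return flat[0], flat[1:]
            res = []
            for e in proto:
                if e is None:
                    res.append(e)
                else:
                    res_e, flat = helper(flat, e)
                    res.append(res_e)
            return type(proto)(res), flat
        return helper(flat, proto)[0]

    # unflatten([1, 2, 3], (None, object(), (object(), object())))
    # -> (None, 1, (2, 3))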

torch/csrc/jit/export.cpp
Lines changed: 9 additions & 7 deletions

@@ -190,17 +190,22 @@ void encodeGraph(onnx::GraphProto * p_g, const std::shared_ptr<Graph> & g, const
     encodeValueInfo(v, output);
   }
   for (auto node : g->nodes()) {
-    if (node->kind() == kUndefined && !node->hasUses()) {
-      // Undefined nodes never show up in ONNX; they're just a tool
-      // to help symbolics do the right thing.
+    if (node->kind() == kUndefined) {
+      // Undefined nodes are used to implement optional inputs. One
+      // way to "not provide" an optional input is to create an
+      // Undefined node, and pass its output as that input.
       continue;
     }
     auto p_n = p_g->add_node();
     if (node->getSourceLocation()) {
       p_n->set_doc_string(node->getSourceLocation()->python_traceback);
     }
     for(auto input : node->inputs()) {
-      p_n->add_input(value_name(input));
+      if (input->node()->kind() == kUndefined) {
+        p_n->add_input("");
+      } else {
+        p_n->add_input(value_name(input));
+      }
     }
     for(auto output : node->outputs()) {
       p_n->add_output(value_name(output));
@@ -244,9 +249,6 @@ void validateGraph(const std::shared_ptr<Graph>& graph) {
     if (node->kind() == kExpand) {
       FAIL_EXPORT("Couldn't export operator expand; this usually means you used a form of broadcasting that ONNX does not currently support");
     }
-    if (node->kind() == kUndefined) {
-      FAIL_EXPORT("Couldn't export undefined constant tensor (please file an issue)")
-    }
     std::string n = node->kind().toString();
     if (n.size() == 0) {
       FAIL_EXPORT("Operator to export had empty name (please file an issue)")
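The empty-string input relies on a standard ONNX convention: a node marks an optional input as "not provided" by placing an empty name in its input list. A hedged sketch using the onnx Python helpers (hypothetical tensor names; requires the separate onnx package):

    from onnx import helper

    # ONNX LSTM inputs are X, W, R, B, sequence_lens, initial_h, ...;
    # here B and sequence_lens are omitted via empty strings.
    node = helper.make_node(
        'LSTM',
        inputs=['X', 'W', 'R', '', '', 'initial_h'],
        outputs=['Y', 'Y_h'],
        hidden_size=8,
    )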

torch/csrc/jit/interned_strings.h
Lines changed: 5 additions & 0 deletions

@@ -45,6 +45,8 @@ _(value) \
 _(Subgraph) \
 _(BatchNormalization) \
 _(Conv) \
+_(PackPadded) \
+_(PadPacked) \
 _(ConvTranspose) \
 _(is_test) \
 _(epsilon) \
@@ -61,6 +63,9 @@ _(strides) \
 _(stride) \
 _(pads) \
 _(pad) \
+_(RNN) \
+_(LSTM) \
+_(GRU) \
 _(beta) \
 _(alpha) \
 _(dilations) \

torch/csrc/jit/ir.h
Lines changed: 12 additions & 5 deletions

@@ -211,6 +211,8 @@ struct Value {
     return uses_;
   }
 
+  void replaceFirstUseWith(Value * newValue);
+
   // Replaces all uses of this node with 'newValue'.
   //
   // Given: %3 = f(%1, %2)
@@ -1031,13 +1033,18 @@ inline const Graph * Value::owningGraph() const {
   return node()->owningGraph();
 }
 
-inline void Value::replaceAllUsesWith(Value * newValue) {
+inline void Value::replaceFirstUseWith(Value * newValue) {
   JIT_ASSERT(owningGraph() == newValue->owningGraph());
-  for(auto u : uses()) {
-    u.user->inputs_[u.offset] = newValue;
-    newValue->uses_.push_back(u);
+  auto u = uses()[0];
+  u.user->inputs_[u.offset] = newValue;
+  newValue->uses_.push_back(u);
+  uses_.erase(uses_.begin());
+}
+
+inline void Value::replaceAllUsesWith(Value * newValue) {
+  while (!uses().empty()) {
+    replaceFirstUseWith(newValue);
   }
-  uses_.clear();
 }
 
 inline Node::Node(Graph * graph_, NodeKind kind_) :
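A toy Python model of the use-list bookkeeping above (illustrative only; the real Value, Use, and Node types are C++ JIT IR classes). replaceAllUsesWith is now just repeated replaceFirstUseWith, which is what lets the packing pass redirect a single consumer of a value, such as one RNN layer's length input, while leaving other consumers alone.

    class Use:
        def __init__(self, user, offset):
            self.user, self.offset = user, offset

    class Node:
        def __init__(self, inputs):
            self.inputs = list(inputs)
            for offset, v in enumerate(self.inputs):
                v.uses.append(Use(self, offset))

    class Value:
        def __init__(self):
            self.uses = []

        def replace_first_use_with(self, new_value):
            u = self.uses.pop(0)                 # oldest use only
            u.user.inputs[u.offset] = new_value
            new_value.uses.append(u)

        def replace_all_uses_with(self, new_value):
            while self.uses:
                self.replace_first_use_with(new_value)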

torch/csrc/jit/passes/onnx/peephole.cpp
Lines changed: 90 additions & 1 deletion

@@ -20,6 +20,11 @@ std::unordered_set<NodeKind> broadcasting = {
   kGemm,
 };
 
+bool isRNN(const Node *node) {
+  auto k = node->kind();
+  return k == kRNN || k == kLSTM || k == kGRU;
+}
+
 bool isNopTranspose(const std::vector<int64_t> & perm) {
   for (int64_t i = 0, perm_size = perm.size(); i < perm_size; i++)
     if (perm[i] != i)
@@ -167,6 +172,87 @@ void fuseTransposeIntoGemm(std::shared_ptr<Graph>& graph) {
   }
 }
 
+// Why this is here:
+//
+// Pytorch has a "packed" representation of sequences, as well as a
+// "padded" representation. ONNX has only one representation,
+// corresponding to pytorch's "padded". Therefore, we need to remove
+// any use of packed sequences before exporting.
+//
+// What this does:
+//
+// This code uses the observation that
+//     RNN(PackPadded(x)) == PackPadded(RNN(x))
+// and converts the first form to the second whenever possible,
+// "pushing" the packing operation past the RNN operation. Then,
+// the removeNopPacking pass removes the packing operations
+// entirely by pairing them with their inverse PadPacked. If the
+// input graph does not pair the operations, export will fail.
+void pushPackingPastRnn(std::shared_ptr<Graph>& graph) {
+  for (auto it = graph->nodes().begin(); it != graph->nodes().end(); ++it) {
+    auto* n = *it;
+
+    if (n->kind() != kPackPadded) {
+      continue;
+    }
+    if (n->outputs()[0]->uses().size() != 1) {
+      // For now, only handle the case where there is one consumer.
+      continue;
+    }
+    Node * rnn = n->outputs()[0]->uses()[0].user;
+    if (!isRNN(rnn)) {
+      continue;
+    }
+
+    // remove PackPadded from in front of the RNN
+    n->outputs()[0]->replaceAllUsesWith(n->inputs()[0]);
+
+    // note there can be multiple uses of the length blob. If we are
+    // translating a multi-level RNN it will be an input to each level.
+    n->outputs()[1]->replaceFirstUseWith(n->inputs()[1]);
+
+    // and insert new PackPadded after the RNN
+    Node * newPackPadded = graph->create(kPackPadded, 2);
+    newPackPadded->insertAfter(rnn);
+
+    // make things consume from the new PackPadded
+    rnn->outputs()[0]->replaceAllUsesWith(newPackPadded->outputs()[0]);
+    n->outputs()[1]->replaceAllUsesWith(newPackPadded->outputs()[1]);
+
+    // setup the new PackPadded's inputs
+    newPackPadded->addInput(rnn->outputs()[0]);
+    newPackPadded->addInput(n->inputs()[1]);
+
+    it.destroyCurrent();
+  }
+}
+
+void removeNopPacking(std::shared_ptr<Graph>& graph) {
+  for (auto it = graph->nodes().begin(); it != graph->nodes().end(); ++it) {
+    auto* n = *it;
+
+    if (n->kind() != kPadPacked) {
+      continue;
+    }
+    Node* input = n->inputs()[0]->node();
+    if (input->kind() != kPackPadded) {
+      continue;
+    }
+    if (input->outputs()[0] != n->inputs()[0]) {
+      continue;
+    }
+    if (input->outputs()[1] != n->inputs()[1]) {
+      continue;
+    }
+    n->outputs()[0]->replaceAllUsesWith(input->inputs()[0]);
+    n->outputs()[1]->replaceAllUsesWith(input->inputs()[1]);
+
+    n->removeAllInputs();
+    it.destroyCurrent();
+  }
+}
+
+
 // This optimization does ONNX-specific peephole optimizations.
 //
 // At the moment, here are the optimizations it does:
@@ -175,8 +261,9 @@ void fuseTransposeIntoGemm(std::shared_ptr<Graph>& graph) {
 // local information. This optimization is not useful for PyTorch as 'expand'
 // is free.
 // - Fusing of consecutive transposes
-// - Elimiation of NOP transposes
+// - Elimination of NOP transposes
 // - Fusing of transposes into Gemm
+// - Elimination of PaddedSequences
 //
 // Before you write an optimization here, ask yourself, "Could I do this
 // optimization on ATen operators"? If so, you should seriously consider
@@ -191,6 +278,8 @@ void PeepholeOptimizeONNX(std::shared_ptr<Graph>& graph) {
   fuseConsecutiveTransposes(graph);
  eliminateNopTranspose(graph);
  fuseTransposeIntoGemm(graph);
+  pushPackingPastRnn(graph);
+  removeNopPacking(graph);
 }
 
 }}
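Seen at the graph level, the two new passes perform the following rewrite (a schematic, not literal JIT IR syntax):

    traced graph:            data, batch_sizes = PackPadded(padded, lengths)
                             y, h              = RNN(data, batch_sizes, ...)
                             out, out_lengths  = PadPacked(y, batch_sizes)

    after pushPackingPastRnn:
                             y, h              = RNN(padded, lengths, ...)
                             data2, bs2        = PackPadded(y, lengths)
                             out, out_lengths  = PadPacked(data2, bs2)

removeNopPacking then cancels the now-adjacent PackPadded/PadPacked pair, leaving a graph that operates only on padded tensors, which is the representation ONNX expects; if a traced graph packs without a matching unpack, export fails.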

torch/nn/_functions/packing.py
Lines changed: 4 additions & 1 deletion

@@ -14,7 +14,10 @@ def forward(ctx, input, lengths, batch_first):
 
         steps = []
         batch_sizes = []
-        lengths_iter = reversed(lengths)
+
+        # lengths is a Tensor, so we must convert to list before reversed()
+        lengths_iter = reversed(list(lengths))
+
         batch_size = input.size(1)
 
         if len(lengths) != batch_size:
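Mirroring that fix in isolation (a minimal standalone sketch; the tensor values are hypothetical):

    import torch

    lengths = torch.LongTensor([7, 5, 2])
    # Materialize the tensor as a Python list so reversed() yields plain
    # per-example lengths, shortest first.
    for l in reversed(list(lengths)):
        print(int(l))   # prints 2, then 5, then 7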

torch/nn/_functions/rnn.py
Lines changed: 24 additions & 27 deletions

@@ -70,7 +70,7 @@ def StackedRNN(inners, num_layers, lstm=False, dropout=0, train=True):
     num_directions = len(inners)
     total_layers = num_layers * num_directions
 
-    def forward(input, hidden, weight):
+    def forward(input, hidden, weight, batch_sizes):
         assert(len(weight) == total_layers)
         next_hidden = []
 
@@ -82,7 +82,7 @@ def forward(input, hidden, weight):
             for j, inner in enumerate(inners):
                 l = i * num_directions + j
 
-                hy, output = inner(input, hidden[l], weight[l])
+                hy, output = inner(input, hidden[l], weight[l], batch_sizes)
                 next_hidden.append(hy)
                 all_output.append(output)
 
@@ -107,7 +107,7 @@ def forward(input, hidden, weight):
 
 
 def Recurrent(inner, reverse=False):
-    def forward(input, hidden, weight):
+    def forward(input, hidden, weight, batch_sizes):
         output = []
         steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
         for i in steps:
@@ -124,17 +124,16 @@ def forward(input, hidden, weight):
     return forward
 
 
-def variable_recurrent_factory(batch_sizes):
-    def fac(inner, reverse=False):
-        if reverse:
-            return VariableRecurrentReverse(batch_sizes, inner)
-        else:
-            return VariableRecurrent(batch_sizes, inner)
-    return fac
+def variable_recurrent_factory(inner, reverse=False):
+    if reverse:
+        return VariableRecurrentReverse(inner)
+    else:
+        return VariableRecurrent(inner)
 
 
-def VariableRecurrent(batch_sizes, inner):
-    def forward(input, hidden, weight):
+def VariableRecurrent(inner):
+    def forward(input, hidden, weight, batch_sizes):
+
         output = []
         input_offset = 0
         last_batch_size = batch_sizes[0]
@@ -172,8 +171,8 @@ def forward(input, hidden, weight):
     return forward
 
 
-def VariableRecurrentReverse(batch_sizes, inner):
-    def forward(input, hidden, weight):
+def VariableRecurrentReverse(inner):
+    def forward(input, hidden, weight, batch_sizes):
         output = []
         input_offset = input.size(0)
         last_batch_size = batch_sizes[-1]
@@ -183,7 +182,8 @@ def forward(input, hidden, weight):
             hidden = (hidden,)
             initial_hidden = (initial_hidden,)
         hidden = tuple(h[:batch_sizes[-1]] for h in hidden)
-        for batch_size in reversed(batch_sizes):
+        for i in reversed(range(len(batch_sizes))):
+            batch_size = batch_sizes[i]
             inc = batch_size - last_batch_size
             if inc > 0:
                 hidden = tuple(torch.cat((h, ih[last_batch_size:batch_size]), 0)
@@ -208,7 +208,7 @@ def forward(input, hidden, weight):
 
 
 def AutogradRNN(mode, input_size, hidden_size, num_layers=1, batch_first=False,
-                dropout=0, train=True, bidirectional=False, batch_sizes=None,
+                dropout=0, train=True, bidirectional=False, variable_length=False,
                 dropout_state=None, flat_weight=None):
 
     if mode == 'RNN_RELU':
@@ -222,10 +222,7 @@ def AutogradRNN(mode, input_size, hidden_size, num_layers=1, batch_first=False,
     else:
         raise Exception('Unknown mode: {}'.format(mode))
 
-    if batch_sizes is None:
-        rec_factory = Recurrent
-    else:
-        rec_factory = variable_recurrent_factory(batch_sizes)
+    rec_factory = variable_recurrent_factory if variable_length else Recurrent
 
     if bidirectional:
         layer = (rec_factory(cell), rec_factory(cell, reverse=True))
@@ -238,13 +235,13 @@ def AutogradRNN(mode, input_size, hidden_size, num_layers=1, batch_first=False,
                      dropout=dropout,
                      train=train)
 
-    def forward(input, weight, hidden):
-        if batch_first and batch_sizes is None:
+    def forward(input, weight, hidden, batch_sizes):
+        if batch_first and not variable_length:
             input = input.transpose(0, 1)
 
-        nexth, output = func(input, hidden, weight)
+        nexth, output = func(input, hidden, weight, batch_sizes)
 
-        if batch_first and batch_sizes is None:
+        if batch_first and not variable_length:
            output = output.transpose(0, 1)
 
        return output, nexth
@@ -254,7 +251,7 @@ def forward(input, weight, hidden):
 
 def CudnnRNN(mode, input_size, hidden_size, num_layers=1,
             batch_first=False, dropout=0, train=True, bidirectional=False,
-            batch_sizes=None, dropout_state=None, flat_weight=None):
+            variable_length=False, dropout_state=None, flat_weight=None):
     if dropout_state is None:
         dropout_state = {}
     mode = cudnn.rnn.get_cudnn_mode(mode)
@@ -265,7 +262,7 @@ def CudnnRNN(mode, input_size, hidden_size, num_layers=1,
               "at every call, possibly greatly increasing memory usage. "
               "To compact weights again call flatten_parameters().", stacklevel=5)
 
-    def forward(input, weight, hx):
+    def forward(input, weight, hx, batch_sizes):
         if mode == cudnn.CUDNN_LSTM:
             hx, cx = hx
         else:
@@ -283,7 +280,7 @@ def forward(input, weight, hx):
             hx, cx,
             mode, hidden_size, num_layers,
             batch_first, dropout, train, bool(bidirectional),
-            batch_sizes if batch_sizes else (),
+            list(batch_sizes.data) if variable_length else (),
            Variable(dropout_desc.state) if dropout_desc.state is not None else None)
 
        if cx is not None:
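A stripped-down sketch of what the VariableRecurrent forward above does with its new batch_sizes argument (illustrative only; the real code also threads cell weights and handles LSTM's (h, c) tuples). Each step consumes batch_sizes[t] rows of the flattened packed data and shrinks the hidden state as sequences end:

    import torch

    def variable_recurrent_sketch(cell, packed_data, batch_sizes, hidden):
        output, finished = [], []
        offset = 0
        last_bs = batch_sizes[0]
        for bs in batch_sizes:
            step_input = packed_data[offset:offset + bs]
            offset += bs
            if bs < last_bs:
                # sequences that just ended keep their final hidden rows
                finished.append(hidden[bs:])
                hidden = hidden[:bs]
            last_bs = bs
            hidden = cell(step_input, hidden)
            output.append(hidden)
        finished.append(hidden)
        finished.reverse()
        return torch.cat(finished, 0), torch.cat(output, 0)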
