Skip to content

Commit b3a77cc

Browse files
committed
fix sparse embedding backward when input contains only padding_idx
1 parent 4c81282 commit b3a77cc

File tree

3 files changed

+27
-9
lines changed

3 files changed

+27
-9
lines changed

aten/src/ATen/native/Embedding.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,19 @@ Tensor embedding_sparse_backward(
6666
grad = grad.index(c);
6767
}
6868

69-
int64_t num_features = grad.size(-1);
69+
int64_t num_features = grad_.size(-1);
7070
auto weight_size = std::array<int64_t, 2>{{ num_weights, num_features }};
71+
auto& dense_type = grad.type();
72+
auto& sparse_type = dense_type.toBackend(grad.is_cuda() ? kSparseCUDA : kSparseCPU);
73+
74+
// check if all our grad come from padding_idx
75+
if (grad.numel() == 0) {
76+
return sparse_type.sparse_coo_tensor(indices_.type().tensor(),
77+
dense_type.tensor(), weight_size);
78+
}
7179

7280
auto index = indices.view({1, -1});
7381
auto values = grad.contiguous().view({-1, num_features});
74-
75-
auto& sparse_type = grad.type().toBackend(grad.is_cuda() ? kSparseCUDA : kSparseCPU);
7682
return sparse_type.sparse_coo_tensor(index, values, weight_size);
7783
}
7884

test/test_nn.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1291,6 +1291,18 @@ def test_embedding_padding_idx(self):
12911291
self.assertRaises(AssertionError, nn.Embedding, num_embeddings=10, embedding_dim=20, padding_idx=25)
12921292
self.assertRaises(AssertionError, nn.Embedding, num_embeddings=10, embedding_dim=20, padding_idx=-25)
12931293

1294+
# test backward when input contains padding_idx
1295+
padding_idx = 0
1296+
embedding = nn.Embedding(5, 2, padding_idx=padding_idx)
1297+
for n in (1, 2):
1298+
for other_indices in ([], [1, 3], [2]):
1299+
indices = torch.LongTensor(other_indices + [padding_idx] * n)
1300+
pre = embedding.weight[padding_idx].clone()
1301+
embedding(indices).sum().backward()
1302+
after = (embedding.weight + embedding.weight.grad)[padding_idx]
1303+
embedding.zero_grad()
1304+
self.assertEqual(after, pre)
1305+
12941306
def test_embedding_max_norm(self):
12951307
embedding = nn.Embedding(22, 5, max_norm=1.0)
12961308
input = Variable(torch.LongTensor([2, 8, 8, 6]))

torch/nn/modules/sparse.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,17 @@ class Embedding(Module):
1919
padding_idx (int, optional): If given, pads the output with zeros whenever it encounters the index.
2020
max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this
2121
norm_type (float, optional): The p of the p-norm to compute for the max_norm option
22-
scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of
22+
scale_grad_by_freq (bool, optional): if given, this will scale gradients by the frequency of
2323
the words in the mini-batch.
24-
sparse (boolean, optional): if ``True``, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for
24+
sparse (bool, optional): if ``True``, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for
2525
more details regarding sparse gradients.
2626
2727
Attributes:
2828
weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
2929
3030
Shape:
31-
- Input: LongTensor `(N, W)`, N = mini-batch, W = number of indices to extract per mini-batch
32-
- Output: `(N, W, embedding_dim)`
31+
- Input: LongTensor of arbitrary shape containing the indices to extract
32+
- Output: `(*, embedding_dim)`, where `*` is the input shape
3333
3434
Notes:
3535
Keep in mind that only a limited number of optimizers support
@@ -166,10 +166,10 @@ class EmbeddingBag(Module):
166166
embedding_dim (int): the size of each embedding vector
167167
max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this
168168
norm_type (float, optional): The p of the p-norm to compute for the max_norm option
169-
scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of
169+
scale_grad_by_freq (bool, optional): if given, this will scale gradients by the frequency of
170170
the words in the dictionary.
171171
mode (string, optional): 'sum' | 'mean'. Specifies the way to reduce the bag. Default: 'mean'
172-
sparse (boolean, optional): if ``True``, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for
172+
sparse (bool, optional): if ``True``, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for
173173
more details regarding sparse gradients.
174174
175175
Attributes:

0 commit comments

Comments (0)