Merged
18 changes: 18 additions & 0 deletions docs/source/nn.rst
@@ -996,6 +996,11 @@ Non-linear activation functions

.. autofunction:: softshrink

:hidden:`gumbel_softmax`
~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: gumbel_softmax

:hidden:`log_softmax`
~~~~~~~~~~~~~~~~~~~~~

@@ -1070,6 +1075,19 @@ Dropout functions

.. autofunction:: dropout3d

Sparse functions
----------------------------------

:hidden:`embedding`
~~~~~~~~~~~~~~~~~~~

.. autofunction:: embedding

:hidden:`embedding_bag`
~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: embedding_bag

Distance functions
----------------------------------

19 changes: 10 additions & 9 deletions test/test_nn.py
@@ -1626,9 +1626,9 @@ def _test_EmbeddingBag(self, cuda, mode, sparse, dtype=torch.double):
self.assertEqual(es_weight_grad, expected_grad_weight, dtype2prec[dtype])

# now compare EmbeddingBag vs Embedding + Sum/Mean, for constant bag length
def _test_vs_Embedding(N, D, B, L):
es = nn.EmbeddingBag(N, D, mode=mode, sparse=sparse).to(device, dtype)
e = nn.Embedding(N, D).to(device, dtype)
def _test_vs_Embedding(N, D, B, L, max_norm=None):
es = nn.EmbeddingBag(N, D, mode=mode, sparse=sparse, max_norm=max_norm).to(device, dtype)
e = nn.Embedding(N, D, max_norm=max_norm).to(device, dtype)
e.weight.data.copy_(es.weight.data)
input = torch.randint(N, (B, L), device=device, dtype=torch.long)
offsets = torch.arange(0, B, device=device, dtype=torch.long).mul_(L)
@@ -1656,8 +1656,9 @@ def _test_vs_Embedding(N, D, B, L):

N, D, B, L = random.randint(1, 100), random.randint(1, 100), random.randint(1, 50), random.randint(1, 50)
_test_vs_Embedding(N, D, B, L)
for p in itertools.product([1, 2], repeat=4):
_test_vs_Embedding(*p)
for max_norm in (None, 3):
for p in itertools.product([1, 2], repeat=4):
_test_vs_Embedding(*p, max_norm=max_norm)

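# Editor's aside -- an illustrative sketch, not part of the test file, of what the
# new max_norm argument exercises: looked-up rows of the embedding weight are
# renormalized in place so that their norm does not exceed max_norm.
import torch
import torch.nn as nn

emb = nn.Embedding(10, 3, max_norm=1.0)
idx = torch.tensor([1, 2, 4, 5])
out = emb(idx)                                 # rows 1, 2, 4, 5 are clipped to norm <= 1.0
assert (out.norm(dim=1) <= 1.0 + 1e-6).all()
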
# check that giving illegal input combos raises error
es = nn.EmbeddingBag(10, 20, mode=mode, sparse=sparse)
@@ -6758,27 +6759,27 @@ def multimarginloss_weights_no_reduce_test():
dict(
module_name='Embedding',
constructor_args=(4, 3),
input_fn=lambda: Variable(torch.randperm(2).repeat(1, 2)),
input_fn=lambda: torch.randperm(2).repeat(1, 2),
jacobian_input=False,
check_gradgrad=False,
),
dict(
module_name='EmbeddingBag',
constructor_args=(4, 3),
input_fn=lambda: Variable(torch.randperm(2).repeat(1, 2)),
input_fn=lambda: torch.randperm(2).repeat(1, 2),
jacobian_input=False,
check_gradgrad=False,
),
dict(
fullname='EmbeddingBag_sparse',
constructor=lambda: nn.EmbeddingBag(4, 3, sparse=True),
input_fn=lambda: Variable(torch.randperm(2).repeat(1, 2)),
input_fn=lambda: torch.randperm(2).repeat(1, 2),
jacobian_input=False,
check_gradgrad=False,
),
dict(
constructor=lambda: nn.Embedding(4, 3, sparse=True),
input_fn=lambda: Variable(torch.randperm(2).repeat(1, 2)),
input_fn=lambda: torch.randperm(2).repeat(1, 2),
jacobian_input=False,
fullname='Embedding_sparse',
check_gradgrad=False,
208 changes: 114 additions & 94 deletions torch/nn/functional.py
@@ -891,21 +891,25 @@ def _gumbel_softmax_sample(logits, tau=1, eps=1e-10):


def gumbel_softmax(logits, tau=1, hard=False, eps=1e-10):
"""
r"""
Sample from the Gumbel-Softmax distribution and optionally discretize.

Args:
logits: `[batch_size, n_class]` unnormalized log-probs
logits: `[batch_size, num_features]` unnormalized log probabilities
tau: non-negative scalar temperature
hard: if ``True``, take `argmax`, but differentiate w.r.t. soft sample y
hard: if ``True``, the returned samples will be discretized as one-hot vectors,
but will be differentiated as if they were the soft samples in autograd

Returns:
[batch_size, n_class] sample from the Gumbel-Softmax distribution.
If hard=True, then the returned sample will be one-hot, otherwise it will
be a probability distribution that sums to 1 across classes
Sampled tensor of shape ``batch_size x num_features`` from the Gumbel-Softmax distribution.
If ``hard=True``, the returned samples will be one-hot, otherwise they will
be probability distributions that sum to 1 across features

Constraints:
- this implementation only works on batch_size x num_features tensor for now

based on
- Currently only works on 2D input :attr:`logits` tensor of shape ``batch_size x num_features``

Based on
https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb ,
(MIT license)
"""
@@ -983,6 +987,7 @@ def linear(input, weight, bias=None):
Applies a linear transformation to the incoming data: :math:`y = xA^T + b`.

Shape:

- Input: :math:`(N, *, in\_features)` where `*` means any number of
additional dimensions
- Weight: :math:`(out\_features, in\_features)`
@@ -1011,29 +1016,28 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2,
The input to the module is a list of indices and the embedding matrix,
and the output is the corresponding word embeddings.

See :class:`torch.nn.Embedding` for more details.

Args:
input: tensor, containing indices into the embedding matrix
weight:
input (LongTensor): Tensor containing indices into the embedding matrix
weight (Tensor): The embedding matrix
Number of rows should correspond to the maximum possible index + 1,
number of columns is the embedding size
padding_idx (int, optional): Entries at the given index do not contribute to the gradient
max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this
norm_type (float, optional): The p of the p-norm to compute for the max_norm option
scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of
the words in the mini-batch.
sparse (boolean, optional): if ``True``, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for
more details regarding sparse gradients.
padding_idx (int, optional): If given, pads the output with the embedding vector at :attr:`padding_idx`
(initialized to zeros) whenever it encounters the index.
max_norm (float, optional): If given, will renormalize the embedding vectors to have a norm less than
this before extracting. Note: this will modify :attr:`weight` in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default ``2``.
scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of the frequency of
the words in the mini-batch. Default ``False``.
sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` will be a sparse tensor. See Notes under
:class:`torch.nn.Embedding` for more details regarding sparse gradients.

Shape:
- Input: LongTensor `(N, W)`, N = mini-batch, W = number of indices to extract per mini-batch
- Embedding_matrix: FloatTensor `(V, embedding_dim)`, V = maximum index + 1, embedding_dim = embedding size
- Output: `(N, W, embedding_dim)`

Notes:
It is advised to only use `sparse=True` if `embedding_matrix` is a leaf Tensor,
since some autograd functions may not propagate sparse gradients correctly.
Additionally, keep in mind that only a limited number of optimizers support
sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`), and :class:`optim.Adagrad` (`CPU`)
- Input: LongTensor of arbitrary shape containing the indices to extract
- Weight: Embedding matrix of floating point type with shape `(V, embedding_dim)`,
where V = maximum index + 1 and embedding_dim = the embedding size
- Output: `(*, embedding_dim)`, where `*` is the input shape

Examples::

@@ -1078,87 +1082,103 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2,
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

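# Editor's aside -- a brief sketch, not part of the diff, of the functional form
# documented above, including the in-place renorm triggered by max_norm.
import torch
import torch.nn.functional as F

weight = torch.rand(10, 3)                           # V = 10 rows, embedding_dim = 3
idx = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 9]])     # indices of arbitrary shape
out = F.embedding(idx, weight)                       # shape (2, 4, 3)

F.embedding(idx, weight, max_norm=1.0)               # renormalizes the looked-up rows of `weight` in place
assert weight[idx].norm(dim=2).max() <= 1.0 + 1e-6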

def embedding_bag(embedding_matrix, indices, offsets=None,
max_norm=None, norm_type=2, scale_grad_by_freq=False, mode='mean', sparse=False):
def embedding_bag(input, weight, offsets=None, max_norm=None, norm_type=2,
scale_grad_by_freq=False, mode='mean', sparse=False):
r"""Computes sums or means of 'bags' of embeddings, without instantiating the
intermediate embeddings.

For bags of constant length,
* :func:`embedding_bag` with `mode=sum` is equivalent to :func:`nn.functional.embedding` followed by
``torch.sum(dim=1)``
* with `mode=mean` is equivalent to :func:`nn.functional.embedding` followed by ``torch.mean(dim=1)``
* with `mode=max` is equivalent to :func:`nn.functional.embedding` followed by ``torch.max(dim=1)``

However, :func:`embedding_bag` is much more time and memory efficient than using a chain of these
operations.

Args:
embedding_matrix: FloatTensor, where number of rows should correspond to the maximum possible index + 1,
number of columns is the embedding size
indices (N or BxN): LongTensor containing the indices of the embeddings to extract.
When `input` is 1D Tensor of shape `N`, an `offsets` Tensor is given, that contains the
starting position of each new sequence in the mini-batch.
offsets (B or None): LongTensor containing the starting positions of each sample in a mini-batch of variable
length sequences. If `input` is 2D (BxN), then offsets does not need to be given,
as the `input` is treated as a mini-batch of fixed length sequences of length `N` each.
max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this
norm_type (float, optional): The p of the p-norm to compute for the max_norm option
scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of
the words in the dictionary.
mode (string, optional): 'sum' | 'mean' | 'max'. Specifies the way to reduce the bag. Default: 'mean'
sparse (boolean, optional): if ``True``, gradient w.r.t. weight matrix will be a sparse tensor. See Notes
for more details regarding sparse gradients.

Shape:
- Embedding_matrix: FloatTensor `(V, embedding_dim)`,
V = number of embeddings, embedding_dim = embedding size
- Input: LongTensor `N`, N = number of embeddings to extract
(or) LongTensor `BxN`, B = number of sequences in mini-batch,
N = number of embeddings per sequence
- Offsets: LongTensor `B`, B = number of bags. The values are the
offsets in `input` for each bag, i.e. the cumsum of lengths.
Offsets is not given if Input is 2D `BxN` Tensor,
the input is considered to be of fixed-length sequences
- Output: `(B, embedding_dim)`

Examples::

>>> # an Embedding module containing 10 tensors of size 3
>>> embedding_matrix = torch.rand(10, 3)
>>> # a batch of 2 samples of 4 indices each
>>> input = torch.tensor([1,2,4,5,4,3,2,9])
>>> offsets = torch.tensor([0,4])
>>> F.embedding_bag(embedding_matrix, input, offsets)
tensor([[ 0.3397, 0.3552, 0.5545],
[ 0.5893, 0.4386, 0.5882]])
"""
if indices.dim() == 2:
intermediate embeddings.

See :class:`torch.nn.EmbeddingBag` for more details.

Args:
input (LongTensor): Tensor containing bags of indices into the embedding matrix
weight (Tensor): The embedding matrix
Number of rows should correspond to the maximum possible index + 1,
number of columns is the embedding size
offsets (LongTensor, optional): Only used when :attr:`input` is 1D. :attr:`offsets` determines
the starting index position of each bag (sequence) in :attr:`input`.
max_norm (float, optional): If given, will renormalize the embedding vectors to have a norm less than
this before extracting. Note: this will modify :attr:`weight` in-place.
norm_type (float, optional): The ``p`` in the ``p``-norm to compute for the max_norm option. Default ``2``.
scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of the frequency of
the words in the mini-batch. Default ``False``.
Note: this option is not supported when ``mode="max"``.
mode (string, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag.
Default: ``"mean"``
sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` will be a sparse tensor. See Notes under
:class:`torch.nn.Embedding` for more details regarding sparse gradients.
Note: this option is not supported when ``mode="max"``.

Shape:

- :attr:`input` (LongTensor) and :attr:`offsets` (LongTensor, optional)

- If :attr:`input` is 2D of shape ``B x N``,

it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and
this will return ``B`` values aggregated in a way depending on the :attr:`mode`.
:attr:`offsets` is ignored and required to be ``None`` in this case.

- If :attr:`input` is 1D of shape ``N``,

it will be treated as a concatenation of multiple bags (sequences).
:attr:`offsets` is required to be a 1D tensor containing the
starting index positions of each bag in :attr:`input`. Therefore,
for :attr:`offsets` of shape ``B``, :attr:`input` will be viewed as
having ``B`` bags. Empty bags (i.e., bags of length 0) will have
their returned vectors filled with zeros.

- :attr:`weight` (Tensor): the learnable weights of the module of
shape ``(num_embeddings x embedding_dim)``

- :attr:`output`: aggregated embedding values of shape ``B x embedding_dim``

Examples::

>>> # an embedding matrix containing 10 tensors of size 3
>>> embedding_matrix = torch.rand(10, 3)
>>> # a batch of 2 samples of 4 indices each
>>> input = torch.tensor([1,2,4,5,4,3,2,9])
>>> offsets = torch.tensor([0,4])
>>> F.embedding_bag(input, embedding_matrix, offsets)
tensor([[ 0.3397, 0.3552, 0.5545],
[ 0.5893, 0.4386, 0.5882]])
"""
# Check for backward compatibility.
# Used to be embedding_bag(weight, input, ...)
# Now is embedding_bag(input, weight, ...)
if weight.dtype == torch.long and input.is_floating_point():
warnings.warn("Argument order of nn.functional.embedding_bag was changed. "
"Usage `embedding_bag(weight, input, ...)` is deprecated, "
"and should now be `embedding_bag(input, weight, ...)`.")
weight, input = input, weight

if input.dim() == 2:
if offsets is not None:
raise ValueError("if input is 2D, then offsets has to be None"
", as input is treated is a mini-batch of"
" fixed length sequences. However, found "
"offsets of type {}".format(type(offsets)))
else:
offsets = torch.arange(0, indices.numel(), indices.size(1),
dtype=torch.long, device=indices.device)
offsets = torch.arange(0, input.numel(), input.size(1),
dtype=torch.long, device=input.device)

indices = indices.view(-1)
elif indices.dim() == 1:
input = input.view(-1)
elif input.dim() == 1:
if offsets is None:
raise ValueError("offsets has to be a 1D Tensor but got None")
if offsets.dim() != 1:
raise ValueError("offsets has to be a 1D Tensor")
if offsets[0] != 0:
raise ValueError("offsets[0] has to be 0, i.e. the first sequence"
" in the mini-batch has to start from position 0."
"However, got {}".format(offsets[0]))
if offsets[-1] > indices.size(0):
raise ValueError("offsets[-1] has to be smaller than indices's length"
if offsets[0].item() != 0:
raise ValueError("offsets[0] has to be 0, i.e., the first sequence "
"in the mini-batch has to start from position 0. "
"However, got {}".format(offsets[0].item()))
if offsets[-1].item() > input.size(0):
raise ValueError("offsets[-1] can not be greater than input's length"
" ({}), but got offsets[-1] of {}"
.format(indices.size(0), offsets[-1]))
.format(input.size(0), offsets[-1].item()))
else:
raise ValueError("input has to be 1D or 2D Tensor,"
" but got Tensor of dimension {}".format(indices.dim()))
" but got Tensor of dimension {}".format(input.dim()))

if mode == 'sum':
mode = 0
@@ -1181,8 +1201,8 @@ def embedding_bag(embedding_matrix, indices, offsets=None,
torch.embedding_renorm_(weight, input, max_norm, norm_type)

ret, _, _, _ = torch.embedding_bag(
embedding_matrix,
indices,
weight,
input,
offsets,
scale_grad_by_freq,
mode,