Merged
18 changes: 18 additions & 0 deletions docs/source/nn.rst
@@ -996,6 +996,11 @@ Non-linear activation functions

.. autofunction:: softshrink

:hidden:`gumbel_softmax`
~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: gumbel_softmax

:hidden:`log_softmax`
~~~~~~~~~~~~~~~~~~~~~

@@ -1070,6 +1075,19 @@ Dropout functions

.. autofunction:: dropout3d

Sparse functions
----------------------------------

:hidden:`embedding`
~~~~~~~~~~~~~~~~~~~

.. autofunction:: embedding

:hidden:`embedding_bag`
~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: embedding_bag

Distance functions
----------------------------------

19 changes: 10 additions & 9 deletions test/test_nn.py
@@ -1626,9 +1626,9 @@ def _test_EmbeddingBag(self, cuda, mode, sparse, dtype=torch.double):
self.assertEqual(es_weight_grad, expected_grad_weight, dtype2prec[dtype])

# now compare EmbeddingBag vs Embedding + Sum/Mean, for constant bag length
def _test_vs_Embedding(N, D, B, L):
es = nn.EmbeddingBag(N, D, mode=mode, sparse=sparse).to(device, dtype)
e = nn.Embedding(N, D).to(device, dtype)
def _test_vs_Embedding(N, D, B, L, max_norm=None):
es = nn.EmbeddingBag(N, D, mode=mode, sparse=sparse, max_norm=max_norm).to(device, dtype)
e = nn.Embedding(N, D, max_norm=max_norm).to(device, dtype)
e.weight.data.copy_(es.weight.data)
input = torch.randint(N, (B, L), device=device, dtype=torch.long)
offsets = torch.arange(0, B, device=device, dtype=torch.long).mul_(L)
@@ -1656,8 +1656,9 @@ def _test_vs_Embedding(N, D, B, L):

N, D, B, L = random.randint(1, 100), random.randint(1, 100), random.randint(1, 50), random.randint(1, 50)
_test_vs_Embedding(N, D, B, L)
for p in itertools.product([1, 2], repeat=4):
_test_vs_Embedding(*p)
for max_norm in (None, 3):
for p in itertools.product([1, 2], repeat=4):
_test_vs_Embedding(*p, max_norm=max_norm)

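# Editor's aside -- an illustrative sketch, not part of the test file, of what the
# new max_norm argument exercises: looked-up rows of the embedding weight are
# renormalized in place so that their norm does not exceed max_norm.
import torch
import torch.nn as nn

emb = nn.Embedding(10, 3, max_norm=1.0)
idx = torch.tensor([1, 2, 4, 5])
out = emb(idx)                                 # rows 1, 2, 4, 5 are clipped to norm <= 1.0
assert (out.norm(dim=1) <= 1.0 + 1e-6).all()
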
# check that giving illegal input combos raises error
es = nn.EmbeddingBag(10, 20, mode=mode, sparse=sparse)
@@ -6758,27 +6759,27 @@ def multimarginloss_weights_no_reduce_test():
dict(
module_name='Embedding',
constructor_args=(4, 3),
input_fn=lambda: Variable(torch.randperm(2).repeat(1, 2)),
input_fn=lambda: torch.randperm(2).repeat(1, 2),
jacobian_input=False,
check_gradgrad=False,
),
dict(
module_name='EmbeddingBag',
constructor_args=(4, 3),
input_fn=lambda: Variable(torch.randperm(2).repeat(1, 2)),
input_fn=lambda: torch.randperm(2).repeat(1, 2),
jacobian_input=False,
check_gradgrad=False,
),
dict(
fullname='EmbeddingBag_sparse',
constructor=lambda: nn.EmbeddingBag(4, 3, sparse=True),
input_fn=lambda: Variable(torch.randperm(2).repeat(1, 2)),
input_fn=lambda: torch.randperm(2).repeat(1, 2),
jacobian_input=False,
check_gradgrad=False,
),
dict(
constructor=lambda: nn.Embedding(4, 3, sparse=True),
input_fn=lambda: Variable(torch.randperm(2).repeat(1, 2)),
input_fn=lambda: torch.randperm(2).repeat(1, 2),
jacobian_input=False,
fullname='Embedding_sparse',
check_gradgrad=False,
208 changes: 114 additions & 94 deletions torch/nn/functional.py
@@ -891,21 +891,25 @@ def _gumbel_softmax_sample(logits, tau=1, eps=1e-10):


def gumbel_softmax(logits, tau=1, hard=False, eps=1e-10):
"""
r"""
Sample from the Gumbel-Softmax distribution and optionally discretize.

Args:
logits: `[batch_size, n_class]` unnormalized log-probs
logits: `[batch_size, num_features]` unnormalized log probabilities
tau: non-negative scalar temperature
hard: if ``True``, take `argmax`, but differentiate w.r.t. soft sample y
hard: if ``True``, the returned samples will be discretized as one-hot vectors,
but will be differentiated as if they were the soft samples in autograd

Returns:
[batch_size, n_class] sample from the Gumbel-Softmax distribution.
If hard=True, then the returned sample will be one-hot, otherwise it will
be a probability distribution that sums to 1 across classes
Sampled tensor of shape ``batch_size x num_features`` from the Gumbel-Softmax distribution.
If ``hard=True``, the returned samples will be one-hot, otherwise they will
be probability distributions that sum to 1 across features

Constraints:
- this implementation only works on batch_size x num_features tensor for now

based on
- Currently only works on 2D input :attr:`logits` tensor of shape ``batch_size x num_features``

Based on
https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb ,
(MIT license)
"""
@@ -983,6 +987,7 @@ def linear(input, weight, bias=None):
Applies a linear transformation to the incoming data: :math:`y = xA^T + b`.

Shape:

- Input: :math:`(N, *, in\_features)` where `*` means any number of
additional dimensions
- Weight: :math:`(out\_features, in\_features)`
@@ -1011,29 +1016,28 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2,
The input to the module is a list of indices and the embedding matrix,
and the output is the corresponding word embeddings.

See :class:`torch.nn.Embedding` for more details.

Args:
input: tensor, containing indices into the embedding matrix
weight:
input (LongTensor): Tensor containing indices into the embedding matrix
weight (Tensor): The embedding matrix
Number of rows should correspond to the maximum possible index + 1,
number of columns is the embedding size
padding_idx (int, optional): Entries at the given index do not contribute to the gradient
max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this
norm_type (float, optional): The p of the p-norm to compute for the max_norm option
scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of
the words in the mini-batch.
sparse (boolean, optional): if ``True``, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for
more details regarding sparse gradients.
padding_idx (int, optional): If given, pads the output with the embedding vector at :attr:`padding_idx`
(initialized to zeros) whenever it encounters the index.
max_norm (float, optional): If given, will renormalize the embedding vectors to have a norm less than
this before extracting. Note: this will modify :attr:`weight` in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default ``2``.
scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of the frequency of
the words in the mini-batch. Default ``False``.
sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` will be a sparse tensor. See Notes under
:class:`torch.nn.Embedding` for more details regarding sparse gradients.

Shape:
- Input: LongTensor `(N, W)`, N = mini-batch, W = number of indices to extract per mini-batch
- Embedding_matrix: FloatTensor `(V, embedding_dim)`, V = maximum index + 1, embedding_dim = embedding size
- Output: `(N, W, embedding_dim)`

Notes:
It is advised to only use `sparse=True` if `embedding_matrix` is a leaf Tensor,
since some autograd functions may not propagate sparse gradients correctly.
Additionally, keep in mind that only a limited number of optimizers support
sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`), and :class:`optim.Adagrad` (`CPU`)
- Input: LongTensor of arbitrary shape containing the indices to extract
- Weight: Embedding matrix of floating point type with shape `(V, embedding_dim)`,
where V = maximum index + 1 and embedding_dim = the embedding size
- Output: `(*, embedding_dim)`, where `*` is the input shape

Examples::

@@ -1078,87 +1082,103 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2,
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

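# Editor's aside -- a brief sketch, not part of the diff, of the functional form
# documented above, including the in-place renorm triggered by max_norm.
import torch
import torch.nn.functional as F

weight = torch.rand(10, 3)                           # V = 10 rows, embedding_dim = 3
idx = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 9]])     # indices of arbitrary shape
out = F.embedding(idx, weight)                       # shape (2, 4, 3)

F.embedding(idx, weight, max_norm=1.0)               # renormalizes the looked-up rows of `weight` in place
assert weight[idx].norm(dim=2).max() <= 1.0 + 1e-6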

def embedding_bag(embedding_matrix, indices, offsets=None,
max_norm=None, norm_type=2, scale_grad_by_freq=False, mode='mean', sparse=False):
def embedding_bag(input, weight, offsets=None, max_norm=None, norm_type=2,
scale_grad_by_freq=False, mode='mean', sparse=False):
r"""Computes sums or means of 'bags' of embeddings, without instantiating the
intermediate embeddings.

For bags of constant length,
* :func:`embedding_bag` with `mode=sum` is equivalent to :func:`nn.functional.embedding` followed by
``torch.sum(dim=1)``
* with `mode=mean` is equivalent to :func:`nn.functional.embedding` followed by ``torch.mean(dim=1)``
* with `mode=max` is equivalent to :func:`nn.functional.embedding` followed by ``torch.max(dim=1)``

However, :func:`embedding_bag` is much more time and memory efficient than using a chain of these
operations.

Args:
embedding_matrix: FloatTensor, where number of rows should correspond to the maximum possible index + 1,
number of columns is the embedding size
indices (N or BxN): LongTensor containing the indices of the embeddings to extract.
When `input` is 1D Tensor of shape `N`, an `offsets` Tensor is given, that contains the
starting position of each new sequence in the mini-batch.
offsets (B or None): LongTensor containing the starting positions of each sample in a mini-batch of variable
length sequences. If `input` is 2D (BxN), then offsets does not need to be given,
as the `input` is treated as a mini-batch of fixed length sequences of length `N` each.
max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this
norm_type (float, optional): The p of the p-norm to compute for the max_norm option
scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of
the words in the dictionary.
mode (string, optional): 'sum' | 'mean' | 'max'. Specifies the way to reduce the bag. Default: 'mean'
sparse (boolean, optional): if ``True``, gradient w.r.t. weight matrix will be a sparse tensor. See Notes
for more details regarding sparse gradients.

Shape:
- Embedding_matrix: FloatTensor `(V, embedding_dim)`,
V = number of embeddings, embedding_dim = embedding size
- Input: LongTensor `N`, N = number of embeddings to extract
(or) LongTensor `BxN`, B = number of sequences in mini-batch,
N = number of embeddings per sequence
- Offsets: LongTensor `B`, B = number of bags. The values are the
offsets in `input` for each bag, i.e. the cumsum of lengths.
Offsets is not given if Input is 2D `BxN` Tensor,
the input is considered to be of fixed-length sequences
- Output: `(B, embedding_dim)`

Examples::

>>> # an Embedding module containing 10 tensors of size 3
>>> embedding_matrix = torch.rand(10, 3)
>>> # a batch of 2 samples of 4 indices each
>>> input = torch.tensor([1,2,4,5,4,3,2,9])
>>> offsets = torch.tensor([0,4])
>>> F.embedding_bag(embedding_matrix, input, offsets)
tensor([[ 0.3397, 0.3552, 0.5545],
[ 0.5893, 0.4386, 0.5882]])
"""
if indices.dim() == 2:
intermediate embeddings.

See :class:`torch.nn.EmbeddingBag` for more details.

Args:
input (LongTensor): Tensor containing bags of indices into the embedding matrix
weight (Tensor): The embedding matrix
Number of rows should correspond to the maximum possible index + 1,
number of columns is the embedding size
offsets (LongTensor, optional): Only used when :attr:`input` is 1D. :attr:`offsets` determines
the starting index position of each bag (sequence) in :attr:`input`.
max_norm (float, optional): If given, will renormalize the embedding vectors to have a norm less than
this before extracting. Note: this will modify :attr:`weight` in-place.
norm_type (float, optional): The ``p`` in the ``p``-norm to compute for the max_norm option. Default ``2``.
scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of the frequency of
the words in the mini-batch. Default ``False``.
Note: this option is not supported when ``mode="max"``.
mode (string, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag.
Default: ``"mean"``
sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` will be a sparse tensor. See Notes under
:class:`torch.nn.Embedding` for more details regarding sparse gradients.
Note: this option is not supported when ``mode="max"``.

Shape:

- :attr:`input` (LongTensor) and :attr:`offsets` (LongTensor, optional)

- If :attr:`input` is 2D of shape ``B x N``,

it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and
this will return ``B`` values aggregated in a way depending on the :attr:`mode`.
:attr:`offsets` is ignored and required to be ``None`` in this case.

- If :attr:`input` is 1D of shape ``N``,

it will be treated as a concatenation of multiple bags (sequences).
:attr:`offsets` is required to be a 1D tensor containing the
starting index positions of each bag in :attr:`input`. Therefore,
for :attr:`offsets` of shape ``B``, :attr:`input` will be viewed as
having ``B`` bags. Empty bags (i.e., bags of length 0) will have
their returned vectors filled with zeros.

- :attr:`weight` (Tensor): the learnable weights of the module of
shape ``(num_embeddings x embedding_dim)``

- :attr:`output`: aggregated embedding values of shape ``B x embedding_dim``

Examples::

>>> # an embedding matrix containing 10 tensors of size 3
>>> embedding_matrix = torch.rand(10, 3)
>>> # a batch of 2 samples of 4 indices each
>>> input = torch.tensor([1,2,4,5,4,3,2,9])
>>> offsets = torch.tensor([0,4])
>>> F.embedding_bag(input, embedding_matrix, offsets)
tensor([[ 0.3397, 0.3552, 0.5545],
[ 0.5893, 0.4386, 0.5882]])
"""
# Check for backward compatibility.
# Used to be embedding_bag(weight, input, ...)
# Now is embedding_bag(input, weight, ...)
if weight.dtype == torch.long and input.is_floating_point():
warnings.warn("Argument order of nn.functional.embedding_bag was changed. "
"Usage `embedding_bag(weight, input, ...)` is deprecated, "
"and should now be `embedding_bag(input, weight, ...)`.")
weight, input = input, weight

if input.dim() == 2:
if offsets is not None:
raise ValueError("if input is 2D, then offsets has to be None"
", as input is treated is a mini-batch of"
" fixed length sequences. However, found "
"offsets of type {}".format(type(offsets)))
else:
offsets = torch.arange(0, indices.numel(), indices.size(1),
dtype=torch.long, device=indices.device)
offsets = torch.arange(0, input.numel(), input.size(1),
dtype=torch.long, device=input.device)

indices = indices.view(-1)
elif indices.dim() == 1:
input = input.view(-1)
elif input.dim() == 1:
if offsets is None:
raise ValueError("offsets has to be a 1D Tensor but got None")
if offsets.dim() != 1:
raise ValueError("offsets has to be a 1D Tensor")
if offsets[0] != 0:
raise ValueError("offsets[0] has to be 0, i.e. the first sequence"
" in the mini-batch has to start from position 0."
"However, got {}".format(offsets[0]))
if offsets[-1] > indices.size(0):
raise ValueError("offsets[-1] has to be smaller than indices's length"
if offsets[0].item() != 0:
raise ValueError("offsets[0] has to be 0, i.e., the first sequence "
"in the mini-batch has to start from position 0. "
"However, got {}".format(offsets[0].item()))
if offsets[-1].item() > input.size(0):
raise ValueError("offsets[-1] can not be greater than input's length"
" ({}), but got offsets[-1] of {}"
.format(indices.size(0), offsets[-1]))
.format(input.size(0), offsets[-1].item()))
else:
raise ValueError("input has to be 1D or 2D Tensor,"
" but got Tensor of dimension {}".format(indices.dim()))
" but got Tensor of dimension {}".format(input.dim()))

if mode == 'sum':
mode = 0
@@ -1181,8 +1201,8 @@ def embedding_bag(embedding_matrix, indices, offsets=None,
torch.embedding_renorm_(weight, input, max_norm, norm_type)

ret, _, _, _ = torch.embedding_bag(
embedding_matrix,
indices,
weight,
input,
offsets,
scale_grad_by_freq,
mode,