Closed

Changes from all commits (22 commits)
e3544b9  adding a beta parameter to the smooth_l1 loss fn  (bdhirsh, Sep 9, 2020)
3b1566e  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 10, 2020)
9f2b3e5  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 10, 2020)
0b1ce81  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 10, 2020)
fb555c3  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 10, 2020)
75d4e70  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 11, 2020)
30a2a80  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 11, 2020)
2f96843  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 14, 2020)
9433550  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 14, 2020)
9df04b7  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 14, 2020)
8043711  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 14, 2020)
aa0227d  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 14, 2020)
dc531c7  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 15, 2020)
c593e98  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 15, 2020)
ba28ddd  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 16, 2020)
773b079  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 17, 2020)
8f107e2  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 17, 2020)
0c3ab5a  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 18, 2020)
5e8f8ed  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 18, 2020)
3261419  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 23, 2020)
b736fed  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 23, 2020)
fd6d110  Update on "adding a beta parameter to the smooth_l1 loss fn"  (bdhirsh, Sep 23, 2020)
2 changes: 1 addition & 1 deletion aten/src/ATen/autocast_mode.cpp
@@ -357,7 +357,7 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
KERNEL(ADD_NS(hinge_embedding_loss), "hinge_embedding_loss", Tensor (const Tensor &, const Tensor &, double, int64_t), fp32)
KERNEL(ADD_NS(kl_div), "kl_div", Tensor (const Tensor &, const Tensor &, int64_t, bool), fp32)
KERNEL(ADD_NS(l1_loss), "l1_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32)
KERNEL(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32)
KERNEL(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32)
KERNEL(ADD_NS(mse_loss), "mse_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32)
KERNEL(ADD_NS(margin_ranking_loss), "margin_ranking_loss", Tensor (const Tensor &, const Tensor &, const Tensor &, double, int64_t), fp32)
KERNEL(ADD_NS(multilabel_margin_loss), "multilabel_margin_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32)
3 changes: 2 additions & 1 deletion aten/src/ATen/native/BinaryOps.h
@@ -25,6 +25,7 @@ inline void sub_check(const Tensor& self, const Tensor& other) {
}

using binary_fn_alpha = void(*)(TensorIterator&, Scalar alpha);
using binary_fn_beta = void(*)(TensorIterator&, double beta);
using binary_fn = void(*)(TensorIterator&);
using binary_clamp_fn_alpha =
void(*)(TensorIterator&, Scalar alpha, Scalar min_val, Scalar max_val);
@@ -54,7 +55,7 @@ DECLARE_DISPATCH(binary_fn, max_elementwise_stub);
DECLARE_DISPATCH(binary_fn, min_elementwise_stub);
DECLARE_DISPATCH(binary_fn, maximum_stub);
DECLARE_DISPATCH(binary_fn, minimum_stub);
DECLARE_DISPATCH(binary_fn, smooth_l1_stub);
DECLARE_DISPATCH(binary_fn_beta, smooth_l1_stub);
DECLARE_DISPATCH(binary_fn, sigmoid_backward_stub);
DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub);
DECLARE_DISPATCH(binary_fn, tanh_backward_stub);
33 changes: 24 additions & 9 deletions aten/src/ATen/native/Loss.cpp
@@ -295,38 +295,53 @@ Tensor soft_margin_loss(
return output;
}

Tensor smooth_l1_loss(const Tensor& input, const Tensor& target, const int64_t reduction) {
Tensor smooth_l1_loss(const Tensor& input, const Tensor& target, const int64_t reduction, double beta) {
if (beta <= 0)
return at::native::l1_loss(input, target, reduction);
Tensor loss;
auto iter = TensorIterator::binary_op(loss, input, target);
smooth_l1_stub(iter.device_type(), iter);
smooth_l1_stub(iter.device_type(), iter, beta);
return apply_loss_reduction(iter.output(), reduction);
}
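
For reference, the elementwise loss the stub computes (a restatement of the kernel math below, with d = input - target and beta > 0):

    loss(d) = 0.5 * d^2 / beta,    if |d| < beta
              |d| - 0.5 * beta,    otherwise

This is also why the beta <= 0 case above falls back to plain L1 loss: the quadratic region vanishes as beta -> 0.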

Tensor& smooth_l1_loss_out(Tensor& result, const Tensor& input, const Tensor& target, int64_t reduction) {
Tensor& smooth_l1_loss_out(Tensor& result, const Tensor& input, const Tensor& target, int64_t reduction, double beta) {
if (beta <= 0)
return at::native::l1_loss_out(result, input, target, reduction);
if (reduction != Reduction::None) {
result = at::smooth_l1_loss(input, target, reduction);
Tensor loss;
auto iter = TensorIterator::binary_op(loss, input, target);
bdhirsh (Contributor, Author) commented:

@gchanan I updated smooth_l1_loss_out to work similarly to what MSE does:

  • make a TensorIterator that puts the smooth L1 loss output in a temporary tensor
  • use the existing <reduction>_out functions to put the output of the reduction in the out variable.

My educated guess as to why we need the temporary tensor: when the caller requests a reduction, result has a different shape than the tensor the actual kernel function produces.

smooth_l1_stub(iter.device_type(), iter, beta);
if (reduction == Reduction::Mean) {
at::mean_out(result, iter.output(), 0);
} else {
at::sum_out(result, iter.output(), 0);
}
} else {
auto iter = TensorIterator::binary_op(result, input, target);
smooth_l1_stub(iter.device_type(), iter);
smooth_l1_stub(iter.device_type(), iter, beta);
}
return result;
}

Tensor& smooth_l1_loss_backward_out(Tensor& grad_input, const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) {
Tensor& smooth_l1_loss_backward_out(Tensor& grad_input, const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction, double beta) {
if (beta <= 0)
return at::native::l1_loss_backward_out(grad_input, grad_output, input, target, reduction);
auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.;
auto iter = at::TensorIteratorConfig()
.add_output(grad_input)
.add_input(input)
.add_input(target)
.add_input(grad_output)
.build();
smooth_l1_backward_stub(iter.device_type(), iter, norm);
smooth_l1_backward_stub(iter.device_type(), iter, norm, beta);
return grad_input;
}

Tensor smooth_l1_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) {
Tensor smooth_l1_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction, double beta) {
if (beta <= 0)
return at::native::l1_loss_backward(grad_output, input, target, reduction);
auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
return at::smooth_l1_loss_backward_out(grad_input, grad_output, input, target, reduction);
return at::smooth_l1_loss_backward_out(grad_input, grad_output, input, target, reduction, beta);
}

Tensor mse_loss(const Tensor& input, const Tensor& target, int64_t reduction) {
3 changes: 2 additions & 1 deletion aten/src/ATen/native/PointwiseOps.h
@@ -11,10 +11,11 @@ struct TensorIterator;
namespace native {

using pointwise_fn = void (*)(TensorIterator&, Scalar scalar);
using pointwise_fn_beta = void (*)(TensorIterator&, Scalar scalar, double beta);
bdhirsh (Contributor, Author) commented:

Let me know if you think the name should be more general, or if I just shouldn't use an alias here. Right now smooth_l1_backward looks like the only pointwise op here that takes two scalar params.


DECLARE_DISPATCH(pointwise_fn, addcmul_stub);
DECLARE_DISPATCH(pointwise_fn, addcdiv_stub);
DECLARE_DISPATCH(pointwise_fn, smooth_l1_backward_stub);
DECLARE_DISPATCH(pointwise_fn_beta, smooth_l1_backward_stub);
DECLARE_DISPATCH(pointwise_fn, mse_backward_stub);

} // namespace native
17 changes: 9 additions & 8 deletions aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -502,24 +502,25 @@ void minimum_kernel(TensorIterator& iter) {
}
}

void smooth_l1_kernel(TensorIterator& iter) {
void smooth_l1_kernel(TensorIterator& iter, double beta) {
AT_DISPATCH_FLOATING_TYPES_AND2(
kBFloat16, kHalf, iter.dtype(), "smooth_l1_cpu", [&]() {
using Vec = Vec256<scalar_t>;
const Vec one_vec(static_cast<scalar_t>(1));
const scalar_t beta_val(beta);
const Vec beta_val_vec(beta_val);
const Vec point_five_vec(static_cast<scalar_t>(0.5));
cpu_kernel_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t {
[&beta_val](scalar_t a, scalar_t b) -> scalar_t {
auto z = std::abs(a - b);
return z < static_cast<scalar_t>(1)
? static_cast<scalar_t>(0.5) * z * z
: z - static_cast<scalar_t>(0.5);
return z < beta_val
? static_cast<scalar_t>(0.5) * z * z / beta_val
: z - static_cast<scalar_t>(0.5) * beta_val;
},
[&one_vec, &point_five_vec](Vec a, Vec b) {
[&beta_val_vec, &point_five_vec](Vec a, Vec b) {
auto z = (a - b).abs();
return Vec::blendv(
point_five_vec * z * z, z - point_five_vec, z >= one_vec);
point_five_vec * z * z / beta_val_vec, z - point_five_vec * beta_val_vec, z >= beta_val_vec);
});
});
}
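
A quick boundary check on the branches above (a sanity check, not new behavior): at z = beta the two cases agree, since 0.5 * beta^2 / beta = beta - 0.5 * beta = 0.5 * beta, so the loss is continuous whichever side of the comparison z lands on.
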
27 changes: 19 additions & 8 deletions aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp
@@ -46,28 +46,39 @@ static void addcdiv_cpu_kernel(TensorIterator& iter, Scalar value) {
});
}

static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, Scalar norm) {
static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, Scalar norm, double beta) {
ScalarType dtype = iter.dtype(0);
AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] {
auto norm_val = norm.to<scalar_t>();
scalar_t beta_val(beta);
auto norm_val_vec = Vec256<scalar_t>(norm_val);
auto beta_val_vec = Vec256<scalar_t>(beta_val);
const auto neg_1_vec = Vec256<scalar_t>(-1);
const auto zero_vec = Vec256<scalar_t>(0);
const auto pos_1_vec = Vec256<scalar_t>(1);
cpu_kernel_vec(iter,
[=](scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t {
const auto x = input - target;
if (x < -1.)
if (x <= -beta)
return -norm_val * grad_output;
else if (x > 1.)
else if (x >= beta)
return norm_val * grad_output;
else
return norm_val * x * grad_output;
return norm_val * x * grad_output / beta;
bdhirsh (Contributor, Author) commented:

The smooth L1 loss now divides by the beta parameter when |x| < beta, so the derivative does as well.

Contributor commented:

I might be wrong. If it divides by beta in the forward pass, should we multiply the grad by beta in the backward pass?

Contributor commented:

Chatted with @bdhirsh offline. I was wrong. Brian's current implementation is correct.

Contributor commented:

looks like if x == beta == 0, we could reach here, and then trigger division by 0?

Contributor commented:

I might be missing something, but this looks different from the math given in the doc:

0.5 (x_i - y_i)^2 / beta, & \text{if } |x_i - y_i| < beta

Should the three conditions be x <= -beta, x >= beta, and otherwise?

bdhirsh (Contributor, Author) commented:

Yep, you're right, thanks. And using x <= -beta and x >= beta should fix the division-by-0 issue.

},
[norm_val_vec, neg_1_vec, pos_1_vec](
[norm_val_vec, beta_val_vec, neg_1_vec, zero_vec, pos_1_vec](
Vec256<scalar_t> input, Vec256<scalar_t> target, Vec256<scalar_t> grad_output) -> Vec256<scalar_t> {
auto x = input - target;
x = clamp(x, neg_1_vec, pos_1_vec);
return norm_val_vec * x * grad_output;
// using two blendv calls to simulate the 3 cases
// 1 if x >= beta
// -1 if x <= -beta
// x / beta if |x| < beta
const auto x = input - target;
const auto pos_or_neg_1_vec = Vec256<scalar_t>::blendv(
neg_1_vec, pos_1_vec, x > zero_vec);
const auto x_abs = x.abs();
const auto output = Vec256<scalar_t>::blendv(
x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec);
return norm_val_vec * output * grad_output;
}
);
});
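
For the record, the gradient both the scalar and vectorized paths implement is the derivative of the piecewise loss, scaled by norm and grad_output (with x = input - target):

    d(loss)/dx = -1,         if x <= -beta
                 x / beta,   if |x| < beta
                 1,          if x >= beta

The division by beta in the middle branch matches the 0.5 * x^2 / beta forward term, and the branches agree at x = ±beta since ±beta / beta = ±1.
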
9 changes: 5 additions & 4 deletions aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu
@@ -19,11 +19,12 @@ void atan2_kernel_cuda(TensorIterator& iter) {
});
}

void smooth_l1_kernel_cuda(TensorIterator& iter) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "smooth_l1_cuda", [&]() {
gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
void smooth_l1_kernel_cuda(TensorIterator& iter, double beta) {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "smooth_l1_cuda", [&iter, beta]() {
scalar_t beta_val(beta);
gpu_kernel(iter, [beta_val] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t {
auto z = ::abs(a - b);
return z < scalar_t(1.) ? scalar_t(0.5) * z * z : z - scalar_t(0.5);
return z < beta_val ? scalar_t(0.5) * z * z / beta_val : z - scalar_t(0.5) * beta_val;
});
});
}
13 changes: 7 additions & 6 deletions aten/src/ATen/native/cuda/PointwiseOpsKernel.cu
@@ -26,17 +26,18 @@ void addcdiv_cuda_kernel(TensorIterator& iter, Scalar value) {
});
}

void smooth_l1_backward_cuda_kernel(TensorIterator& iter, Scalar norm) {
AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "smooth_l1_backward_cuda", [&]() {
void smooth_l1_backward_cuda_kernel(TensorIterator& iter, Scalar norm, double beta) {
AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "smooth_l1_backward_cuda", [&iter, &norm, beta] {
auto norm_val = norm.to<scalar_t>();
gpu_kernel(iter, [norm_val]GPU_LAMBDA(scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t {
scalar_t beta_val(beta);
gpu_kernel(iter, [norm_val, beta_val]GPU_LAMBDA(scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t {
const auto x = input - target;
if (x < scalar_t(-1))
if (x < -beta_val)
return -norm_val * grad_output;
else if (x > scalar_t(1))
else if (x > beta_val)
return norm_val * grad_output;
else
return norm_val * x * grad_output;
return norm_val * x * grad_output / beta_val;
});
});
}
8 changes: 4 additions & 4 deletions aten/src/ATen/native/native_functions.yaml
@@ -6545,23 +6545,23 @@
CPU: nll_loss2d_backward_cpu
CUDA: legacy::cuda::_thnn_nll_loss2d_backward

- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)
python_module: nn
dispatch:
CPU: smooth_l1_loss_out
CUDA: smooth_l1_loss_out

- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor
use_c10_dispatcher: full
python_module: nn

- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta=1.0, *, Tensor(a!) grad_input) -> Tensor(a!)
Contributor commented:

nit: you shouldn't need to provide a default parameter -- we don't expose these backward functions to Python; they should only be called via autograd. And you'll notice that reduction doesn't have a default parameter.

The parts I'm not sure about are whether TorchScript would let you serialize this function (it shouldn't) and/or whether the backwards-compatibility test would complain.

python_module: nn
dispatch:
CPU: smooth_l1_loss_backward_out
CUDA: smooth_l1_loss_backward_out

- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta=1.0) -> Tensor
use_c10_dispatcher: full
python_module: nn

12 changes: 12 additions & 0 deletions test/cpp/api/functional.cpp
@@ -246,6 +246,18 @@ TEST_F(FunctionalTest, SmoothL1LossDefaultOptions) {
ASSERT_TRUE(input.sizes() == input.grad().sizes());
}

TEST_F(FunctionalTest, SmoothL1LossBeta) {
auto input = torch::tensor({0.1, 1.5, 10.0}, torch::dtype(torch::kFloat).requires_grad(true));
auto target = torch::tensor({0., 1., 5.}, torch::kFloat);
auto output =
F::smooth_l1_loss(input, target, /*reduction=*/torch::kMean, /*beta=*/0.5);
auto expected = torch::tensor(1.67, torch::kFloat);
auto s = output.sum();
s.backward();
ASSERT_TRUE(output.allclose(expected));
ASSERT_TRUE(input.sizes() == input.grad().sizes());
}
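
The expected value here can be checked by hand: with beta = 0.5 the elementwise diffs are |input - target| = (0.1, 0.5, 5.0), so the per-element losses are (0.5 * 0.1^2 / 0.5, 0.5 - 0.25, 5.0 - 0.25) = (0.01, 0.25, 4.75), and their mean is 5.01 / 3 ≈ 1.67.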

TEST_F(FunctionalTest, SmoothL1LossNoReduction) {
auto input = torch::tensor({0.1, 1.2, 4.7}, torch::dtype(torch::kFloat).requires_grad(true));
auto target = torch::tensor({0., 1., 5.}, torch::kFloat);
14 changes: 7 additions & 7 deletions tools/autograd/derivatives.yaml
@@ -1221,9 +1221,9 @@
self: nll_loss2d_backward(grad, self, target, weight, reduction, ignore_index, total_weight)
target: non_differentiable

- name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
self: smooth_l1_loss_backward(grad, self, target, reduction)
target: smooth_l1_loss_backward(grad, target, self, reduction)
- name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor
self: smooth_l1_loss_backward(grad, self, target, reduction, beta)
target: smooth_l1_loss_backward(grad, target, self, reduction, beta)

- name: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
self: soft_margin_loss_backward(grad, self, target, reduction)
@@ -1589,10 +1589,10 @@
grad_output: replication_pad3d(grad, padding)
self: zeros_like(self)

- name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction)
self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction)
target: -smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction)
- name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta=1.0) -> Tensor
grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction, beta)
self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta)
target: -smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta)

- name: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor
grad_output: softplus_backward(grad, self, beta, threshold, output)
14 changes: 8 additions & 6 deletions torch/csrc/api/include/torch/nn/functional/loss.h
@@ -307,25 +307,26 @@ inline Tensor cosine_embedding_loss(

// ============================================================================

inline Tensor _smooth_l1_loss(const Tensor& input, const Tensor& target) {
inline Tensor _smooth_l1_loss(const Tensor& input, const Tensor& target, double beta = 1.) {
auto t = torch::abs(input - target);
return torch::where(t < 1, 0.5 * torch::pow(t, 2), t - 0.5);
return torch::where(t < beta, 0.5 * torch::pow(t, 2) / beta, t - 0.5 * beta);
bdhirsh (Contributor, Author) commented:

It's not 100% clear to me whether I need to change this, since it seems independent of the kernel code.

}

#ifndef DOXYGEN_SHOULD_SKIP_THIS
namespace detail {
inline Tensor smooth_l1_loss(
const Tensor& input,
const Tensor& target,
SmoothL1LossFuncOptions::reduction_t reduction) {
SmoothL1LossFuncOptions::reduction_t reduction,
double beta = 1.) {
if (target.sizes() != input.sizes()) {
TORCH_WARN("Using a target size (", target.sizes(), ") that is different to the input size (", input.sizes(), "). ",
"This will likely lead to incorrect results due to broadcasting. ",
"Please ensure they have the same size.");
}

std::vector<Tensor> expanded_tensors = torch::broadcast_tensors({input, target});
return torch::smooth_l1_loss(expanded_tensors[0], expanded_tensors[1], enumtype::reduction_get_enum(reduction));
return torch::smooth_l1_loss(expanded_tensors[0], expanded_tensors[1], enumtype::reduction_get_enum(reduction), beta);
}
} // namespace detail
#endif /* DOXYGEN_SHOULD_SKIP_THIS */
@@ -344,8 +345,9 @@ inline Tensor smooth_l1_loss(
inline Tensor smooth_l1_loss(
const Tensor& input,
const Tensor& target,
const SmoothL1LossFuncOptions& options = {}) {
return detail::smooth_l1_loss(input, target, options.reduction());
const SmoothL1LossFuncOptions& options = {},
double beta = 1.) {
return detail::smooth_l1_loss(input, target, options.reduction(), beta);
}
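
A minimal usage sketch of the new overload, mirroring the SmoothL1LossBeta test added in functional.cpp (the printed value is the hand-checked ~1.67 from that test):

#include <torch/torch.h>
#include <iostream>

namespace F = torch::nn::functional;

int main() {
  // diffs of 0.1, 0.5, and 5.0 exercise both the quadratic and
  // the linear branch when beta = 0.5
  auto input = torch::tensor({0.1, 1.5, 10.0},
                             torch::dtype(torch::kFloat).requires_grad(true));
  auto target = torch::tensor({0., 1., 5.}, torch::kFloat);

  auto loss = F::smooth_l1_loss(input, target,
                                /*reduction=*/torch::kMean, /*beta=*/0.5);
  loss.backward();
  std::cout << loss.item<float>() << std::endl;  // ~1.67
  return 0;
}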

// ============================================================================
14 changes: 9 additions & 5 deletions torch/csrc/autograd/FunctionsManual.cpp
@@ -952,20 +952,24 @@ Tensor l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & i
return output;
}

Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction) {
Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction, double beta) {
// special case to protect against a divide-by-zero.
if (beta == 0) {
return at::zeros(grad.sizes(), grad.options());
}
auto d = (input - target).abs();
auto grad_input = grad * (d < 1).type_as(grad);
auto grad_input = grad * (d < beta).type_as(grad) / beta;
Contributor commented:

can beta be 0 here?

bdhirsh (Contributor, Author) commented:

Yep, good catch. I'll try rewriting it in a way that prevents divide-by-zero.

if (reduction == at::Reduction::Mean) {
grad_input /= input.numel();
}
return grad_input;
}
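
Reading off the code (a sanity check, not a change): this implements the second derivative of smooth L1,

    d^2(loss)/dx^2 = 1 / beta,   if |input - target| < beta
                     0,          otherwise

which is exactly grad * (d < beta).type_as(grad) / beta, and is identically zero when beta == 0, hence the early return above.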

Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction) {
Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction, double beta) {
if (reduction == at::Reduction::None) {
return smooth_l1_loss_backward(grad, input, target, reduction);
return smooth_l1_loss_backward(grad, input, target, reduction, beta);
}
auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction);
auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction, beta);
return (r * grad).sum();
}

4 changes: 2 additions & 2 deletions torch/csrc/autograd/FunctionsManual.h
@@ -104,8 +104,8 @@ at::Tensor log_softmax_double_backward(const at::Tensor & grad, const at::Tensor
at::Tensor binary_cross_entropy_double_backward(const at::Tensor & grad_output, const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, const c10::optional<at::Tensor>& weight, int64_t reduction);
at::Tensor binary_cross_entropy_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, const c10::optional<at::Tensor>& weight, int64_t reduction);
at::Tensor l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction);
at::Tensor smooth_l1_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction);
at::Tensor smooth_l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction);
at::Tensor smooth_l1_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction, double beta);
at::Tensor smooth_l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction, double beta);
at::Tensor mse_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, int64_t reduction);
at::Tensor mse_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction);
at::Tensor soft_margin_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction);