Commit 3b5daef

VitalyFedyunin authored and facebook-github-bot committed
Move addcmul to Aten (#22874)
Summary: Move the CPU implementation of the `addcmul` operator to Aten (#22797).

### before

```python
In [11]: timeit x.addcmul(a, b)
1.31 ms ± 18.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
```

### after

```python
In [9]: timeit x.addcmul(a, b)
588 µs ± 22.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
```

Adding custom code for the case when `value == 1` does not provide a significant performance gain.

Pull Request resolved: #22874

Differential Revision: D16359348

Pulled By: VitalyFedyunin

fbshipit-source-id: 941ead835672fca78a1fcc762da052e64308b111
1 parent dded794 commit 3b5daef
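For context, `addcmul` computes `out = self + value * tensor1 * tensor2` elementwise, which is the formula the new CPU kernel below implements. A minimal sketch of the kind of measurement quoted in the summary; the tensor shapes are an assumption, since the summary does not say which sizes were benchmarked:

```python
import torch
from timeit import timeit

# Assumed shapes -- the summary does not state the sizes used for the timings above.
x = torch.randn(1000, 1000)
a = torch.randn(1000, 1000)
b = torch.randn(1000, 1000)

# addcmul: out = self + value * tensor1 * tensor2, elementwise.
out = x.addcmul(a, b, value=0.5)
assert torch.allclose(out, x + 0.5 * a * b)

# Rough stand-in for the IPython `timeit x.addcmul(a, b)` calls quoted in the summary.
print(timeit(lambda: x.addcmul(a, b), number=100) / 100, "seconds per call")
```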

File tree: 9 files changed, +126 -36 lines

aten/src/ATen/Declarations.cwrap

Lines changed: 4 additions & 0 deletions
@@ -2386,6 +2386,8 @@
   cname: addcmul
   variants:
     - function
+  backends:
+    - CUDA
   return: argument 0
   arguments:
     - arg: THTensor* result
@@ -2403,6 +2405,8 @@
   options:
     - cname: addcmul
       variants: function
+      backends:
+        - CUDA
       return: argument 0
       arguments:
         - THTensor* self

aten/src/ATen/core/Tensor.h

Lines changed: 1 addition & 1 deletion
@@ -696,7 +696,6 @@ class CAFFE2_API Tensor {
   Tensor & remainder_(const Tensor & other);
   Tensor & addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1);
   Tensor addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const;
-  Tensor & addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
   Tensor & addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
   Tensor & random_(int64_t from, int64_t to, Generator * generator=nullptr);
   Tensor & random_(int64_t to, Generator * generator=nullptr);
@@ -731,6 +730,7 @@ class CAFFE2_API Tensor {
   std::vector<Tensor> nonzero_numpy() const;
   Tensor gather(int64_t dim, const Tensor & index, bool sparse_grad=false) const;
   Tensor addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const;
+  Tensor & addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1);
   Tensor addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const;
   std::tuple<Tensor,Tensor> lstsq(const Tensor & A) const;
   std::tuple<Tensor,Tensor> triangular_solve(const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const;

aten/src/ATen/core/TensorMethods.h

Lines changed: 4 additions & 4 deletions
@@ -1405,10 +1405,6 @@ inline Tensor Tensor::addbmm(const Tensor & batch1, const Tensor & batch2, Scala
     static auto table = globalATenDispatch().getOpTable("aten::addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor");
     return table->getOp<Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar, Scalar)>(tensorTypeIdToBackend(type_id()), is_variable())(*this, batch1, batch2, beta, alpha);
 }
-inline Tensor & Tensor::addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) {
-    static auto table = globalATenDispatch().getOpTable("aten::addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)");
-    return table->getOp<Tensor & (Tensor &, const Tensor &, const Tensor &, Scalar)>(tensorTypeIdToBackend(type_id()), is_variable())(*this, tensor1, tensor2, value);
-}
 inline Tensor & Tensor::addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) {
     static auto table = globalATenDispatch().getOpTable("aten::addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)");
     return table->getOp<Tensor & (Tensor &, const Tensor &, const Tensor &, Scalar)>(tensorTypeIdToBackend(type_id()), is_variable())(*this, tensor1, tensor2, value);
@@ -1545,6 +1541,10 @@ inline Tensor Tensor::addcmul(const Tensor & tensor1, const Tensor & tensor2, Sc
     static auto table = globalATenDispatch().getOpTable("aten::addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor");
     return table->getOp<Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar)>(tensorTypeIdToBackend(type_id()), is_variable())(*this, tensor1, tensor2, value);
 }
+inline Tensor & Tensor::addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) {
+    static auto table = globalATenDispatch().getOpTable("aten::addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)");
+    return table->getOp<Tensor & (Tensor &, const Tensor &, const Tensor &, Scalar)>(tensorTypeIdToBackend(type_id()), is_variable())(*this, tensor1, tensor2, value);
+}
 inline Tensor Tensor::addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const {
     static auto table = globalATenDispatch().getOpTable("aten::addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor");
     return table->getOp<Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar)>(tensorTypeIdToBackend(type_id()), is_variable())(*this, tensor1, tensor2, value);

aten/src/ATen/native/PointwiseOps.cpp (new file)

Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+// Ternary and higher-order pointwise operations
+#include <ATen/native/PointwiseOps.h>
+
+#include <ATen/ATen.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/MemoryOverlap.h>
+#include <ATen/native/TensorIterator.h>
+
+#ifdef BUILD_NAMEDTENSOR
+#include <ATen/NamedTensorUtils.h>
+#endif
+
+namespace at {
+namespace native {
+
+Tensor addcmul_cpu(
+    const Tensor& self,
+    const Tensor& tensor1,
+    const Tensor& tensor2,
+    Scalar value) {
+  Tensor result = at::empty({0}, self.options());
+  return at::addcmul_out(result, self, tensor1, tensor2, value);
+}
+
+Tensor& addcmul_cpu_(
+    Tensor& self,
+    const Tensor& tensor1,
+    const Tensor& tensor2,
+    Scalar value) {
+  return at::addcmul_out(self, self, tensor1, tensor2, value);
+}
+
+Tensor& addcmul_cpu_out(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& tensor1,
+    const Tensor& tensor2,
+    Scalar value) {
+  checkBackend("addcmul_cpu", result, self.type().backend());
+  auto iter = at::TensorIterator();
+  iter.check_and_add_output(result);
+  iter.add_input(self);
+  iter.add_input(tensor1);
+  iter.add_input(tensor2);
+  iter.build();
+  addcmul_stub(kCPU, iter, value);
+#ifdef BUILD_NAMEDTENSOR
+  at::namedinference::propagate_names(result, self);
+#endif
+  return result;
+}
+
+DEFINE_DISPATCH(addcmul_stub);
+
+} // namespace native
+} // namespace at
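The three entry points above back the functional, in-place, and `out=` call forms, with the first two funneling into `addcmul_out` via a single TensorIterator path. A quick check that the three user-facing forms agree (shapes here are arbitrary, chosen only for illustration):

```python
import torch

x, a, b = torch.randn(3, 4), torch.randn(3, 4), torch.randn(3, 4)

# Functional variant (dispatched to addcmul_cpu for CPU tensors, per the yaml below).
y1 = torch.addcmul(x, a, b, value=2.0)

# out= variant (dispatched to addcmul_cpu_out).
y2 = torch.empty_like(x)
torch.addcmul(x, a, b, value=2.0, out=y2)

# In-place variant (dispatched to addcmul_cpu_, which reuses the out= path with result == self).
y3 = x.clone()
y3.addcmul_(a, b, value=2.0)

assert torch.equal(y1, y2) and torch.equal(y1, y3)
```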

aten/src/ATen/native/PointwiseOps.h (new file)

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+// Ternary and higher-order pointwise operations
+#pragma once
+
+#include <ATen/ATen.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+
+struct TensorIterator;
+
+namespace native {
+
+using addcmul_fn = void (*)(TensorIterator&, Scalar scalar);
+
+DECLARE_DISPATCH(addcmul_fn, addcmul_stub);
+} // namespace native
+} // namespace at

aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp (new file)

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+// Ternary and higher-order pointwise operations
+#include <ATen/ATen.h>
+
+#include <ATen/Dispatch.h>
+#include <ATen/native/PointwiseOps.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cpu/Loops.h>
+
+namespace at {
+namespace native {
+namespace {
+
+static void addcmul_cpu_kernel(TensorIterator& iter, Scalar value) {
+  ScalarType dtype = iter.dtype(0);
+  AT_DISPATCH_ALL_TYPES(dtype, "addcmul_cpu_out", [&] {
+    scalar_t scalar_val = value.to<scalar_t>();
+    auto scalar_vec = Vec256<scalar_t>(scalar_val);
+    cpu_kernel_vec(
+        iter,
+        [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t {
+          return self_val + scalar_val * t1_val * t2_val;
+        },
+        [=](Vec256<scalar_t> self_vec,
+            Vec256<scalar_t> t1_vec,
+            Vec256<scalar_t> t2_vec) {
+          return self_vec + scalar_vec * t1_vec * t2_vec;
+        });
+  });
+}
+
+} // anonymous namespace
+
+REGISTER_DISPATCH(addcmul_stub, &addcmul_cpu_kernel);
+
+} // namespace native
+} // namespace at
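The kernel passes `cpu_kernel_vec` both a scalar lambda and a `Vec256` lambda, each computing `self + value * tensor1 * tensor2`. A small spot-check of that formula from Python; the dtype list below is illustrative only and narrower than what `AT_DISPATCH_ALL_TYPES` covers:

```python
import torch

# Verify the kernel's formula, self + value * tensor1 * tensor2, on a few dtypes.
for dtype in (torch.float32, torch.float64, torch.int64):
    x = (torch.arange(12) % 5).to(dtype).reshape(3, 4)
    a = (torch.arange(12) % 3).to(dtype).reshape(3, 4)
    b = (torch.arange(12) % 7).to(dtype).reshape(3, 4)
    got = torch.addcmul(x, a, b, value=2)
    want = x + 2 * a * b
    assert torch.equal(got, want), dtype
```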

aten/src/ATen/native/native_functions.yaml

Lines changed: 8 additions & 8 deletions
@@ -3436,12 +3436,6 @@
     CPU: legacy::cpu::_th_addbmm
     CUDA: legacy::cuda::_th_addbmm

-- func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
-  variants: method
-  dispatch:
-    CPU: legacy::cpu::_th_addcmul_
-    CUDA: legacy::cuda::_th_addcmul_
-
 - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
   variants: method
   dispatch:
@@ -3755,15 +3749,21 @@

 - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
   dispatch:
-    CPU: legacy::cpu::_th_addcmul_out
+    CPU: addcmul_cpu_out
     CUDA: legacy::cuda::_th_addcmul_out

 - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
   variants: method, function
   dispatch:
-    CPU: legacy::cpu::_th_addcmul
+    CPU: addcmul_cpu
     CUDA: legacy::cuda::_th_addcmul

+- func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
+  variants: method
+  dispatch:
+    CPU: addcmul_cpu_
+    CUDA: legacy::cuda::_th_addcmul_
+
 - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU: legacy::cpu::_th_addcdiv_out

aten/src/TH/generic/THTensorMath.cpp

Lines changed: 0 additions & 22 deletions
@@ -575,28 +575,6 @@ void THTensor_(tpow)(THTensor *r_, scalar_t value, THTensor *t)
   }
 }

-void THTensor_(addcmul)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src1, THTensor *src2)
-{
-  if(r_ != t)
-  {
-    THTensor_(resizeAs)(r_, t);
-    at::Tensor r__wrap = THTensor_wrap(r_);
-    at::Tensor t_wrap = THTensor_wrap(t);
-    at::native::copy_(r__wrap, t_wrap);
-  }
-  int64_t r_Size = THTensor_(nElement)(r_);
-  int64_t src1Size = THTensor_(nElement)(src1);
-  int64_t src2Size = THTensor_(nElement)(src2);
-  int r_Contig = THTensor_(isContiguous)(r_);
-  int src1Contig = THTensor_(isContiguous)(src1);
-  int src2Contig = THTensor_(isContiguous)(src2);
-  if( (src1Size == src2Size) && (src1Size == r_Size) ){
-    TH_TENSOR_APPLY3_PARALLEL(r_Size, r_Contig, src1Contig, src2Contig, scalar_t, r_, scalar_t, src1, scalar_t, src2, *r__data += value * *src1_data * *src2_data;, UNCERTAIN_TH_OMP_OVERHEAD_THRESHOLD);
-  } else {
-    TH_TENSOR_APPLY3(scalar_t, r_, scalar_t, src1, scalar_t, src2, *r__data += value * *src1_data * *src2_data;);
-  }
-}
-
 void THTensor_(addcdiv)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src1, THTensor *src2)
 {
   if(r_ != t)

aten/src/TH/generic/THTensorMath.h

Lines changed: 0 additions & 1 deletion
@@ -129,7 +129,6 @@ TH_API void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src);
 TH_API void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src);
 TH_API void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src);

-TH_API void THTensor_(addcmul)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src1, THTensor *src2);
 TH_API void THTensor_(addcdiv)(THTensor *r_, THTensor *t, scalar_t value, THTensor *src1, THTensor *src2);

 TH_API void THTensor_(addmv)(THTensor *r_, scalar_t beta, THTensor *t, scalar_t alpha, THTensor *mat, THTensor *vec);
