Skip to content

Commit 82b5705

Browse files
James Reed authored and facebook-github-bot committed
Move abs, frac, reciprocal, and neg to TensorIterator (#19041)
Summary: I've been messing around with vectorizing the fusion compiler in JIT, and noticed that these ops were pathologically slow. I moved them to use TensorIterator + Vec256<> and got some speed wins. Benchmark script: ``` import torch, time ops = ['abs', 'neg', 'reciprocal', 'frac'] x = torch.rand(1024, 1024) NITER = 10000 print('op', 'time per iter (ms)', 'gops/s', 'GB/s', sep='\t') for op in ops: s = time.time() for i in range(NITER): getattr(x, op)() elapsed_sec = ((time.time() - s) / NITER) print(op, elapsed_sec * 1000, (1024*1024/elapsed_sec)/1e9, (1024*1024*4*2) / elapsed_sec / 1e9, sep='\t') ``` Before this change (on my mac with a skylake): ``` op time per iter (ms) gops/s GB/s abs 0.9730974197387695 1.0775652866097343 8.620522292877874 neg 1.0723679780960083 0.9778136063534356 7.822508850827485 reciprocal 1.2610594034194946 0.8315040490215421 6.6520323921723366 frac 1.1681334018707275 0.8976509004200546 7.181207203360437 ``` After this change: ``` op time per iter (ms) gops/s GB/s abs 0.5031076192855835 2.084198210889721 16.673585687117768 neg 0.4433974027633667 2.3648672578256087 18.91893806260487 reciprocal 0.47145988941192624 2.2241043693195985 17.79283495455679 frac 0.5036592721939087 2.0819154096627024 16.65532327730162 ``` So, after this change it looks like we are hitting machine peak for bandwidth and are bandwidth bound. Pull Request resolved: #19041 Differential Revision: D14862037 Pulled By: jamesr66a fbshipit-source-id: e2032ac0ca962dbf4120bb36812277c260e22912
1 parent 56b18ea commit 82b5705

File tree

15 files changed

+178
-117
lines changed

15 files changed

+178
-117
lines changed

aten/src/ATen/Declarations.cwrap

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -879,7 +879,6 @@
879879
name: _th_abs
880880
cname: abs
881881
backends:
882-
- CPU
883882
- CUDA
884883
variants: function
885884
return: argument 0
@@ -1353,7 +1352,6 @@
13531352
types:
13541353
- floating_point
13551354
backends:
1356-
- CPU
13571355
- CUDA
13581356
cname: frac
13591357
variants: function
@@ -1368,7 +1366,6 @@
13681366
types:
13691367
- floating_point
13701368
backends:
1371-
- CPU
13721369
- CUDA
13731370
variants:
13741371
- function
@@ -1506,7 +1503,6 @@
15061503
types:
15071504
- floating_point
15081505
backends:
1509-
- CPU
15101506
- CUDA
15111507
variants:
15121508
- function
@@ -1523,7 +1519,6 @@
15231519
types:
15241520
- floating_point
15251521
backends:
1526-
- CPU
15271522
- CUDA
15281523
variants: function
15291524
options:
@@ -1536,7 +1531,6 @@
15361531
[[
15371532
name: _th_neg
15381533
backends:
1539-
- CPU
15401534
- CUDA
15411535
variants:
15421536
- function
@@ -1551,7 +1545,6 @@
15511545
[[
15521546
name: _th_neg_
15531547
backends:
1554-
- CPU
15551548
- CUDA
15561549
variants: function
15571550
options:

aten/src/ATen/core/Tensor.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,8 @@ class CAFFE2_API Tensor {
408408
Tensor & fill_(const Tensor & value);
409409
Tensor floor() const;
410410
Tensor & floor_();
411+
Tensor frac() const;
412+
Tensor & frac_();
411413
Tensor ger(const Tensor & vec2) const;
412414
Tensor fft(int64_t signal_ndim, bool normalized=false) const;
413415
Tensor ifft(int64_t signal_ndim, bool normalized=false) const;
@@ -465,6 +467,10 @@ class CAFFE2_API Tensor {
465467
Tensor permute(IntArrayRef dims) const;
466468
Tensor pin_memory() const;
467469
Tensor pinverse(double rcond=1e-15) const;
470+
Tensor reciprocal() const;
471+
Tensor & reciprocal_();
472+
Tensor neg() const;
473+
Tensor & neg_();
468474
Tensor repeat(IntArrayRef repeats) const;
469475
Tensor repeat_interleave(const Tensor & repeats, c10::optional<int64_t> dim=c10::nullopt) const;
470476
Tensor repeat_interleave(int64_t repeats, c10::optional<int64_t> dim=c10::nullopt) const;
@@ -648,10 +654,7 @@ class CAFFE2_API Tensor {
648654
Tensor & digamma_();
649655
Tensor & polygamma_(int64_t n);
650656
Tensor & erfinv_();
651-
Tensor & frac_();
652657
Tensor & renorm_(Scalar p, int64_t dim, Scalar maxnorm);
653-
Tensor & reciprocal_();
654-
Tensor & neg_();
655658
Tensor & pow_(Scalar exponent);
656659
Tensor & pow_(const Tensor & exponent);
657660
Tensor & lerp_(const Tensor & end, Scalar weight);
@@ -718,10 +721,7 @@ class CAFFE2_API Tensor {
718721
Tensor digamma() const;
719722
Tensor polygamma(int64_t n) const;
720723
Tensor erfinv() const;
721-
Tensor frac() const;
722724
Tensor dist(const Tensor & other, Scalar p=2) const;
723-
Tensor reciprocal() const;
724-
Tensor neg() const;
725725
Tensor atan2(const Tensor & other) const;
726726
Tensor lerp(const Tensor & end, Scalar weight) const;
727727
Tensor lerp(const Tensor & end, const Tensor & weight) const;

aten/src/ATen/core/TensorMethods.h

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,12 @@ inline Tensor Tensor::floor() const {
280280
inline Tensor & Tensor::floor_() {
281281
return dispatch_type().floor_(*this);
282282
}
283+
inline Tensor Tensor::frac() const {
284+
return dispatch_type().frac(*this);
285+
}
286+
inline Tensor & Tensor::frac_() {
287+
return dispatch_type().frac_(*this);
288+
}
283289
inline Tensor Tensor::ger(const Tensor & vec2) const {
284290
return dispatch_type().ger(*this, vec2);
285291
}
@@ -451,6 +457,18 @@ inline Tensor Tensor::pin_memory() const {
451457
inline Tensor Tensor::pinverse(double rcond) const {
452458
return dispatch_type().pinverse(*this, rcond);
453459
}
460+
inline Tensor Tensor::reciprocal() const {
461+
return dispatch_type().reciprocal(*this);
462+
}
463+
inline Tensor & Tensor::reciprocal_() {
464+
return dispatch_type().reciprocal_(*this);
465+
}
466+
inline Tensor Tensor::neg() const {
467+
return dispatch_type().neg(*this);
468+
}
469+
inline Tensor & Tensor::neg_() {
470+
return dispatch_type().neg_(*this);
471+
}
454472
inline Tensor Tensor::repeat(IntArrayRef repeats) const {
455473
return dispatch_type().repeat(*this, repeats);
456474
}
@@ -1000,18 +1018,9 @@ inline Tensor & Tensor::polygamma_(int64_t n) {
10001018
inline Tensor & Tensor::erfinv_() {
10011019
return dispatch_type().erfinv_(*this);
10021020
}
1003-
inline Tensor & Tensor::frac_() {
1004-
return dispatch_type().frac_(*this);
1005-
}
10061021
inline Tensor & Tensor::renorm_(Scalar p, int64_t dim, Scalar maxnorm) {
10071022
return dispatch_type().renorm_(*this, p, dim, maxnorm);
10081023
}
1009-
inline Tensor & Tensor::reciprocal_() {
1010-
return dispatch_type().reciprocal_(*this);
1011-
}
1012-
inline Tensor & Tensor::neg_() {
1013-
return dispatch_type().neg_(*this);
1014-
}
10151024
inline Tensor & Tensor::pow_(Scalar exponent) {
10161025
return dispatch_type().pow_(*this, exponent);
10171026
}
@@ -1210,18 +1219,9 @@ inline Tensor Tensor::polygamma(int64_t n) const {
12101219
inline Tensor Tensor::erfinv() const {
12111220
return dispatch_type().erfinv(*this);
12121221
}
1213-
inline Tensor Tensor::frac() const {
1214-
return dispatch_type().frac(*this);
1215-
}
12161222
inline Tensor Tensor::dist(const Tensor & other, Scalar p) const {
12171223
return dispatch_type().dist(*this, other, p);
12181224
}
1219-
inline Tensor Tensor::reciprocal() const {
1220-
return dispatch_type().reciprocal(*this);
1221-
}
1222-
inline Tensor Tensor::neg() const {
1223-
return dispatch_type().neg(*this);
1224-
}
12251225
inline Tensor Tensor::atan2(const Tensor & other) const {
12261226
return dispatch_type().atan2(*this, other);
12271227
}

aten/src/ATen/core/Type.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,8 @@ struct CAFFE2_API Type {
282282
virtual Tensor & fill_(Tensor & self, const Tensor & value) const = 0;
283283
virtual Tensor floor(const Tensor & self) const = 0;
284284
virtual Tensor & floor_(Tensor & self) const = 0;
285+
virtual Tensor frac(const Tensor & self) const = 0;
286+
virtual Tensor & frac_(Tensor & self) const = 0;
285287
virtual Tensor ger(const Tensor & self, const Tensor & vec2) const = 0;
286288
virtual Tensor fft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0;
287289
virtual Tensor ifft(const Tensor & self, int64_t signal_ndim, bool normalized) const = 0;
@@ -339,6 +341,10 @@ struct CAFFE2_API Type {
339341
virtual Tensor permute(const Tensor & self, IntArrayRef dims) const = 0;
340342
virtual Tensor pin_memory(const Tensor & self) const = 0;
341343
virtual Tensor pinverse(const Tensor & self, double rcond) const = 0;
344+
virtual Tensor reciprocal(const Tensor & self) const = 0;
345+
virtual Tensor & reciprocal_(Tensor & self) const = 0;
346+
virtual Tensor neg(const Tensor & self) const = 0;
347+
virtual Tensor & neg_(Tensor & self) const = 0;
342348
virtual Tensor repeat(const Tensor & self, IntArrayRef repeats) const = 0;
343349
virtual Tensor repeat_interleave(const Tensor & repeats) const = 0;
344350
virtual Tensor repeat_interleave(const Tensor & self, const Tensor & repeats, c10::optional<int64_t> dim) const = 0;
@@ -523,10 +529,7 @@ struct CAFFE2_API Type {
523529
virtual Tensor & digamma_(Tensor & self) const = 0;
524530
virtual Tensor & polygamma_(Tensor & self, int64_t n) const = 0;
525531
virtual Tensor & erfinv_(Tensor & self) const = 0;
526-
virtual Tensor & frac_(Tensor & self) const = 0;
527532
virtual Tensor & renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) const = 0;
528-
virtual Tensor & reciprocal_(Tensor & self) const = 0;
529-
virtual Tensor & neg_(Tensor & self) const = 0;
530533
virtual Tensor & pow_(Tensor & self, Scalar exponent) const = 0;
531534
virtual Tensor & pow_(Tensor & self, const Tensor & exponent) const = 0;
532535
virtual Tensor & lerp_(Tensor & self, const Tensor & end, Scalar weight) const = 0;
@@ -593,10 +596,7 @@ struct CAFFE2_API Type {
593596
virtual Tensor digamma(const Tensor & self) const = 0;
594597
virtual Tensor polygamma(int64_t n, const Tensor & self) const = 0;
595598
virtual Tensor erfinv(const Tensor & self) const = 0;
596-
virtual Tensor frac(const Tensor & self) const = 0;
597599
virtual Tensor dist(const Tensor & self, const Tensor & other, Scalar p) const = 0;
598-
virtual Tensor reciprocal(const Tensor & self) const = 0;
599-
virtual Tensor neg(const Tensor & self) const = 0;
600600
virtual Tensor atan2(const Tensor & self, const Tensor & other) const = 0;
601601
virtual Tensor lerp(const Tensor & self, const Tensor & end, Scalar weight) const = 0;
602602
virtual Tensor lerp(const Tensor & self, const Tensor & end, const Tensor & weight) const = 0;

aten/src/ATen/cpu/vec256/vec256_base.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,9 @@ struct Vec256 {
194194
Vec256<T> expm1() const {
195195
return map(std::expm1);
196196
}
197+
Vec256<T> frac() const {
198+
return *this - this->trunc();
199+
}
197200
Vec256<T> log() const {
198201
return map(std::log);
199202
}
@@ -219,7 +222,10 @@ struct Vec256 {
219222
return map(std::floor);
220223
}
221224
Vec256<T> neg() const {
222-
return map([](T x) { return -x; });
225+
// NB: the trailing return type is needed because we need to coerce the
226+
// return value back to T in the case of unary operator- incuring a
227+
// promotion
228+
return map([](T x) -> T { return -x; });
223229
}
224230
Vec256<T> round() const {
225231
return map(std::nearbyint);

aten/src/ATen/cpu/vec256/vec256_double.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ template <> class Vec256<double> {
141141
Vec256<double> floor() const {
142142
return _mm256_floor_pd(values);
143143
}
144+
Vec256<double> frac() const;
144145
Vec256<double> neg() const {
145146
return _mm256_xor_pd(_mm256_set1_pd(-0.), values);
146147
}
@@ -216,6 +217,11 @@ Vec256<double> inline operator/(const Vec256<double>& a, const Vec256<double>& b
216217
return _mm256_div_pd(a, b);
217218
}
218219

220+
// frac. Implement this here so we can use subtraction.
221+
Vec256<double> Vec256<double>::frac() const {
222+
return *this - this->trunc();
223+
}
224+
219225
// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
220226
// either input is a NaN.
221227
template <>

aten/src/ATen/cpu/vec256/vec256_float.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ template <> class Vec256<float> {
131131
Vec256<float> log1p() const {
132132
return Vec256<float>(Sleef_log1pf8_u10(values));
133133
}
134+
Vec256<float> frac() const;
134135
Vec256<float> sin() const {
135136
return map(std::sin);
136137
}
@@ -224,6 +225,11 @@ Vec256<float> inline operator/(const Vec256<float>& a, const Vec256<float>& b) {
224225
return _mm256_div_ps(a, b);
225226
}
226227

228+
// frac. Implement this here so we can use subtraction
229+
Vec256<float> Vec256<float>::frac() const {
230+
return *this - this->trunc();
231+
}
232+
227233
// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
228234
// either input is a NaN.
229235
template <>

aten/src/ATen/cpu/vec256/vec256_int.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ struct Vec256<int64_t> : public Vec256i {
9696
auto inverse = _mm256_xor_si256(values, is_larger);
9797
return _mm256_sub_epi64(inverse, is_larger);
9898
}
99+
Vec256<int64_t> frac() const;
100+
Vec256<int64_t> neg() const;
99101
Vec256<int64_t> operator==(const Vec256<int64_t>& other) const {
100102
return _mm256_cmpeq_epi64(values, other.values);
101103
}
@@ -185,6 +187,8 @@ struct Vec256<int32_t> : public Vec256i {
185187
Vec256<int32_t> abs() const {
186188
return _mm256_abs_epi32(values);
187189
}
190+
Vec256<int32_t> frac() const;
191+
Vec256<int32_t> neg() const;
188192
Vec256<int32_t> operator==(const Vec256<int32_t>& other) const {
189193
return _mm256_cmpeq_epi32(values, other.values);
190194
}
@@ -369,6 +373,8 @@ struct Vec256<int16_t> : public Vec256i {
369373
Vec256<int16_t> abs() const {
370374
return _mm256_abs_epi16(values);
371375
}
376+
Vec256<int16_t> frac() const;
377+
Vec256<int16_t> neg() const;
372378
Vec256<int16_t> operator==(const Vec256<int16_t>& other) const {
373379
return _mm256_cmpeq_epi16(values, other.values);
374380
}
@@ -419,6 +425,19 @@ Vec256<int16_t> inline operator-(const Vec256<int16_t>& a, const Vec256<int16_t>
419425
return _mm256_sub_epi16(a, b);
420426
}
421427

428+
// Negation. Defined here so we can utilize operator-
429+
Vec256<int64_t> Vec256<int64_t>::neg() const {
430+
return Vec256<int64_t>(0) - *this;
431+
}
432+
433+
Vec256<int32_t> Vec256<int32_t>::neg() const {
434+
return Vec256<int32_t>(0) - *this;
435+
}
436+
437+
Vec256<int16_t> Vec256<int16_t>::neg() const {
438+
return Vec256<int16_t>(0) - *this;
439+
}
440+
422441
// Emulate operations with no native 64-bit support in avx,
423442
// by extracting each element, performing the operation pointwise,
424443
// then combining the results into a vector.

aten/src/ATen/native/LegacyDefinitions.cpp

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -162,22 +162,10 @@ Tensor & erfinv_(Tensor& self) {
162162
return at::legacy::th::_th_erfinv_(self);
163163
}
164164

165-
Tensor & frac_(Tensor& self) {
166-
return at::legacy::th::_th_frac_(self);
167-
}
168-
169165
Tensor & renorm_(Tensor& self, Scalar p, int64_t dim, Scalar maxnorm) {
170166
return at::legacy::th::_th_renorm_(self, p, dim, maxnorm);
171167
}
172168

173-
Tensor & reciprocal_(Tensor& self) {
174-
return at::legacy::th::_th_reciprocal_(self);
175-
}
176-
177-
Tensor & neg_(Tensor& self) {
178-
return at::legacy::th::_th_neg_(self);
179-
}
180-
181169
Tensor & pow_(Tensor& self, Scalar exponent) {
182170
return at::legacy::th::_th_pow_(self, exponent);
183171
}
@@ -563,34 +551,10 @@ Tensor erfinv(const Tensor & self) {
563551
return at::legacy::th::_th_erfinv(self);
564552
}
565553

566-
Tensor & frac_out(Tensor & result, const Tensor & self) {
567-
return at::legacy::th::_th_frac_out(result, self);
568-
}
569-
570-
Tensor frac(const Tensor & self) {
571-
return at::legacy::th::_th_frac(self);
572-
}
573-
574554
Tensor dist(const Tensor & self, const Tensor & other, Scalar p) {
575555
return at::legacy::th::_th_dist(self, other, p);
576556
}
577557

578-
Tensor & reciprocal_out(Tensor & result, const Tensor & self) {
579-
return at::legacy::th::_th_reciprocal_out(result, self);
580-
}
581-
582-
Tensor reciprocal(const Tensor & self) {
583-
return at::legacy::th::_th_reciprocal(self);
584-
}
585-
586-
Tensor & neg_out(Tensor & result, const Tensor & self) {
587-
return at::legacy::th::_th_neg_out(result, self);
588-
}
589-
590-
Tensor neg(const Tensor & self) {
591-
return at::legacy::th::_th_neg(self);
592-
}
593-
594558
Tensor & atan2_out(Tensor & result, const Tensor & self, const Tensor & other) {
595559
return at::legacy::th::_th_atan2_out(result, self, other);
596560
}

0 commit comments

Comments
 (0)