
Commit 155f767

syed-ahmed authored and facebook-github-bot committed
Move THCTensor_{normal, normal_means, normal_stddevs, normal_means_stddevs} to ATen (#21287)
Summary:

## Effective Bandwidth Benchmark

- using https://gist.github.com/syed-ahmed/f8b7384d642f4bce484228b508b4bc68
- on V100

### Float Type

#### Before:
```
normal, size, elements 65536 forward 4.956722259521484e-06 bandwidth (GB/s) 52.88656218258779
normal, size, elements 131072 forward 5.285739898681641e-06 bandwidth (GB/s) 99.18914098114568
normal, size, elements 262144 forward 7.548332214355469e-06 bandwidth (GB/s) 138.91492454529376
normal, size, elements 524288 forward 1.1980533599853516e-05 bandwidth (GB/s) 175.0466273076219
normal, size, elements 1048576 forward 2.091646194458008e-05 bandwidth (GB/s) 200.52645667862762
normal, size, elements 2097152 forward 3.9961338043212894e-05 bandwidth (GB/s) 209.91809610901498
normal, size, elements 4194304 forward 7.39765167236328e-05 bandwidth (GB/s) 226.79110538115253
normal, size, elements 8388608 forward 0.0001377725601196289 bandwidth (GB/s) 243.5494555001696
normal, size, elements 16777216 forward 0.0002710080146789551 bandwidth (GB/s) 247.62686107087774
normal, size, elements 33554432 forward 0.0005375170707702637 bandwidth (GB/s) 249.69947058177252
```

#### After:
```
normal, size, elements 65536 forward 6.198883056640625e-06 bandwidth (GB/s) 42.288908760615385
normal, size, elements 131072 forward 6.756782531738281e-06 bandwidth (GB/s) 77.59432800112916
normal, size, elements 262144 forward 7.560253143310547e-06 bandwidth (GB/s) 138.6958849291706
normal, size, elements 524288 forward 7.550716400146485e-06 bandwidth (GB/s) 277.7421225831386
normal, size, elements 1048576 forward 1.1034011840820313e-05 bandwidth (GB/s) 380.1250225673293
normal, size, elements 2097152 forward 1.802682876586914e-05 bandwidth (GB/s) 465.34019427102237
normal, size, elements 4194304 forward 2.8417110443115234e-05 bandwidth (GB/s) 590.3913430460946
normal, size, elements 8388608 forward 4.8711299896240235e-05 bandwidth (GB/s) 688.8428777608927
normal, size, elements 16777216 forward 9.685993194580078e-05 bandwidth (GB/s) 692.8444265018856
normal, size, elements 33554432 forward 0.00018213510513305663 bandwidth (GB/s) 736.9130069787966
```

### Double Type

#### Before:
```
normal, size, elements 65536 forward 5.8841705322265624e-06 bandwidth (GB/s) 44.55071425348461
normal, size, elements 131072 forward 8.018016815185547e-06 bandwidth (GB/s) 65.38873789925661
normal, size, elements 262144 forward 1.2989044189453124e-05 bandwidth (GB/s) 80.72772597474304
normal, size, elements 524288 forward 2.2075176239013673e-05 bandwidth (GB/s) 95.00046465285668
normal, size, elements 1048576 forward 4.1041374206542965e-05 bandwidth (GB/s) 102.19696784254678
normal, size, elements 2097152 forward 7.57598876953125e-05 bandwidth (GB/s) 110.72624650312186
normal, size, elements 4194304 forward 0.00013725996017456056 bandwidth (GB/s) 122.22949779865557
normal, size, elements 8388608 forward 0.0002614736557006836 bandwidth (GB/s) 128.32815569921402
normal, size, elements 16777216 forward 0.0005080199241638184 bandwidth (GB/s) 132.0988819689674
normal, size, elements 33554432 forward 0.0009479570388793945 bandwidth (GB/s) 141.58629821311564
```

#### After:
```
normal, size, elements 65536 forward 5.991458892822265e-06 bandwidth (GB/s) 43.75294977222444
normal, size, elements 131072 forward 7.293224334716797e-06 bandwidth (GB/s) 71.88699756626349
normal, size, elements 262144 forward 8.094310760498048e-06 bandwidth (GB/s) 129.54481623281296
normal, size, elements 524288 forward 1.2805461883544922e-05 bandwidth (GB/s) 163.7701177100726
normal, size, elements 1048576 forward 2.2592544555664064e-05 bandwidth (GB/s) 185.64991604491345
normal, size, elements 2097152 forward 3.801822662353516e-05 bandwidth (GB/s) 220.6470092112881
normal, size, elements 4194304 forward 6.761550903320313e-05 bandwidth (GB/s) 248.1267425164457
normal, size, elements 8388608 forward 0.00013209104537963867 bandwidth (GB/s) 254.02503177684966
normal, size, elements 16777216 forward 0.0002667689323425293 bandwidth (GB/s) 251.56176699703818
normal, size, elements 33554432 forward 0.0004705166816711426 bandwidth (GB/s) 285.25604559501795
```

Resubmit of #20621

Pull Request resolved: #21287

Differential Revision: D15603695

Pulled By: ezyang

fbshipit-source-id: f8c5032678d503d45ac99fb1475a929df7c2b361
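The linked gist is the authoritative benchmark harness; the sketch below is an illustrative reconstruction of the measurement, not the gist's code (the helper name `effective_bandwidth` and the timing loop are assumptions). `normal_` performs one global-memory write per element, so effective bandwidth is `numel * element_size / time / 1e9` GB/s, which reproduces the arithmetic in the tables above (e.g. 65536 floats in 4.96e-06 s gives 52.9 GB/s).

```python
# Minimal sketch of the effective-bandwidth measurement on CUDA.
import time
import torch

def effective_bandwidth(numel, dtype=torch.float, iters=100):
    x = torch.empty(numel, dtype=dtype, device="cuda")
    x.normal_()                      # warm up the kernel once
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(iters):
        x.normal_()                  # one global-memory write per element
    torch.cuda.synchronize()
    forward = (time.time() - start) / iters
    # GB/s with GB = 1e9 bytes, matching the tables above
    return numel * x.element_size() / forward / 1e9

for exp in range(16, 26):            # 65536 .. 33554432 elements
    n = 2 ** exp
    print("normal, size, elements", n, "bandwidth (GB/s)", effective_bandwidth(n))
```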
1 parent 21113c2 commit 155f767

File tree

7 files changed: +93 −53 lines changed


aten/src/ATen/Declarations.cwrap

Lines changed: 0 additions & 2 deletions

```diff
@@ -2623,7 +2623,6 @@
     - floating_point
   backends:
     - CPU
-    - CUDA
   return: argument 0
   variants:
     - function
@@ -2663,7 +2662,6 @@
     - floating_point
   backends:
     - CPU
-    - CUDA
   cname: normal
   variants: function
   return: self
```

aten/src/ATen/native/cuda/Distributions.cu

Lines changed: 85 additions & 0 deletions

```diff
@@ -14,6 +14,7 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
+#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCTensorRandom.h>
@@ -120,6 +121,22 @@ __global__ void distribution_elementwise_grid_stride_kernel(int numel,
   }
 }
 
+/**
+ * distribution_nullary_kernel is analogous to gpu_nullary_kernel in
+ * ATen/native/cuda/Loops.cuh. Like gpu_nullary_kernel, it uses
+ * TensorIterator to launch a kernel. However, the differences are:
+ *   - it launches a grid-stride loop based kernel. The kernel is not
+ *     generic like elementwise_kernel in Loops.cuh and is specialized
+ *     for the distribution kernels here.
+ *   - for big tensors, we can launch multiple kernels recursively
+ *     (i.e. if (!iter.can_use_32bit_indexing())) and hence the philox
+ *     offset calculation is done in this function.
+ *
+ * FIXME: Can we specialize elementwise_kernel and launch_kernel in Loops.cuh
+ * to have a grid-stride loop kernel and then use that to launch our distribution
+ * kernels? Note that we need a grid-stride loop kernel because we found by testing
+ * that it achieves peak effective bandwidth.
+ */
 template<typename scalar_t,
          typename accscalar_t,
          int unroll_factor,
@@ -475,6 +492,30 @@ void random_kernel_cuda(TensorIterator& iter, uint64_t range, int64_t base, Gene
   });
 }
 
+void normal_kernel_cuda(TensorIterator& iter, double mean_, double std_, Generator* gen_) {
+  auto gen = check_generator<CUDAGenerator>(gen_, &globalContext().defaultGenerator(kCUDA));
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "normal_cuda", [&] {
+    using accscalar_t = at::acc_type<scalar_t, true>;
+    auto mean = static_cast<accscalar_t>(mean_);
+    auto std = static_cast<accscalar_t>(std_);
+    // define lambda to multiply std and add mean
+    auto normal_func = [mean, std] __device__ (accscalar_t rand) {
+      return static_cast<scalar_t>(rand * std + mean);
+    };
+    if (std::is_same<scalar_t, double>::value) {
+      distribution_nullary_kernel<scalar_t, accscalar_t, curand4_engine_calls/2>(iter,
+        gen,
+        [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_normal2_double(state); },
+        normal_func);
+    } else {
+      distribution_nullary_kernel<scalar_t, accscalar_t, curand4_engine_calls>(iter,
+        gen,
+        [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_normal4(state); },
+        normal_func);
+    }
+  });
+}
+
 Tensor& uniform_cuda_(Tensor& self, double from, double to, Generator* gen) {
   auto iter = TensorIterator::nullary_op(self);
   uniform_kernel_cuda(*iter, from, to, gen);
@@ -510,4 +551,48 @@ Tensor& capped_random_cuda_(Tensor& self, int64_t to, Generator* gen) {
   return clamped_random_cuda_(self, 0, to, gen);
 }
 
+Tensor& normal_cuda_(Tensor& self, double mean, double std, Generator* gen) {
+  TORCH_CHECK(std > 0.0, "normal_ expects std > 0.0, but found std=", std);
+  auto iter = TensorIterator::nullary_op(self);
+  normal_kernel_cuda(*iter, mean, std, gen);
+  return self;
+}
+
+Tensor& normal_out_cuda(Tensor& output, const Tensor& mean, double std, Generator* gen) {
+  normal_cuda_(output, 0, std, gen);
+  output.add_(mean);
+  return output;
+}
+
+Tensor& normal_out_cuda(Tensor& output, double mean, const Tensor& std, Generator* gen) {
+  normal_cuda_(output, 0, 1, gen);
+  auto mean_tensor = at::full({1}, mean, output.options());
+  at::native::legacy::cuda::_th_addcmul_out(output, mean_tensor, output, std, 1);
+  return output;
+}
+
+Tensor& normal_out_cuda(Tensor& output, const Tensor& mean, const Tensor& std, Generator* gen) {
+  normal_cuda_(output, 0, 1, gen);
+  at::native::legacy::cuda::_th_addcmul_out(output, mean, output, std, 1);
+  return output;
+}
+
+Tensor normal_cuda(const Tensor& mean, double std, Generator* gen) {
+  Tensor ret = at::empty(mean.sizes(), mean.options());
+  normal_out_cuda(ret, mean, std, gen);
+  return ret;
+}
+
+Tensor normal_cuda(double mean, const Tensor& std, Generator* gen) {
+  Tensor ret = at::empty(std.sizes(), std.options());
+  normal_out_cuda(ret, mean, std, gen);
+  return ret;
+}
+
+Tensor normal_cuda(const Tensor& mean, const Tensor& std, Generator* gen) {
+  Tensor ret = at::empty(mean.sizes(), mean.options());
+  normal_out_cuda(ret, mean, std, gen);
+  return ret;
+}
+
 }} // namespace at::native
```
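As a usage note: the `normal_out_cuda(output, mean_tensor, std)` overload above draws `normal(0, std)` and then adds `mean`, so a seeded call through the public API should match the manual decomposition. A small illustrative check (assumes a CUDA build; the equality relies on both paths consuming the Philox stream identically, which is what this implementation does):

```python
import torch

mean = torch.arange(4.0, device="cuda")
out = torch.empty(4, device="cuda")

torch.manual_seed(0)
torch.normal(mean, 1.0, out=out)   # CUDA dispatch now lands in normal_out_cuda

torch.manual_seed(0)
ref = torch.empty(4, device="cuda").normal_(0.0, 1.0).add_(mean)

print(torch.allclose(out, ref))    # True if both paths draw the same stream
```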

aten/src/ATen/native/native_functions.yaml

Lines changed: 7 additions & 7 deletions

```diff
@@ -3205,7 +3205,7 @@
   variants: method
   dispatch:
     CPU: legacy::cpu::_th_normal_
-    CUDA: legacy::cuda::_th_normal_
+    CUDA: normal_cuda_
 
 - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!)
   variants: method
@@ -3947,32 +3947,32 @@
 - func: normal(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU: legacy::cpu::_th_normal_out
-    CUDA: legacy::cuda::_th_normal_out
+    CUDA: normal_out_cuda
 
 - func: normal(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor
   dispatch:
     CPU: legacy::cpu::_th_normal
-    CUDA: legacy::cuda::_th_normal
+    CUDA: normal_cuda
 
 - func: normal(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU: legacy::cpu::_th_normal_out
-    CUDA: legacy::cuda::_th_normal_out
+    CUDA: normal_out_cuda
 
 - func: normal(float mean, Tensor std, *, Generator? generator=None) -> Tensor
   dispatch:
     CPU: legacy::cpu::_th_normal
-    CUDA: legacy::cuda::_th_normal
+    CUDA: normal_cuda
 
 - func: normal(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU: legacy::cpu::_th_normal_out
-    CUDA: legacy::cuda::_th_normal_out
+    CUDA: normal_out_cuda
 
 - func: normal(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor
   dispatch:
     CPU: legacy::cpu::_th_normal
-    CUDA: legacy::cuda::_th_normal
+    CUDA: normal_cuda
 
 - func: alias(Tensor(a) self) -> Tensor(a)
   variants: method, function
```
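With these dispatch entries, both the in-place method and the functional variants reach the new ATen kernels on CUDA. Note that `normal_cuda_` now rejects a non-positive `std` through the `TORCH_CHECK` shown in Distributions.cu above. A quick illustration (error text per that check):

```python
import torch

x = torch.empty(8, device="cuda")
x.normal_(mean=0.0, std=2.0)       # method variant -> normal_cuda_

try:
    x.normal_(0.0, -1.0)           # fails the TORCH_CHECK(std > 0.0, ...)
except RuntimeError as err:
    print(err)                     # "normal_ expects std > 0.0, but found std=-1"
```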

aten/src/THC/THCTensorRandom.cu

Lines changed: 0 additions & 4 deletions

```diff
@@ -129,16 +129,12 @@ __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2)
   } \
 }
 
-GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean)
-GENERATE_KERNEL2(generate_normal, double, double mean, double stdv, double, curand_normal_double, (x * stdv) + mean)
-
 GENERATE_KERNEL1(generate_exponential, float, double lambda, float, curand_uniform, (float)(-1. / lambda * log(x)))
 GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uniform_double, (double)(-1. / lambda * log(x)))
 
 GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5))))
 GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5))))
 
-GENERATE_KERNEL2(generate_normal, at::Half, double mean, double stdv, float, curand_normal, (ScalarConvert<float, at::Half>::to((x * stdv) + mean)))
 GENERATE_KERNEL1(generate_exponential, at::Half, double lambda, float, curand_uniform, (ScalarConvert<float, at::Half>::to((float)(-1. / lambda * log(x)))))
 GENERATE_KERNEL2(generate_cauchy, at::Half, double median, double sigma, float, curand_uniform, (ScalarConvert<float, at::Half>::to((float)(median + sigma * tan(M_PI*(x-0.5))))))
```

aten/src/THC/generic/THCTensorRandom.cu

Lines changed: 0 additions & 36 deletions

```diff
@@ -8,42 +8,6 @@
 
 #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
 
-void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv)
-{
-  THCAssertSameGPU(THCTensor_(checkGPU)(state, 1, self_));
-  ptrdiff_t size = THCTensor_(nElement)(state, self_);
-  if (size == 0) return;
-  THCGenerator* gen = THCRandom_getGenerator(state);
-  THCTensor *self = THCTensor_(newContiguous)(state, self_);
-  scalar_t *data = THCTensor_(data)(state, self);
-
-  generate_normal<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
-      gen->state.gen_states, size, data, mean, stdv);
-
-  THCTensor_(freeCopyTo)(state, self, self_);
-};
-
-void THCTensor_(normal_means)(THCState *state, THCTensor *self, THCTensor *means, double stddev) {
-  THCTensor_(resizeAs)(state, self, means);
-  THCTensor_(normal)(state, self, 0, stddev);
-  THCTensor_(cadd)(state, self, self, ScalarConvert<int, scalar_t>::to(1), means);
-}
-
-void THCTensor_(normal_stddevs)(THCState *state, THCTensor *self, double mean, THCTensor *stddevs)
-{
-  THCTensor_(resizeAs)(state, self, stddevs);
-  THCTensor_(normal)(state, self, 0, 1);
-  THCTensor_(cmul)(state, self, self, stddevs);
-  THCTensor_(add)(state, self, self, ScalarConvert<double, scalar_t>::to(mean));
-}
-
-void THCTensor_(normal_means_stddevs)(THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs)
-{
-  THCTensor_(resizeAs)(state, self, means);
-  THCTensor_(normal)(state, self, 0, 1);
-  THCTensor_(cmul)(state, self, self, stddevs);
-  THCTensor_(cadd)(state, self, self, ScalarConvert<int, scalar_t>::to(1), means);
-}
 
 void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv)
 {
```
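The removed `normal_means`/`normal_stddevs`/`normal_means_stddevs` helpers composed a standard normal with pointwise ops (`resizeAs`, `cmul`, `cadd`); the ATen replacements express the same algebra through `_th_addcmul_out`. The equivalence in Python terms (illustrative only):

```python
import torch

means = torch.randn(1000, device="cuda")
stddevs = torch.rand(1000, device="cuda") + 0.5

# old THC normal_means_stddevs: self = normal(0, 1); self *= stddevs; self += means
# new ATen normal_out_cuda:     normal_(0, 1) then addcmul(means, out, stddevs)
out = torch.empty_like(means).normal_(0.0, 1.0)
out = torch.addcmul(means, out, stddevs)   # means + out * stddevs
print(out.mean().item(), out.std().item())
```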

aten/src/THC/generic/THCTensorRandom.h

Lines changed: 0 additions & 4 deletions

```diff
@@ -4,10 +4,6 @@
 
 #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
 
-THC_API void THCTensor_(normal)(struct THCState *state, THCTensor *self, double mean, double stdv);
-THC_API void THCTensor_(normal_means)(struct THCState *state, THCTensor *self, THCTensor *means, double stddev);
-THC_API void THCTensor_(normal_stddevs)(struct THCState *state, THCTensor *self, double mean, THCTensor *stddevs);
-THC_API void THCTensor_(normal_means_stddevs)(struct THCState *state, THCTensor *self, THCTensor *means, THCTensor *stddevs);
 THC_API void THCTensor_(logNormal)(struct THCState *state, THCTensor *self, double mean, double stdv);
 THC_API void THCTensor_(exponential)(struct THCState *state, THCTensor *self, double lambda);
 THC_API void THCTensor_(cauchy)(struct THCState *state, THCTensor *self, double median, double sigma);
```

test/test_nn.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -4815,6 +4815,7 @@ def test_Conv2d_groups_nobias(self):
     # See also https://github.com/pytorch/pytorch/pull/18463#issuecomment-476563686
     # and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024
     def test_Conv2d_groups_nobias_v2(self):
+        torch.manual_seed(123)
         dev_dtypes = [("cpu", torch.float)]
         if TEST_CUDA:
             dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)]
```
