14 | 14 | #include <ATen/native/Distributions.h> |
15 | 15 | #include <ATen/native/cuda/Loops.cuh> |
16 | 16 | #include <ATen/native/TensorIterator.h> |
| 17 | +#include <ATen/LegacyTHFunctionsCUDA.h> |
17 | 18 |
18 | 19 | #include <THC/THCGeneral.h> |
19 | 20 | #include <THC/THCTensorRandom.h> |
@@ -120,6 +121,22 @@ __global__ void distribution_elementwise_grid_stride_kernel(int numel, |
120 | 121 | } |
121 | 122 | } |
122 | 123 |
| 124 | +/** |
| 125 | + * distribution_nullary_kernel is analogous to gpu_nullary_kernel in |
| 126 | + * ATen/native/cuda/Loops.cuh. Like gpu_nullary_kernel, it uses |
| 127 | + * TensorIterator to launch a kernel. However, the differences are |
| 128 | + * - it launches a grid-stride loop based kernel. The kernel is not |
| 129 | + * generic like elementwise_kernel in Loops.cuh and is specialized |
| 130 | + * for the distribution kernels here. |
| 131 | + * - For large tensors, we may launch multiple kernels recursively |
| 132 | + * (i.e. when !iter.can_use_32bit_indexing()), and hence the philox |
| 133 | + * offset calculation is done in this function. |
| 134 | + * |
| 135 | + * FIXME: Can we specialize elementwise_kernel and launch_kernel in Loops.cuh |
| 136 | + * to have a grid-stride loop kernel and then use that to launch our distribution |
| 137 | + * kernels? Note that we need a grid-stride loop kernel because we found, by |
| 138 | + * testing, that it achieves peak effective bandwidth. |
| 139 | + */ |
123 | 140 | template<typename scalar_t, |
124 | 141 | typename accscalar_t, |
125 | 142 | int unroll_factor, |
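
The grid-stride loop the new comment refers to is the standard CUDA idiom sketched below. This is illustrative only, with hypothetical names; it is not the actual distribution_elementwise_grid_stride_kernel in this file.

// Minimal sketch of a grid-stride loop: each thread starts at its global
// index and advances by the total number of launched threads, so a
// fixed-size grid covers a tensor of any length.
template <typename scalar_t, typename func_t>
__global__ void grid_stride_loop_sketch(int numel, scalar_t* out, func_t f) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = idx; i < numel; i += stride) {
    out[i] = f(i);  // f produces one value per output element
  }
}
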
@@ -475,6 +492,30 @@ void random_kernel_cuda(TensorIterator& iter, uint64_t range, int64_t base, Gene |
475 | 492 | }); |
476 | 493 | } |
477 | 494 |
| 495 | +void normal_kernel_cuda(TensorIterator& iter, double mean_, double std_, Generator* gen_) { |
| 496 | + auto gen = check_generator<CUDAGenerator>(gen_, &globalContext().defaultGenerator(kCUDA)); |
| 497 | + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "normal_cuda", [&] { |
| 498 | + using accscalar_t = at::acc_type<scalar_t, true>; |
| 499 | + auto mean = static_cast<accscalar_t>(mean_); |
| 500 | + auto std = static_cast<accscalar_t>(std_); |
| 501 | + // define a lambda that multiplies by std and adds mean |
| 502 | + auto normal_func = [mean, std] __device__ (accscalar_t rand) { |
| 503 | + return static_cast<scalar_t>(rand * std + mean); |
| 504 | + }; |
| 505 | + if (std::is_same<scalar_t, double>::value) { |
| 506 | + distribution_nullary_kernel<scalar_t, accscalar_t, curand4_engine_calls/2>(iter, |
| 507 | + gen, |
| 508 | + [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_normal2_double(state); }, |
| 509 | + normal_func); |
| 510 | + } else { |
| 511 | + distribution_nullary_kernel<scalar_t, accscalar_t, curand4_engine_calls>(iter, |
| 512 | + gen, |
| 513 | + [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_normal4(state); }, |
| 514 | + normal_func); |
| 515 | + } |
| 516 | + }); |
| 517 | +} |
| 518 | + |
478 | 519 | Tensor& uniform_cuda_(Tensor& self, double from, double to, Generator* gen) { |
479 | 520 | auto iter = TensorIterator::nullary_op(self); |
480 | 521 | uniform_kernel_cuda(*iter, from, to, gen); |
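
For context on the hunk above: curand_normal4 produces four single-precision normals per Philox call, while curand_normal2_double produces only two doubles, which is why the double branch halves the unroll factor (curand4_engine_calls/2). Below is a hedged standalone sketch of the same pattern with hypothetical names and a fixed per-thread offset, rather than the iterator-driven philox offset calculation used in this file.

#include <curand_kernel.h>

// Hypothetical kernel: per-thread Philox state, four normals per
// curand_normal4 call, then the same rand * std + mean affine transform
// that normal_func applies above.
__global__ void philox_normal_sketch(float* out, int numel,
                                     unsigned long long seed,
                                     unsigned long long offset,
                                     float mean, float std) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  curandStatePhilox4_32_10_t state;
  curand_init(seed, idx, offset, &state);
  int stride = blockDim.x * gridDim.x * 4;
  for (int i = idx * 4; i < numel; i += stride) {
    float4 r = curand_normal4(&state);      // four standard-normal samples
    float vals[4] = {r.x, r.y, r.z, r.w};
    for (int j = 0; j < 4 && i + j < numel; ++j) {
      out[i + j] = vals[j] * std + mean;    // matches normal_func above
    }
  }
}
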
@@ -510,4 +551,48 @@ Tensor& capped_random_cuda_(Tensor& self, int64_t to, Generator* gen) { |
510 | 551 | return clamped_random_cuda_(self, 0, to, gen); |
511 | 552 | } |
512 | 553 |
| 554 | +Tensor& normal_cuda_(Tensor& self, double mean, double std, Generator* gen) { |
| 555 | + TORCH_CHECK(std > 0.0, "normal_ expects std > 0.0, but found std=", std); |
| 556 | + auto iter = TensorIterator::nullary_op(self); |
| 557 | + normal_kernel_cuda(*iter, mean, std, gen); |
| 558 | + return self; |
| 559 | +} |
| 560 | + |
| 561 | +Tensor& normal_out_cuda(Tensor& output, const Tensor& mean, double std, Generator* gen) { |
| 562 | + normal_cuda_(output, 0, std, gen); |
| 563 | + output.add_(mean); |
| 564 | + return output; |
| 565 | +} |
| 566 | + |
| 567 | +Tensor& normal_out_cuda(Tensor& output, double mean, const Tensor& std, Generator* gen) { |
| 568 | + normal_cuda_(output, 0, 1, gen); |
| 569 | + auto mean_tensor = at::full({1}, mean, output.options()); |
| 570 | + at::native::legacy::cuda::_th_addcmul_out(output, mean_tensor, output, std, 1); |
| 571 | + return output; |
| 572 | +} |
| 573 | + |
| 574 | +Tensor& normal_out_cuda(Tensor& output, const Tensor& mean, const Tensor& std, Generator* gen) { |
| 575 | + normal_cuda_(output, 0, 1, gen); |
| 576 | + at::native::legacy::cuda::_th_addcmul_out(output, mean, output, std, 1); |
| 577 | + return output; |
| 578 | +} |
| 579 | + |
| 580 | +Tensor normal_cuda(const Tensor& mean, double std, Generator* gen) { |
| 581 | + Tensor ret = at::empty(mean.sizes(), mean.options()); |
| 582 | + normal_out_cuda(ret, mean, std, gen); |
| 583 | + return ret; |
| 584 | +} |
| 585 | + |
| 586 | +Tensor normal_cuda(double mean, const Tensor& std, Generator* gen) { |
| 587 | + Tensor ret = at::empty(std.sizes(), std.options()); |
| 588 | + normal_out_cuda(ret, mean, std, gen); |
| 589 | + return ret; |
| 590 | +} |
| 591 | + |
| 592 | +Tensor normal_cuda(const Tensor& mean, const Tensor& std, Generator* gen) { |
| 593 | + Tensor ret = at::empty(mean.sizes(), mean.options()); |
| 594 | + normal_out_cuda(ret, mean, std, gen); |
| 595 | + return ret; |
| 596 | +} |
| 597 | + |
513 | 598 | }} // namespace at::native |
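
As a hedged usage sketch (not part of this commit), the new overloads would typically be reached through the ATen frontend, assuming they are registered as the CUDA implementations of normal, normal_out, and normal_:

#include <ATen/ATen.h>

void normal_usage_example() {
  auto mean = at::zeros({4, 4}, at::kCUDA);
  auto std  = at::ones({4, 4}, at::kCUDA);

  auto a = at::normal(mean, /*std=*/1.0);  // -> normal_cuda(const Tensor&, double, Generator*)
  auto b = at::normal(/*mean=*/0.0, std);  // -> normal_cuda(double, const Tensor&, Generator*)
  auto c = at::normal(mean, std);          // -> normal_cuda(const Tensor&, const Tensor&, Generator*)

  auto d = at::empty({4, 4}, at::kCUDA);
  d.normal_(/*mean=*/0.0, /*std=*/1.0);    // in-place -> normal_cuda_
}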