-
Notifications
You must be signed in to change notification settings - Fork 26.3k
Computing var/stddev and mean at the same time #18731
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
573b0ab
6df1664
b6e2e96
0c4033d
d998c4d
aaae08d
ae63691
f6ffd15
804364e
faf9899
a10a78a
69dc37b
4302eae
5e4ffab
305bf67
c8054f5
475d532
d148d33
bc1cc26
2b8ea7c
01deaf4
a378991
81c3153
b68f5f1
2b1fe5e
07fb569
3ed0183
3fc116a
d8c9717
0a6bde4
b35fba1
d301215
a10b4b5
11cb6cd
6870034
64b5318
b19c832
096b7e7
44c4a14
3a58e6b
038e2d9
119c509
bc115d5
3fd4554
c16d4b2
69657fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -114,6 +114,41 @@ static std::unique_ptr<TensorIterator> make_reduction( | |
| return TensorIterator::reduce_op(viewed_result, self.to(dtype)); | ||
| } | ||
|
|
||
| static std::unique_ptr<TensorIterator> make_reduction( | ||
| const char* name, Tensor& result1, Tensor& result2, const Tensor& self, IntArrayRef dim, | ||
| bool keepdim, ScalarType dtype) | ||
| { | ||
| // check that result type and dtype match if provided | ||
| for (const Tensor *t: {&result1, &result2}) { | ||
| const Tensor& result = *t; | ||
| AT_CHECK( | ||
| !result.defined() || result.type().scalarType() == dtype, | ||
| name, ": provided dtype must match dtype of result. Got ", | ||
| toString(result.type().scalarType()), | ||
| " and ", | ||
| toString(dtype), | ||
| "."); | ||
| } | ||
|
|
||
| int64_t ndim = self.dim(); | ||
| DimMask mask = make_dim_mask(dim, ndim); | ||
| allocate_reduction_result(result1, self, mask, keepdim, dtype); | ||
| auto viewed_result1 = review_reduce_result(result1, ndim, mask, keepdim); | ||
|
|
||
| allocate_reduction_result(result2, self, mask, keepdim, dtype); | ||
| auto viewed_result2 = review_reduce_result(result2, ndim, mask, keepdim); | ||
|
|
||
| // special case for type promotion in mixed precision, improves computational | ||
| // efficiency. | ||
| // We don't generalize this to common mismatched input/output types to avoid cross | ||
| // product of templated kernel launches. | ||
| if (self.type().scalarType() == dtype || | ||
| (self.is_cuda() && self.type().scalarType() == kHalf && dtype == kFloat)) { | ||
| return TensorIterator::reduce_op(viewed_result1, viewed_result2, self); | ||
| } | ||
| return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype)); | ||
| } | ||
|
|
||
| static inline int64_t n_dim_size(const Tensor& self, IntArrayRef dim) { | ||
| int64_t numel = 1; | ||
| for (auto d : dim) { | ||
|
|
@@ -611,6 +646,68 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim, | |
| return result; | ||
| } | ||
|
|
||
| static std::tuple<Tensor&,Tensor&> std_var_mean_out(const char* fname, Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim, bool take_sqrt) { | ||
| AT_ASSERT(result1.defined() && result2.defined()); | ||
| AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, | ||
| fname, " only support CPU and CUDA backend, got: ", toString(self.type().backend())); | ||
| AT_CHECK(at::isFloatingType(self.type().scalarType()), fname, " only support floating-point dtypes"); | ||
| AT_CHECK(result1.type().scalarType() == result2.type().scalarType(), | ||
| "provided by result1 dtype must match dtype of result2. Got ", | ||
| toString(result1.type().scalarType()), | ||
| " and ", | ||
| toString(result2.type().scalarType()), | ||
| "."); | ||
| ScalarType dtype = get_dtype(result1, self, {}, true); | ||
| auto iter = make_reduction(fname, result1, result2, self, dim, keepdim, dtype); | ||
| if (iter->numel() == 0) { | ||
| result1.fill_(NAN); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do you know why this is done?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suppose the idea is to populate results with NAN if the input is empty |
||
| result2.fill_(NAN); | ||
| } else { | ||
| std_var_stub(iter->device_type(), *iter, unbiased, take_sqrt); | ||
| } | ||
| return std::tuple<Tensor&, Tensor&>(result1, result2); | ||
| } | ||
|
|
||
| std::tuple<Tensor&,Tensor&> var_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim) { | ||
| return std_var_mean_out("var_mean", result1, result2, self, dim, unbiased, keepdim, false); | ||
| } | ||
|
|
||
| std::tuple<Tensor&,Tensor&> std_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim) { | ||
| return std_var_mean_out("std_mean", result1, result2, self, dim, unbiased, keepdim, true); | ||
| } | ||
|
|
||
| std::tuple<Tensor&,Tensor&> var_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, bool unbiased) { | ||
| return std_var_mean_out("var_mean", result1, result2, self, {}, unbiased, false, false); | ||
| } | ||
|
|
||
| std::tuple<Tensor&,Tensor&> std_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, bool unbiased) { | ||
| return std_var_mean_out("std_mean", result1, result2, self, {}, unbiased, false, true); | ||
| } | ||
|
|
||
| std::tuple<Tensor,Tensor> var_mean(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { | ||
| Tensor result1 = at::empty({0}, self.options()); | ||
| Tensor result2 = at::empty({0}, self.options()); | ||
| return at::native::var_mean_out(result1, result2, self, dim, unbiased, keepdim); | ||
| } | ||
|
|
||
| std::tuple<Tensor,Tensor> std_mean(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) { | ||
| Tensor result1 = at::empty({0}, self.options()); | ||
| Tensor result2 = at::empty({0}, self.options()); | ||
| return at::native::std_mean_out(result1, result2, self, dim, unbiased, keepdim); | ||
| } | ||
|
|
||
| std::tuple<Tensor,Tensor> std_mean(const Tensor& self, bool unbiased) { | ||
| Tensor result1 = at::empty({0}, self.options()); | ||
| Tensor result2 = at::empty({0}, self.options()); | ||
| return at::native::std_mean_out(result1, result2, self, unbiased); | ||
| } | ||
|
|
||
| std::tuple<Tensor,Tensor> var_mean(const Tensor& self, bool unbiased) { | ||
| Tensor result1 = at::empty({0}, self.options()); | ||
| Tensor result2 = at::empty({0}, self.options()); | ||
| return at::native::var_mean_out(result1, result2, self, unbiased); | ||
| } | ||
|
|
||
| Tensor var(const Tensor& self, bool unbiased) { | ||
| TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA, | ||
| "var only supports CPU AND CUDA backend, got: ", toString(self.type().backend())); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,9 +6,11 @@ | |
| #if defined(__CUDACC__) | ||
| #include <THC/THCDeviceUtils.cuh> | ||
| #include <ATen/native/cuda/DeviceSqrt.cuh> | ||
| #include <thrust/tuple.h> | ||
| #elif defined(__HIPCC__) | ||
| #include <THH/THHDeviceUtils.cuh> | ||
| #include <ATen/native/hip/DeviceSqrt.cuh> | ||
| #include <thrust/tuple.h> | ||
| #else | ||
| #include <cmath> | ||
| #define device_sqrt std::sqrt | ||
|
|
@@ -42,7 +44,7 @@ struct WelfordData { | |
| }; | ||
|
|
||
|
|
||
| template <typename scalar_t, typename acc_scalar_t, typename index_t, typename combine_t> | ||
| template <typename scalar_t, typename acc_scalar_t, typename index_t, typename combine_t, typename res_t> | ||
| struct WelfordOps { | ||
| bool unbiased; | ||
| bool take_sqrt; | ||
|
|
@@ -80,12 +82,18 @@ struct WelfordOps { | |
| new_count | ||
| }; | ||
| } | ||
| inline C10_DEVICE scalar_t project(acc_t acc) const { | ||
| inline C10_DEVICE res_t project(acc_t acc) const { | ||
| auto mean = acc.mean; | ||
| combine_t divisor = unbiased ? (acc.nf - 1) : acc.nf; | ||
| auto ret = (divisor > 0) ? | ||
| (take_sqrt ? device_sqrt(acc.m2 / divisor) : (acc.m2 / divisor)) | ||
| : NAN; | ||
| return (scalar_t) ret; | ||
| #if defined(__CUDACC__) || defined(__HIPCC__) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are we using a tuple for CUDA, and a vector for CPU? Can't we use a tuple for both? (
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally it should be a vector for both CPU and CUDA, but CUDA doesn't support calling thrust::vector methods on the DEVICE, so I used a tuple instead.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't like using (1) It is worse for performance, because it requires an allocation on the heap (this is not a huge deal since we are not calling (2) It is confusing and error-prone to have different behavior on CUDA vs. CPU. (3)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed. Made a tuple on both CPU and CUDA parts |
||
| thrust::tuple<scalar_t, scalar_t> results((scalar_t) ret, (scalar_t) mean); | ||
| #else | ||
| std::tuple<scalar_t, scalar_t> results{(scalar_t) ret, (scalar_t) mean}; | ||
| #endif | ||
| return results; | ||
| } | ||
| #if defined(__CUDACC__) || defined(__HIPCC__) | ||
| inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -505,6 +505,28 @@ std::unique_ptr<TensorIterator> TensorIterator::reduce_op(Tensor& out, const Ten | |
| return builder.build(); | ||
| } | ||
|
|
||
| std::unique_ptr<TensorIterator> TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tensor& a) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It doesn't seem like anything is really specific to having 2 outputs. What if someday we want to make an op with 10 outputs? Should we try to make this take arbitrarily many outputs?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we agree so far that two results are the maximum we will want in the near future. I just don't want to overcomplicate the implementation.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
| AT_ASSERT(out1.defined()); | ||
| AT_ASSERT(out2.defined()); | ||
| AT_CHECK((!a.is_cuda() && !out1.is_cuda() && !out2.is_cuda()) || (a.device() == out1.device() && out1.device() == out2.device()), | ||
| "reduce_op(): expected input and both outputs to be on same device, but input is on ", a.device(), | ||
| ", output1 is on ", out1.device(), " and output2 is on", out2.device()); | ||
| AT_CHECK(out1.dim() == out2.dim(), "reduce_op(): expected both outputs to have same number of dims, but output1 has ", out1.dim(), | ||
| " and output2 has ", out2.dim()); | ||
| AT_CHECK(out1.sizes() == out2.sizes(), "reduce_op(): expected both outputs to have same sizes, but output1 has ", out1.sizes(), | ||
| " and output2 has ", out2.sizes()); | ||
| AT_CHECK(out1.strides() == out2.strides(), "reduce_op(): expected both outputs to have same strides, but output1 has ", out1.strides(), | ||
| " and output2 has ", out2.strides()); | ||
| auto builder = TensorIterator::Builder(); | ||
| builder.add_output(out1); | ||
| builder.add_output(out2); | ||
| builder.add_input(a); | ||
| builder.iter_->promote_gpu_output_dtypes_ = true; | ||
| builder.iter_->resize_outputs_ = false; | ||
| builder.iter_->is_reduction_ = true; | ||
| return builder.build(); | ||
| } | ||
|
|
||
| void TensorIterator::mark_outputs() { | ||
| for (int i = 0; i < num_outputs_; i++) { | ||
| operands_[i].is_output = true; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is this API different than the 1 result case with the dim being optional?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently, std and var without dim use the TH implementation. I use the ATen implementation, so I made dim optional.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you go into more depth on this? I don't really understand why the implementation needs to "bleed through" to the interface. Why can't the implementation just handle consistently getting an optional vs not?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is probably because when I moved them to TensorIterator, I forgot about the no-dim version. Is there any good reason not to just move it over now?