
Commit 4c23c34

ifedan authored and facebook-github-bot committed
Computing var/stddev and mean at the same time (#18731)
Summary: The current variance kernels already compute the mean at the same time. We often want both statistics together, so it seems reasonable to have a kwarg/function that lets us get both values without launching an extra kernel.

Pull Request resolved: #18731
Differential Revision: D14726082
Pulled By: ifedan
fbshipit-source-id: 473cba0227b69eb2240dca5e61a8f4366df0e029
1 parent 08bdd69 commit 4c23c34

19 files changed: +539 -56 lines

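Before the per-file diffs, a bit of context on why no extra kernel is needed: the variance kernels are Welford-style reductions that already carry a running mean alongside the second moment, so the mean falls out of the same pass. A minimal standalone sketch of that single-pass computation (plain C++ for illustration; welford_var_mean is a made-up name, not the ATen kernel itself):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// One pass over the data yields both the mean and M2 (sum of squared deviations),
// which is exactly why var_mean/std_mean need no second kernel launch.
struct MeanVar { double mean; double var; };

static MeanVar welford_var_mean(const std::vector<double>& xs, bool unbiased) {
  double mean = 0.0, m2 = 0.0;
  int64_t n = 0;
  for (double x : xs) {
    ++n;
    double delta = x - mean;
    mean += delta / n;
    m2 += delta * (x - mean);  // uses the updated mean
  }
  double divisor = unbiased ? (n - 1) : n;  // mirrors the divisor logic in WelfordOps::project
  return { mean, divisor > 0 ? m2 / divisor : NAN };
}

int main() {
  MeanVar r = welford_var_mean({1.0, 2.0, 3.0, 4.0}, /*unbiased=*/true);
  std::printf("mean=%f var=%f\n", r.mean, r.var);  // mean=2.500000 var=1.666667
}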
aten/src/ATen/core/aten_interned_strings.h

Lines changed: 4 additions & 0 deletions
@@ -257,6 +257,8 @@ _(aten, cosh) \
 _(aten, cosine_embedding_loss) \
 _(aten, cosine_similarity) \
 _(aten, cross) \
+_(aten, std_mean) \
+_(aten, var_mean) \
 _(aten, ctc_loss) \
 _(aten, cudnn_affine_grid_generator) \
 _(aten, cudnn_affine_grid_generator_backward) \
@@ -905,6 +907,8 @@ _(attr, padding_value) \
 _(attr, params) \
 _(attr, pdist) \
 _(attr, cdist) \
+_(attr, std_mean) \
+_(attr, var_mean) \
 _(attr, periodic) \
 _(attr, pivot) \
 _(attr, pivots) \

aten/src/ATen/native/ReduceOps.cpp

Lines changed: 97 additions & 0 deletions
@@ -114,6 +114,41 @@ static std::unique_ptr<TensorIterator> make_reduction(
   return TensorIterator::reduce_op(viewed_result, self.to(dtype));
 }
 
+static std::unique_ptr<TensorIterator> make_reduction(
+    const char* name, Tensor& result1, Tensor& result2, const Tensor& self, IntArrayRef dim,
+    bool keepdim, ScalarType dtype)
+{
+  // check that result type and dtype match if provided
+  for (const Tensor *t: {&result1, &result2}) {
+    const Tensor& result = *t;
+    AT_CHECK(
+        !result.defined() || result.type().scalarType() == dtype,
+        name, ": provided dtype must match dtype of result. Got ",
+        toString(result.type().scalarType()),
+        " and ",
+        toString(dtype),
+        ".");
+  }
+
+  int64_t ndim = self.dim();
+  DimMask mask = make_dim_mask(dim, ndim);
+  allocate_reduction_result(result1, self, mask, keepdim, dtype);
+  auto viewed_result1 = review_reduce_result(result1, ndim, mask, keepdim);
+
+  allocate_reduction_result(result2, self, mask, keepdim, dtype);
+  auto viewed_result2 = review_reduce_result(result2, ndim, mask, keepdim);
+
+  // special case for type promotion in mixed precision, improves computational
+  // efficiency.
+  // We don't generalize this to common mismatched input/output types to avoid cross
+  // product of templated kernel launches.
+  if (self.type().scalarType() == dtype ||
+      (self.is_cuda() && self.type().scalarType() == kHalf && dtype == kFloat)) {
+    return TensorIterator::reduce_op(viewed_result1, viewed_result2, self);
+  }
+  return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype));
+}
+
 static inline int64_t n_dim_size(const Tensor& self, IntArrayRef dim) {
   int64_t numel = 1;
   for (auto d : dim) {
@@ -611,6 +646,68 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim,
   return result;
 }
 
+static std::tuple<Tensor&,Tensor&> std_var_mean_out(const char* fname, Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim, bool take_sqrt) {
+  AT_ASSERT(result1.defined() && result2.defined());
+  AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
+           fname, " only support CPU and CUDA backend, got: ", toString(self.type().backend()));
+  AT_CHECK(at::isFloatingType(self.type().scalarType()), fname, " only support floating-point dtypes");
+  AT_CHECK(result1.type().scalarType() == result2.type().scalarType(),
+           "provided by result1 dtype must match dtype of result2. Got ",
+           toString(result1.type().scalarType()),
+           " and ",
+           toString(result2.type().scalarType()),
+           ".");
+  ScalarType dtype = get_dtype(result1, self, {}, true);
+  auto iter = make_reduction(fname, result1, result2, self, dim, keepdim, dtype);
+  if (iter->numel() == 0) {
+    result1.fill_(NAN);
+    result2.fill_(NAN);
+  } else {
+    std_var_stub(iter->device_type(), *iter, unbiased, take_sqrt);
+  }
+  return std::tuple<Tensor&, Tensor&>(result1, result2);
+}
+
+std::tuple<Tensor&,Tensor&> var_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim) {
+  return std_var_mean_out("var_mean", result1, result2, self, dim, unbiased, keepdim, false);
+}
+
+std::tuple<Tensor&,Tensor&> std_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim) {
+  return std_var_mean_out("std_mean", result1, result2, self, dim, unbiased, keepdim, true);
+}
+
+std::tuple<Tensor&,Tensor&> var_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, bool unbiased) {
+  return std_var_mean_out("var_mean", result1, result2, self, {}, unbiased, false, false);
+}
+
+std::tuple<Tensor&,Tensor&> std_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, bool unbiased) {
+  return std_var_mean_out("std_mean", result1, result2, self, {}, unbiased, false, true);
+}
+
+std::tuple<Tensor,Tensor> var_mean(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) {
+  Tensor result1 = at::empty({0}, self.options());
+  Tensor result2 = at::empty({0}, self.options());
+  return at::native::var_mean_out(result1, result2, self, dim, unbiased, keepdim);
+}
+
+std::tuple<Tensor,Tensor> std_mean(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) {
+  Tensor result1 = at::empty({0}, self.options());
+  Tensor result2 = at::empty({0}, self.options());
+  return at::native::std_mean_out(result1, result2, self, dim, unbiased, keepdim);
+}
+
+std::tuple<Tensor,Tensor> std_mean(const Tensor& self, bool unbiased) {
+  Tensor result1 = at::empty({0}, self.options());
+  Tensor result2 = at::empty({0}, self.options());
+  return at::native::std_mean_out(result1, result2, self, unbiased);
+}
+
+std::tuple<Tensor,Tensor> var_mean(const Tensor& self, bool unbiased) {
+  Tensor result1 = at::empty({0}, self.options());
+  Tensor result2 = at::empty({0}, self.options());
+  return at::native::var_mean_out(result1, result2, self, unbiased);
+}
+
 Tensor var(const Tensor& self, bool unbiased) {
   TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
               "var only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));

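For a sense of the resulting API surface, here is a hedged caller-side sketch based only on the signatures added above. It assumes the declarations are visible to the caller (e.g. through the generated native-function headers); how the pair is ultimately exposed as a public binding is handled by the other files in this commit and is not shown here.

#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>  // assumed to declare the at::native entry points
#include <tuple>

void var_mean_example(const at::Tensor& x) {
  // First element is the variance (or stddev for std_mean), second is the mean,
  // matching the (result1, result2) order in std_var_mean_out above.
  at::Tensor var, mean;
  std::tie(var, mean) = at::native::var_mean(x, /*dim=*/{0},
                                             /*unbiased=*/true,
                                             /*keepdim=*/false);

  // Full-reduction overload: both outputs are 0-dim tensors.
  at::Tensor stddev, mean_all;
  std::tie(stddev, mean_all) = at::native::std_mean(x, /*unbiased=*/true);
}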
aten/src/ATen/native/SharedReduceOps.h

Lines changed: 11 additions & 3 deletions
@@ -6,9 +6,11 @@
 #if defined(__CUDACC__)
 #include <THC/THCDeviceUtils.cuh>
 #include <ATen/native/cuda/DeviceSqrt.cuh>
+#include <thrust/tuple.h>
 #elif defined(__HIPCC__)
 #include <THH/THHDeviceUtils.cuh>
 #include <ATen/native/hip/DeviceSqrt.cuh>
+#include <thrust/tuple.h>
 #else
 #include <cmath>
 #define device_sqrt std::sqrt
@@ -42,7 +44,7 @@ struct WelfordData {
 };
 
 
-template <typename scalar_t, typename acc_scalar_t, typename index_t, typename combine_t>
+template <typename scalar_t, typename acc_scalar_t, typename index_t, typename combine_t, typename res_t>
 struct WelfordOps {
   bool unbiased;
   bool take_sqrt;
@@ -80,12 +82,18 @@ struct WelfordOps {
       new_count
     };
   }
-  inline C10_DEVICE scalar_t project(acc_t acc) const {
+  inline C10_DEVICE res_t project(acc_t acc) const {
+    auto mean = acc.mean;
     combine_t divisor = unbiased ? (acc.nf - 1) : acc.nf;
     auto ret = (divisor > 0) ?
       (take_sqrt ? device_sqrt(acc.m2 / divisor) : (acc.m2 / divisor))
       : NAN;
-    return (scalar_t) ret;
+#if defined(__CUDACC__) || defined(__HIPCC__)
+    thrust::tuple<scalar_t, scalar_t> results((scalar_t) ret, (scalar_t) mean);
+#else
+    std::tuple<scalar_t, scalar_t> results{(scalar_t) ret, (scalar_t) mean};
+#endif
+    return results;
   }
 #if defined(__CUDACC__) || defined(__HIPCC__)
   inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const {

aten/src/ATen/native/TensorIterator.cpp

Lines changed: 22 additions & 0 deletions
@@ -505,6 +505,28 @@ std::unique_ptr<TensorIterator> TensorIterator::reduce_op(Tensor& out, const Ten
   return builder.build();
 }
 
+std::unique_ptr<TensorIterator> TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tensor& a) {
+  AT_ASSERT(out1.defined());
+  AT_ASSERT(out2.defined());
+  AT_CHECK((!a.is_cuda() && !out1.is_cuda() && !out2.is_cuda()) || (a.device() == out1.device() && out1.device() == out2.device()),
+           "reduce_op(): expected input and both outputs to be on same device, but input is on ", a.device(),
+           ", output1 is on ", out1.device(), " and output2 is on", out2.device());
+  AT_CHECK(out1.dim() == out2.dim(), "reduce_op(): expected both outputs to have same number of dims, but output1 has ", out1.dim(),
+           " and output2 has ", out2.dim());
+  AT_CHECK(out1.sizes() == out2.sizes(), "reduce_op(): expected both outputs to have same sizes, but output1 has ", out1.sizes(),
+           " and output2 has ", out2.sizes());
+  AT_CHECK(out1.strides() == out2.strides(), "reduce_op(): expected both outputs to have same strides, but output1 has ", out1.strides(),
+           " and output2 has ", out2.strides());
+  auto builder = TensorIterator::Builder();
+  builder.add_output(out1);
+  builder.add_output(out2);
+  builder.add_input(a);
+  builder.iter_->promote_gpu_output_dtypes_ = true;
+  builder.iter_->resize_outputs_ = false;
+  builder.iter_->is_reduction_ = true;
+  return builder.build();
+}
+
 void TensorIterator::mark_outputs() {
   for (int i = 0; i < num_outputs_; i++) {
     operands_[i].is_output = true;

aten/src/ATen/native/TensorIterator.h

Lines changed: 8 additions & 0 deletions
@@ -148,11 +148,14 @@ struct CAFFE2_API TensorIterator {
   static std::unique_ptr<TensorIterator> unary_op(Tensor& out, const Tensor& a);
   static std::unique_ptr<TensorIterator> nullary_op(Tensor& out);
   static std::unique_ptr<TensorIterator> reduce_op(Tensor& out, const Tensor& a);
+  static std::unique_ptr<TensorIterator> reduce_op(Tensor& out1, Tensor& out2, const Tensor& a);
 
   int ndim() const { return shape_.size(); }
   IntArrayRef shape() const { return shape_; }
   int64_t numel() const;
   int ntensors() const { return operands_.size(); }
+  int noutputs() const { return num_outputs_; }
+  int ninputs() const { return ntensors() - noutputs(); }
 
   /// number of elements in the output operand. this is the same as numel() for
   /// operations that are not reductions.
@@ -182,6 +185,11 @@ struct CAFFE2_API TensorIterator {
     return operands_[arg].tensor;
  }
 
+  Tensor input(int arg=0) const {
+    AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_);
+    return operands_[num_outputs_ + arg].tensor;
+  }
+
  /// Removes an operand from this iterator
  void remove_operand(int arg);
  /// Removes a dimension from this iterator

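The new noutputs()/ninputs()/input() accessors lean on TensorIterator's existing convention that output operands are stored before input operands, so the inputs are just a fixed offset into the operand list. A toy illustration of that indexing convention (hypothetical stand-in type, not the real OperandInfo/TensorIterator):

#include <cassert>
#include <vector>

// Operands are laid out as [out0, out1, ..., in0, in1, ...]; the accessors
// below use the same index arithmetic as the new TensorIterator methods.
struct OperandListSketch {
  std::vector<int> operands;  // placeholder for the real OperandInfo entries
  int num_outputs;

  int ntensors() const { return (int)operands.size(); }
  int noutputs() const { return num_outputs; }
  int ninputs()  const { return ntensors() - noutputs(); }
  int output(int arg = 0) const { assert(arg < noutputs()); return operands[arg]; }
  int input(int arg = 0)  const { assert(arg < ninputs());  return operands[num_outputs + arg]; }
};

int main() {
  OperandListSketch it{{10, 11, 20}, /*num_outputs=*/2};  // two outputs, one input
  assert(it.ninputs() == 1 && it.input(0) == 20 && it.output(1) == 11);
}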
aten/src/ATen/native/TensorIteratorReduce.cpp

Lines changed: 7 additions & 6 deletions
@@ -27,13 +27,13 @@ void TensorIterator::parallel_reduce(const loop2d_t& loop) {
 }
 
 static bool use_two_pass_reduction(TensorIterator& iter) {
-  return iter.tensor(0).numel() == 1;
+  return iter.output(0).numel() == 1;
 }
 
 static void two_pass_reduction(TensorIterator& iter, const loop2d_t& loop) {
   int max_threads = at::get_num_threads();
 
-  auto& dst = iter.tensor(0);
+  auto dst = iter.output(0);
   auto buffer_shape = DimVector(dst.sizes());
   buffer_shape.insert(buffer_shape.begin(), max_threads);
   auto buffer = at::empty(buffer_shape, dst.options());
@@ -47,7 +47,7 @@ static void two_pass_reduction(TensorIterator& iter, const loop2d_t& loop) {
     auto slice = buffer[thread_num];
     slice.copy_(dst);
 
-    auto sub_iter = TensorIterator::reduce_op(slice, iter.tensor(1));
+    auto sub_iter = TensorIterator::reduce_op(slice, iter.input(0));
     sub_iter->serial_for_each(loop, {begin, end});
   });
 
@@ -117,13 +117,14 @@ static void parallel_dim_reduction(TensorIterator& iter, const loop2d_t& loop) {
 }
 
 void TensorIterator::foreach_reduced_elt(const loop_subiter_t &loop, bool parallelize) {
-  AT_ASSERT(ntensors() == 2 && num_outputs_ == 1);
+  AT_ASSERT(ninputs() == 1);
+  AT_ASSERT(noutputs() >= 1);
 
   auto shape = this->shape();
-  if (tensor(0).numel() == 0) {
+  if (output(0).numel() == 0) {
    return;
  }
-  if (tensor(0).numel() == 1) {
+  if (output(0).numel() == 1) {
    loop(*this);
  }
  else if (numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||

aten/src/ATen/native/cpu/Reduce.h

Lines changed: 47 additions & 12 deletions
@@ -25,6 +25,44 @@ static inline bool is_outer_reduction(const int64_t* strides) {
     strides[3] == sizeof(typename traits::arg2_t);
 }
 
+template<typename traits, typename res_t>
+static void set_result(const int index, const res_t result, const TensorIterator &iter, const int num_outputs) {
+  static_assert(std::is_same<res_t, typename traits::arg2_t>::value, "data types must match");
+  if (index < num_outputs) {
+    char *out = (char *) iter.data_ptr(index);
+    *(res_t *) out = result;
+  }
+}
+
+template<typename traits, typename res_t>
+static void set_results(const res_t result, const TensorIterator &iter, const int num_outputs) {
+  AT_ASSERT(num_outputs == 1);
+  set_result<traits>(0, result, iter, num_outputs);
+}
+
+template<typename traits, std::size_t i = 0, typename... tuple_t>
+static inline typename std::enable_if<i == sizeof...(tuple_t), std::size_t>::type
+for_each_in_tuple(const std::tuple<tuple_t...>& t, const TensorIterator &iter, const int num_outputs) {
+  return i;
+}
+
+template<typename traits, std::size_t i = 0, typename... tuple_t>
+static inline typename std::enable_if<i < sizeof...(tuple_t), std::size_t>::type
+for_each_in_tuple(const std::tuple<tuple_t...>& t, const TensorIterator &iter, const int num_outputs) {
+  if (i < num_outputs) {
+    set_result<traits>(i, std::get<i>(t), iter, num_outputs);
+    return for_each_in_tuple<traits, i + 1, tuple_t...>(t, iter, num_outputs);
+  }
+  return i;
+}
+
+template<typename traits, typename... res_t>
+static void set_results(const std::tuple<res_t...>& result, const TensorIterator &iter, const int num_outputs) {
+  AT_ASSERT(num_outputs >= 1);
+  std::size_t result_size = for_each_in_tuple<traits>(result, iter, num_outputs);
+  AT_ASSERT(num_outputs == result_size);
+}
+
 template <typename T, typename... Args>
 struct all_same : c10::guts::conjunction<
   std::is_same<T, Args>...
@@ -64,7 +102,7 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) {
   using c_traits = binary_function_traits<cf_t>;
   using p_traits = unary_function_traits<pf_t>;
   using acc_t = typename p_traits::arg1_t;
-  using data_t = typename p_traits::result_type;
+  using data_t = typename r_traits::arg2_t;
   static_assert(
     all_same<
       acc_t,
@@ -75,19 +113,17 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) {
       typename c_traits::arg2_t,
      typename c_traits::result_type>::value,
    "all accumulate types must match");
-  static_assert(
-    std::is_same<data_t, typename r_traits::arg2_t>::value,
-    "all data types must match");
  static_assert(
    std::is_default_constructible<acc_t>::value,
    "the accumulate type must be default-constructible"
  );
-  iter.foreach_reduced_elt([&](TensorIterator &sub_iter) {
-    auto reduction_body = [&](acc_t acc, int64_t begin, int64_t end) -> acc_t {
-      sub_iter.serial_for_each([&acc, &ops](int ntensors, char** data, const int64_t* strides, int64_t size) {
-        AT_ASSERT(ntensors == 2);
-        char *in = data[1];
-        int64_t stride = strides[1];
+  const int num_outputs = iter.noutputs();
+  iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIterator &sub_iter) {
+    auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t {
+      sub_iter.serial_for_each([&acc, &ops, num_outputs](int ntensors, char** data, const int64_t* strides, int64_t size) {
+        AT_ASSERT(ntensors - num_outputs == 1);
+        char *in = data[ntensors - 1];
+        int64_t stride = strides[ntensors - 1];
        for (int64_t i = 0; i < size; ++i) {
          acc = ops.reduce(acc, *(data_t*)in);
          in += stride;
@@ -118,8 +154,7 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) {
        total_acc = ops.combine(total_acc, buffer[i]);
      }
    }
-    char *out = (char *)sub_iter.data_ptr(0);
-    *(data_t*)out = ops.project(total_acc);
+    set_results<r_traits>(ops.project(total_acc), sub_iter, num_outputs);
   });
 }
 

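The set_results/for_each_in_tuple helpers above use a common C++11 pattern: recursive function templates selected via std::enable_if to visit std::tuple elements by compile-time index, stopping after the first num_outputs entries. A standalone sketch of the same pattern with toy types (copy_prefix is a made-up name, independent of TensorIterator):

#include <cassert>
#include <cstddef>
#include <tuple>
#include <type_traits>
#include <vector>

// Base case: past the last tuple element, report how many were visited.
template <std::size_t i = 0, typename... Ts>
typename std::enable_if<i == sizeof...(Ts), std::size_t>::type
copy_prefix(const std::tuple<Ts...>&, std::vector<double>&, std::size_t) {
  return i;
}

// Recursive case: copy element i only if it lies within the requested prefix.
template <std::size_t i = 0, typename... Ts>
typename std::enable_if<i < sizeof...(Ts), std::size_t>::type
copy_prefix(const std::tuple<Ts...>& t, std::vector<double>& out, std::size_t n) {
  if (i < n) {
    out.push_back(std::get<i>(t));
    return copy_prefix<i + 1, Ts...>(t, out, n);
  }
  return i;
}

int main() {
  std::vector<double> out;
  // Only the first two of three results are "outputs", mirroring num_outputs.
  std::size_t written = copy_prefix(std::make_tuple(1.5, 2.5, 3.5), out, 2);
  assert(written == 2 && out.size() == 2 && out[1] == 2.5);
}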
aten/src/ATen/native/cpu/ReduceOpsKernel.cpp

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ static void std_var_kernel_impl(TensorIterator &iter, bool unbiased, bool take_s
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "std_cpu", [&] {
     binary_kernel_reduce(
       iter,
-      WelfordOps<scalar_t, double, int64_t, double> { unbiased, take_sqrt },
+      WelfordOps<scalar_t, double, int64_t, double, std::tuple<scalar_t, scalar_t>> { unbiased, take_sqrt },
       WelfordData<double, int64_t, double>()
     );
   });
