@@ -114,6 +114,41 @@ static std::unique_ptr<TensorIterator> make_reduction(
   return TensorIterator::reduce_op(viewed_result, self.to(dtype));
 }
 
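+// Overload of make_reduction for reductions that produce two outputs (e.g.
+// var_mean/std_mean): both results are allocated and viewed the same way as in
+// the single-output overload before a two-output TensorIterator is built.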
+static std::unique_ptr<TensorIterator> make_reduction(
+    const char *name, Tensor& result1, Tensor& result2, const Tensor& self, IntArrayRef dim,
+    bool keepdim, ScalarType dtype)
+{
+  // check that result type and dtype match if provided
+  for (const Tensor *t : {&result1, &result2}) {
+    const Tensor& result = *t;
+    AT_CHECK(
+        !result.defined() || result.type().scalarType() == dtype,
+        name, ": provided dtype must match dtype of result. Got ",
+        toString(result.type().scalarType()),
+        " and ",
+        toString(dtype),
+        ".");
+  }
+
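+  // Allocate each output with the reduced shape, then view it back to the
+  // input's dimensionality so the reduced dims appear as size-1 for the iterator.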
+  int64_t ndim = self.dim();
+  DimMask mask = make_dim_mask(dim, ndim);
+  allocate_reduction_result(result1, self, mask, keepdim, dtype);
+  auto viewed_result1 = review_reduce_result(result1, ndim, mask, keepdim);
+
+  allocate_reduction_result(result2, self, mask, keepdim, dtype);
+  auto viewed_result2 = review_reduce_result(result2, ndim, mask, keepdim);
+
+  // Special case for type promotion in mixed precision: improves computational
+  // efficiency. We don't generalize this to common mismatched input/output types
+  // to avoid the cross product of templated kernel launches.
+  if (self.type().scalarType() == dtype ||
+      (self.is_cuda() && self.type().scalarType() == kHalf && dtype == kFloat)) {
+    return TensorIterator::reduce_op(viewed_result1, viewed_result2, self);
+  }
+  return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype));
+}
+
 static inline int64_t n_dim_size(const Tensor& self, IntArrayRef dim) {
   int64_t numel = 1;
   for (auto d : dim) {
@@ -611,6 +646,68 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim,
   return result;
 }
 
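+// Shared implementation behind the std_mean/var_mean variants: runs a single
+// fused reduction via std_var_stub; take_sqrt selects std (true) vs. var (false),
+// and unbiased selects Bessel's correction.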
+static std::tuple<Tensor&, Tensor&> std_var_mean_out(const char *fname, Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim, bool take_sqrt) {
+  AT_ASSERT(result1.defined() && result2.defined());
+  AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
+           fname, " only supports CPU and CUDA backends, got: ", toString(self.type().backend()));
+  AT_CHECK(at::isFloatingType(self.type().scalarType()), fname, " only supports floating-point dtypes");
+  AT_CHECK(result1.type().scalarType() == result2.type().scalarType(),
+           "provided dtype of result1 must match dtype of result2. Got ",
+           toString(result1.type().scalarType()),
+           " and ",
+           toString(result2.type().scalarType()),
+           ".");
+  ScalarType dtype = get_dtype(result1, self, {}, true);
+  auto iter = make_reduction(fname, result1, result2, self, dim, keepdim, dtype);
+  if (iter->numel() == 0) {
+    result1.fill_(NAN);
+    result2.fill_(NAN);
+  } else {
+    std_var_stub(iter->device_type(), *iter, unbiased, take_sqrt);
+  }
+  return std::tuple<Tensor&, Tensor&>(result1, result2);
+}
+
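+// out= entry points: thin wrappers that forward to the shared helper with the
+// matching fname and take_sqrt flag.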
+std::tuple<Tensor&, Tensor&> var_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim) {
+  return std_var_mean_out("var_mean", result1, result2, self, dim, unbiased, keepdim, false);
+}
+
+std::tuple<Tensor&, Tensor&> std_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim) {
+  return std_var_mean_out("std_mean", result1, result2, self, dim, unbiased, keepdim, true);
+}
+
+std::tuple<Tensor&, Tensor&> var_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, bool unbiased) {
+  return std_var_mean_out("var_mean", result1, result2, self, {}, unbiased, false, false);
+}
+
+std::tuple<Tensor&, Tensor&> std_mean_out(Tensor &result1, Tensor &result2, const Tensor &self, bool unbiased) {
+  return std_var_mean_out("std_mean", result1, result2, self, {}, unbiased, false, true);
+}
+
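+// Allocating variants: create empty results from the input's options and
+// forward to the corresponding out= overloads above.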
+std::tuple<Tensor, Tensor> var_mean(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) {
+  Tensor result1 = at::empty({0}, self.options());
+  Tensor result2 = at::empty({0}, self.options());
+  return at::native::var_mean_out(result1, result2, self, dim, unbiased, keepdim);
+}
+
+std::tuple<Tensor, Tensor> std_mean(const Tensor& self, IntArrayRef dim, bool unbiased, bool keepdim) {
+  Tensor result1 = at::empty({0}, self.options());
+  Tensor result2 = at::empty({0}, self.options());
+  return at::native::std_mean_out(result1, result2, self, dim, unbiased, keepdim);
+}
+
+std::tuple<Tensor, Tensor> std_mean(const Tensor& self, bool unbiased) {
+  Tensor result1 = at::empty({0}, self.options());
+  Tensor result2 = at::empty({0}, self.options());
+  return at::native::std_mean_out(result1, result2, self, unbiased);
+}
+
+std::tuple<Tensor, Tensor> var_mean(const Tensor& self, bool unbiased) {
+  Tensor result1 = at::empty({0}, self.options());
+  Tensor result2 = at::empty({0}, self.options());
+  return at::native::var_mean_out(result1, result2, self, unbiased);
+}
+
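+// Usage sketch (assumes the matching native_functions.yaml entries exposing
+// these as at::var_mean / at::std_mean are part of the same change; by the
+// naming convention, the first tuple element is the var/std, the second the mean):
+//   Tensor t = at::randn({4, 5});  // any floating-point tensor
+//   auto vm = at::var_mean(t, /*dim=*/{0}, /*unbiased=*/true, /*keepdim=*/false);
+//   Tensor variance = std::get<0>(vm), mean = std::get<1>(vm);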
 Tensor var(const Tensor& self, bool unbiased) {
   TORCH_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
               "var only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));