42 changes: 21 additions & 21 deletions aten/src/ATen/BatchingRegistrations.cpp
@@ -187,19 +187,19 @@ std::vector<Tensor> chunk_batching_rule(const Tensor& self, int64_t chunks, int6
return result;
}

-Tensor clamp_batching_rule(const Tensor& self, optional<Scalar> min, optional<Scalar> max) {
+Tensor clamp_batching_rule(const Tensor& self, const optional<Scalar>& min, const optional<Scalar>& max) {
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
auto result = at::clamp(self_physical.tensor(), min, max);
return self_physical.getPhysicalToLogicalMap().apply(result);
}

-Tensor clamp_min_batching_rule(const Tensor& self, Scalar min) {
+Tensor clamp_min_batching_rule(const Tensor& self, const Scalar& min) {
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
auto result = at::clamp_min(self_physical.tensor(), min);
return self_physical.getPhysicalToLogicalMap().apply(result);
}

-Tensor clamp_max_batching_rule(const Tensor& self, Scalar max) {
+Tensor clamp_max_batching_rule(const Tensor& self, const Scalar& max) {
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
auto result = at::clamp_max(self_physical.tensor(), max);
return self_physical.getPhysicalToLogicalMap().apply(result);
@@ -233,7 +233,7 @@ Tensor unsqueeze_batching_rule(const Tensor& self, int64_t dim) {
return self_physical.getPhysicalToLogicalMap().apply(result);
}

-Tensor& fill_inplace_scalar_batching_rule(Tensor& self, Scalar value) {
+Tensor& fill_inplace_scalar_batching_rule(Tensor& self, const Scalar& value) {
auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
self_physical.tensor().fill_(value);
return self;
@@ -708,7 +708,7 @@ Tensor unwrap_and_call_method(const Tensor& input, ExtraArgs... extra_args) {
return makeBatched(output_physical, BatchDims(old_bdims.begin(), old_bdims.end()));
}

-Tensor pow_scalar_Tensor_batching_rule(Scalar other, const Tensor& self) {
+Tensor pow_scalar_Tensor_batching_rule(const Scalar& other, const Tensor& self) {
auto* self_batched = unsafeGetBatchedImpl(self);
auto output_physical = at::pow(other, self_batched->value());
auto old_bdims = self_batched->bdims();
@@ -1120,36 +1120,36 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
#undef TO_BATCHING_RULE
m.impl("clone", clone_batching_rule);

-using TensorTensorScalarType = Tensor (*)(const Tensor&, const Tensor&, Scalar);
+using TensorTensorScalarType = Tensor (*)(const Tensor&, const Tensor&, const Scalar&);
using TensorTensorType = Tensor (*)(const Tensor&, const Tensor&);
-using TensorScalarType = Tensor (*)(const Tensor&, Scalar);
+using TensorScalarType = Tensor (*)(const Tensor&, const Scalar&);

#define BINARY_POINTWISE(op) \
m.impl(#op".Tensor", binary_pointwise_batching_rule<TensorTensorType, at::op>); \
-m.impl(#op".Scalar", unwrap_and_call<TensorScalarType, at::op, Scalar>);
+m.impl(#op".Scalar", unwrap_and_call<TensorScalarType, at::op, const Scalar&>);
#define BINARY_POINTWISE_VA(op, ...) \
{ \
using Binop = Tensor (*)(const Tensor&, const Tensor&, __VA_ARGS__); \
-using Unop = Tensor (*)(const Tensor&, Scalar, __VA_ARGS__); \
+using Unop = Tensor (*)(const Tensor&, const Scalar&, __VA_ARGS__); \
m.impl(#op".Tensor", binary_pointwise_batching_rule<Binop, at::op, __VA_ARGS__>); \
-m.impl(#op".Scalar", unwrap_and_call<Unop, at::op, Scalar, __VA_ARGS__>); \
+m.impl(#op".Scalar", unwrap_and_call<Unop, at::op, const Scalar&, __VA_ARGS__>); \
}

-BINARY_POINTWISE_VA(add, Scalar);
-BINARY_POINTWISE_VA(sub, Scalar);
-BINARY_POINTWISE_VA(rsub, Scalar);
+BINARY_POINTWISE_VA(add, const Scalar&);
+BINARY_POINTWISE_VA(sub, const Scalar&);
+BINARY_POINTWISE_VA(rsub, const Scalar&);
BINARY_POINTWISE(mul);
BINARY_POINTWISE(div);
{
using Binop = Tensor (*)(const Tensor&, const Tensor&, std::string);
-using Unop = Tensor (*)(const Tensor&, Scalar, std::string);
+using Unop = Tensor (*)(const Tensor&, const Scalar&, std::string);
m.impl("div.Tensor_mode", binary_pointwise_batching_rule<Binop, at::div, std::string>);
-m.impl("div.Scalar_mode", unwrap_and_call<Unop, at::div, Scalar, std::string>);
+m.impl("div.Scalar_mode", unwrap_and_call<Unop, at::div, const Scalar&, std::string>);
}

// at::pow has three out-of-place overloads
m.impl("pow.Tensor_Tensor", binary_pointwise_batching_rule<TensorTensorType, at::pow>);
-m.impl("pow.Tensor_Scalar", unwrap_and_call<TensorScalarType, at::pow, Scalar>);
+m.impl("pow.Tensor_Scalar", unwrap_and_call<TensorScalarType, at::pow, const Scalar&>);
m.impl("pow.Scalar", pow_scalar_Tensor_batching_rule);

m.impl("sigmoid_backward", binary_pointwise_batching_rule<TensorTensorType, at::sigmoid_backward>);
@@ -1158,15 +1158,15 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
binary_pointwise_batching_rule<
TensorTensorScalarType,
at::threshold_backward,
-Scalar>);
+const Scalar&>);

// for at::result_type, call the native::result_type implementation.
// We don't have to do anything special because native::result_type operates
// on the logical shape of the tensors.
m.impl("result_type.Tensor", static_cast<ScalarType (*)(const Tensor&, const Tensor&)>(native::result_type));
-m.impl("result_type.Scalar", static_cast<ScalarType (*)(const Tensor&, Scalar)>(native::result_type));
-m.impl("result_type.Scalar_Tensor", static_cast<ScalarType (*)(Scalar, const Tensor&)>(native::result_type));
-m.impl("result_type.Scalar_Scalar", static_cast<ScalarType (*)(Scalar, Scalar)>(native::result_type));
+m.impl("result_type.Scalar", static_cast<ScalarType (*)(const Tensor&, const Scalar&)>(native::result_type));
+m.impl("result_type.Scalar_Tensor", static_cast<ScalarType (*)(const Scalar&, const Tensor&)>(native::result_type));
+m.impl("result_type.Scalar_Scalar", static_cast<ScalarType (*)(const Scalar&, const Scalar&)>(native::result_type));

#undef BINARY_POINTWISE_VA
#undef BINARY_POINTWISE
@@ -1207,7 +1207,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
// Comparison ops
#define COMPARISON_POINTWISE(op) \
m.impl(#op".Tensor", comparison_pointwise_batching_rule<TensorTensorType, at::op>); \
-m.impl(#op".Scalar", unwrap_and_call<TensorScalarType, at::op, Scalar>);
+m.impl(#op".Scalar", unwrap_and_call<TensorScalarType, at::op, const Scalar&>);

COMPARISON_POINTWISE(eq);
COMPARISON_POINTWISE(gt);
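Every hunk in BatchingRegistrations.cpp above makes the same change: Scalar arguments move from pass-by-value to pass-by-const-reference, and the function-pointer aliases and unwrap_and_call template arguments are updated to match. A minimal standalone sketch of the by-value vs. const-reference pattern, using an illustrative FakeScalar stand-in rather than the real c10::Scalar:

#include <cstdint>
#include <iostream>

// Illustrative stand-in for a Scalar-like value: a tagged union a couple of
// machine words wide, so copying it on every call is avoidable overhead.
struct FakeScalar {
  enum class Tag { Int, Double } tag;
  union { int64_t i; double d; } v;
  double to_double() const {
    return tag == Tag::Int ? static_cast<double>(v.i) : v.d;
  }
};

// Old style: the argument object is copied on every call.
double clamp_min_by_value(double x, FakeScalar min) {
  return x < min.to_double() ? min.to_double() : x;
}

// New style, as in the diff: a const reference binds to lvalues and
// temporaries alike, with no copy of the argument object.
double clamp_min_by_cref(double x, const FakeScalar& min) {
  return x < min.to_double() ? min.to_double() : x;
}

int main() {
  FakeScalar zero;
  zero.tag = FakeScalar::Tag::Double;
  zero.v.d = 0.0;
  std::cout << clamp_min_by_value(-1.5, zero) << "\n";  // prints 0
  std::cout << clamp_min_by_cref(-1.5, zero) << "\n";   // prints 0
}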
10 changes: 5 additions & 5 deletions aten/src/ATen/LegacyTHFunctionsCPU.cpp
@@ -442,7 +442,7 @@ Tensor _th_std(const Tensor & self, bool unbiased) {
AT_ERROR("_th_std not supported on CPUType for ", dispatch_scalar_type);
}
}
-Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) {
+Tensor & _th_renorm_out(Tensor & result, const Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm) {
// DeviceGuard omitted
auto dispatch_scalar_type = infer_scalar_type(self);

@@ -468,7 +468,7 @@ Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t
}
return result;
}
-Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) {
+Tensor _th_renorm(const Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm) {
// DeviceGuard omitted
auto dispatch_scalar_type = infer_scalar_type(self);
auto result_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
@@ -493,7 +493,7 @@ Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) {
}
return result;
}
-Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) {
+Tensor & _th_renorm_(Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm) {
// DeviceGuard omitted
auto dispatch_scalar_type = infer_scalar_type(self);

@@ -517,7 +517,7 @@ Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) {
}
return self;
}
-Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max) {
+Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, const Scalar& min, const Scalar& max) {
// DeviceGuard omitted
auto dispatch_scalar_type = infer_scalar_type(self);

@@ -543,7 +543,7 @@ Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scala
}
return result;
}
-Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) {
+Tensor _th_histc(const Tensor & self, int64_t bins, const Scalar& min, const Scalar& max) {
// DeviceGuard omitted
auto dispatch_scalar_type = infer_scalar_type(self);
auto result_ = c10::make_intrusive<TensorImpl, UndefinedTensorImpl>(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release();
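The _th_renorm* and _th_histc* hunks above only change how the Scalar arguments p, maxnorm, min, and max are passed; the semantics are untouched. For orientation, an illustrative plain-C++ sketch of what a renorm-style operation does with those scalar arguments (renorm_rows is a hypothetical helper, not the TH kernel):

#include <cmath>
#include <cstdio>
#include <vector>

// Rescale every row of a 2-D matrix whose p-norm exceeds maxnorm so that
// its norm becomes exactly maxnorm; rows already within the bound are kept.
void renorm_rows(std::vector<std::vector<double>>& m, double p, double maxnorm) {
  for (auto& row : m) {
    double norm = 0.0;
    for (double x : row) norm += std::pow(std::fabs(x), p);
    norm = std::pow(norm, 1.0 / p);
    if (norm > maxnorm) {
      const double scale = maxnorm / norm;
      for (double& x : row) x *= scale;
    }
  }
}

int main() {
  std::vector<std::vector<double>> m = {{3.0, 4.0}, {0.3, 0.4}};
  renorm_rows(m, 2.0, 1.0);                   // first row has L2 norm 5 -> rescaled to 1
  std::printf("%g %g\n", m[0][0], m[0][1]);   // 0.6 0.8
  std::printf("%g %g\n", m[1][0], m[1][1]);   // 0.3 0.4 (already within the bound)
}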
10 changes: 5 additions & 5 deletions aten/src/ATen/LegacyTHFunctionsCPU.h
@@ -30,11 +30,11 @@ std::tuple<Tensor &,Tensor &> _th_mode_out(Tensor & values, Tensor & indices, co
std::tuple<Tensor,Tensor> _th_mode(const Tensor & self, int64_t dim, bool keepdim);
Tensor _th_var(const Tensor & self, bool unbiased);
Tensor _th_std(const Tensor & self, bool unbiased);
-Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
-Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
-Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
-Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max);
-Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max);
+Tensor & _th_renorm_out(Tensor & result, const Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm);
+Tensor _th_renorm(const Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm);
+Tensor & _th_renorm_(Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm);
+Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, const Scalar& min, const Scalar& max);
+Tensor _th_histc(const Tensor & self, int64_t bins, const Scalar& min, const Scalar& max);
std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
std::tuple<Tensor &,Tensor &> _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self);
26 changes: 13 additions & 13 deletions aten/src/ATen/LegacyTHFunctionsCUDA.h
@@ -18,8 +18,8 @@ namespace native {
namespace legacy {
namespace cuda {

-Tensor & _th_masked_fill_(Tensor & self, const Tensor & mask, Scalar value);
-Tensor & _th_masked_fill_bool_(Tensor & self, const Tensor & mask, Scalar value);
+Tensor & _th_masked_fill_(Tensor & self, const Tensor & mask, const Scalar& value);
+Tensor & _th_masked_fill_bool_(Tensor & self, const Tensor & mask, const Scalar& value);
Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source);
Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index);
Tensor _th_take(const Tensor & self, const Tensor & index);
@@ -32,9 +32,9 @@ std::tuple<Tensor &,Tensor &> _th_sort_out_stable(Tensor & values, Tensor & indi
std::tuple<Tensor,Tensor> _th_sort_stable(const Tensor & self, c10::optional<bool> stable, int64_t dim, bool descending);
std::tuple<Tensor &,Tensor &> _th_topk_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted);
std::tuple<Tensor,Tensor> _th_topk(const Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted);
-Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
-Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
-Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
+Tensor & _th_renorm_out(Tensor & result, const Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm);
+Tensor _th_renorm(const Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm);
+Tensor & _th_renorm_(Tensor & self, const Scalar& p, int64_t dim, const Scalar& maxnorm);
Tensor & _th_cross_kernel_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim);
Tensor _th_cross_kernel(const Tensor & self, const Tensor & other, int64_t dim);
std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
@@ -44,10 +44,10 @@ Tensor _th_potri(const Tensor & self, bool upper);
std::tuple<Tensor &,Tensor &> _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self);
std::tuple<Tensor,Tensor> _th_geqrf(const Tensor & self);
Tensor & _th_copy_ignoring_overlaps_(Tensor & self, const Tensor & src);
-Tensor & _thnn_multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
-Tensor _thnn_multi_margin_loss_forward(const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
-Tensor & _thnn_multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
-Tensor _thnn_multi_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, Scalar p, Scalar margin, const Tensor & weight, int64_t reduction);
+Tensor & _thnn_multi_margin_loss_forward_out(Tensor & output, const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const Tensor & weight, int64_t reduction);
+Tensor _thnn_multi_margin_loss_forward(const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const Tensor & weight, int64_t reduction);
+Tensor & _thnn_multi_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const Tensor & weight, int64_t reduction);
+Tensor _thnn_multi_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Scalar& p, const Scalar& margin, const Tensor & weight, int64_t reduction);
std::tuple<Tensor &,Tensor &> _thnn_multilabel_margin_loss_forward_out(Tensor & output, Tensor & is_target, const Tensor & self, const Tensor & target, int64_t reduction);
std::tuple<Tensor,Tensor> _thnn_multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, int64_t reduction);
Tensor & _thnn_multilabel_margin_loss_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target);
@@ -68,10 +68,10 @@ std::tuple<Tensor &,Tensor &> _thnn_log_sigmoid_forward_out(Tensor & output, Ten
std::tuple<Tensor,Tensor> _thnn_log_sigmoid_forward(const Tensor & self);
Tensor & _thnn_log_sigmoid_backward_out(Tensor & grad_input, const Tensor & grad_output, const Tensor & self, const Tensor & buffer);
Tensor _thnn_log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer);
-Tensor & _thnn_rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
-Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
-Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training);
-Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, Scalar lower, Scalar upper, bool training, c10::optional<at::Generator> generator);
+Tensor & _thnn_rrelu_with_noise_forward_out(Tensor & output, const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator);
+Tensor _thnn_rrelu_with_noise_forward(const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator);
+Tensor _thnn_rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training);
+Tensor & _thnn_rrelu_with_noise_forward_(Tensor & self, const Tensor & noise, const Scalar& lower, const Scalar& upper, bool training, c10::optional<at::Generator> generator);
std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_forward_out(Tensor & output, Tensor & columns, Tensor & ones, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding);
std::tuple<Tensor,Tensor,Tensor> _thnn_conv2d_forward(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const Tensor & bias, IntArrayRef stride, IntArrayRef padding);
std::tuple<Tensor &,Tensor &,Tensor &> _thnn_conv2d_backward_out(Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias, const Tensor & grad_output, const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, const Tensor & columns, const Tensor & ones);
6 changes: 3 additions & 3 deletions aten/src/ATen/ScalarOps.cpp
@@ -13,23 +13,23 @@
namespace at {
namespace {
template <typename scalar_t>
-inline void fill_inplace(Tensor& self, Scalar value_scalar) {
+inline void fill_inplace(Tensor& self, const Scalar& value_scalar) {
auto value = value_scalar.to<scalar_t>();
scalar_t* dptr = static_cast<scalar_t*>(self.data_ptr());
*dptr = value;
}
}

namespace detail {
-Tensor& scalar_fill(Tensor& self, Scalar value) {
+Tensor& scalar_fill(Tensor& self, const Scalar& value) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
kHalf, kBool, kBFloat16, self.scalar_type(), "fill_out", [&]() {
fill_inplace<scalar_t>(self, value);
});
return self;
}

-Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Device> device_opt) {
+Tensor scalar_tensor_static(const Scalar& s, c10::optional<ScalarType> dtype_opt, c10::optional<Device> device_opt) {
at::tracer::impl::NoTracerDispatchMode tracer_guard;
at::AutoNonVariableTypeMode non_var_type_mode(true);
auto result = at::detail::empty_cpu({}, dtype_opt, c10::nullopt, device_opt, c10::nullopt, c10::nullopt);
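scalar_fill above uses the AT_DISPATCH_* machinery to turn the runtime scalar type of self into a compile-time scalar_t before calling fill_inplace. A rough standalone sketch of that dispatch idea, with hypothetical names (MyScalarType, dispatch_by_type, fill_first_element) and a tag parameter instead of the macro-injected scalar_t:

#include <cstdint>
#include <stdexcept>

enum class MyScalarType { Float, Double, Long };

// Map a runtime type tag to a static type and invoke a generic callable.
template <typename F>
void dispatch_by_type(MyScalarType t, F&& f) {
  switch (t) {
    case MyScalarType::Float:  f(float{});   break;
    case MyScalarType::Double: f(double{});  break;
    case MyScalarType::Long:   f(int64_t{}); break;
    default: throw std::runtime_error("unsupported scalar type");
  }
}

// Analogous to fill_inplace: write one typed value into an untyped buffer.
void fill_first_element(void* data, MyScalarType t, double value) {
  dispatch_by_type(t, [&](auto tag) {
    using scalar_t = decltype(tag);
    *static_cast<scalar_t*>(data) = static_cast<scalar_t>(value);
  });
}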
6 changes: 3 additions & 3 deletions aten/src/ATen/ScalarOps.h
@@ -11,8 +11,8 @@ namespace detail {
// Ideally this fast pass should be implemented in TensorIterator,
// but we also want to skip compute_types which in not avoidable
// in TensorIterator for now.
-Tensor& scalar_fill(Tensor& self, Scalar value);
-TORCH_API Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Device> device_opt);
+Tensor& scalar_fill(Tensor& self, const Scalar& value);
+TORCH_API Tensor scalar_tensor_static(const Scalar& s, c10::optional<ScalarType> dtype_opt, c10::optional<Device> device_opt);
} // namespace detail
} // namespace at

@@ -21,7 +21,7 @@ namespace c10 {

// FIXME: this should be (and was) Scalar::toTensor, but there is currently no way
// to implement this without going through Derived Types (which are not part of core).
-inline at::Tensor scalar_to_tensor(Scalar s, const Device device = at::kCPU) {
+inline at::Tensor scalar_to_tensor(const Scalar& s, const Device device = at::kCPU) {
// This is the fast track we have for CPU scalar tensors.
if (device == at::kCPU) {
if (s.isFloatingPoint()) {
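A call-site sketch for the const Scalar& version of scalar_to_tensor declared above (assumes a PyTorch/ATen development build where these headers and libtorch are available): numeric literals keep working because they convert to a Scalar temporary, and a temporary binds to a const reference, which it would not do for a non-const Scalar&.

#include <ATen/ATen.h>
#include <ATen/ScalarOps.h>
#include <iostream>

int main() {
  // Both calls read the same before and after the signature change;
  // the literal is implicitly converted to a Scalar temporary.
  at::Tensor a = c10::scalar_to_tensor(3.5);          // double -> Scalar
  at::Tensor b = c10::scalar_to_tensor(7, at::kCPU);  // int    -> Scalar
  std::cout << a.item<double>() << " " << b.item<int64_t>() << "\n";  // 3.5 7
  return 0;
}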