pytorch
diff --git a/‎aten/src/ATen/NestedTensorImpl.cpp‎
Lines changed: 3 additions & 0 deletions b/‎aten/src/ATen/NestedTensorImpl.cpp‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎aten/src/ATen/core/interned_strings.h‎
Lines changed: 3 additions & 2 deletions b/‎aten/src/ATen/core/interned_strings.h‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎aten/src/ATen/native/Blas.cpp‎
Lines changed: 1 addition & 1 deletion b/‎aten/src/ATen/native/Blas.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎aten/src/ATen/native/LinearAlgebra.cpp‎
Lines changed: 18 additions & 3 deletions b/‎aten/src/ATen/native/LinearAlgebra.cpp‎
Lines changed: 18 additions & 3 deletions
diff --git a/‎aten/src/ATen/native/SoftMax.cpp‎
Lines changed: 2 additions & 1 deletion b/‎aten/src/ATen/native/SoftMax.cpp‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎aten/src/ATen/native/cuda/PersistentSoftmax.cuh‎
Lines changed: 5 additions & 10 deletions b/‎aten/src/ATen/native/cuda/PersistentSoftmax.cuh‎
Lines changed: 5 additions & 10 deletions
diff --git a/‎aten/src/ATen/test/basic.cpp‎
Lines changed: 1 addition & 3 deletions b/‎aten/src/ATen/test/basic.cpp‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎binaries/bench_gen/bench_gen.py‎
Lines changed: 1 addition & 1 deletion b/‎binaries/bench_gen/bench_gen.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎caffe2/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎caffe2/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎caffe2/__init__.py‎
Lines changed: 8 additions & 0 deletions b/‎caffe2/__init__.py‎
Lines changed: 8 additions & 0 deletions
@@ -23,6 +23,9 @@ NestedTensorImpl::NestedTensorImpl(
           buffer.device()),
       buffer_(std::move(buffer)),
       nested_size_tensor_(std::move(nested_size_tensor)) {
+  TORCH_WARN_ONCE(
+      "The PyTorch API of nested tensors is in prototype stage and will change "
+      "in the near future.");
   TORCH_INTERNAL_ASSERT(nested_size_tensor_.is_contiguous());
   int64_t size_dim = nested_size_tensor_.dim();
   TORCH_INTERNAL_ASSERT(size_dim == 0 || size_dim == 2);
 
@@ -247,7 +247,7 @@ namespace c10 {
   _(onnx, Less)                      \
   _(onnx, LessOrEqual)               \
   _(onnx, Not)                       \
-  _(onnx, ATen)                      \
+  _(aten, ATen)                      \
   _(onnx, Split)                     \
   _(onnx, ConstantOfShape)           \
   _(onnx, Cast)                      \
@@ -316,7 +316,8 @@ namespace c10 {
   _(attr, new_axis)                  \
   _(attr, warn_id)                   \
   _(attr, allowzero)                 \
-  _(attr, seen_none)
+  _(attr, seen_none)                 \
+  _(attr, overload_name)
 
 enum class _keys : unique_t {
     #define DEFINE_KEY(ns, s) ns##_##s,
 
@@ -165,7 +165,7 @@ Tensor dot(const Tensor &self, const Tensor &other){
     return r;
   }
 
-  return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::BFloat16, self.scalar_type(), "dot", [&] {
+  return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "dot", [&] {
     Tensor result = at::empty({}, self.options());
     result.fill_(dot_impl<scalar_t>(self.numel(), self.data_ptr<scalar_t>(), self.stride(0), other.data_ptr<scalar_t>(), other.stride(0)));
     return result;
 
@@ -1223,7 +1223,7 @@ static void addmm_impl_cpu_(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c.is_conj());
 
   // Apply BLAS routine
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16,
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16,
       result.scalar_type(), "addmm_impl_cpu_",
       [&]{
         at::native::cpublas::gemm(
@@ -1428,6 +1428,20 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
   // is_bmm_out: true for bmm_out, false for baddbmm_
   // self_or_result is "self" for baddbmm_ and "result" for bmm_out
   Tensor& self_or_result = const_cast<Tensor&>(self_or_result_);
+  CheckedFrom c = (is_bmm_out ? "bmm" : "baddbmm");
+
+  auto checkOnCPU = [](const Tensor& t, CheckedFrom c) {
+    TORCH_CHECK(
+        !t.is_cuda(),
+        "Expect tensor to have CPU backend, but got tensor with ",
+        toString(t.options().backend()),
+        " Backend (while checking arguments for ",
+        c);
+  };
+
+  checkOnCPU(self_or_result, c);
+  checkOnCPU(batch1, c);
+  checkOnCPU(batch2, c);
 
   const auto batch1_sizes = batch1.sizes();
   const auto batch2_sizes = batch2.sizes();
@@ -1464,15 +1478,16 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
 
   if (contraction_size * res_rows * res_cols < 400) {
     if (is_bmm_out) {
-      AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, batch1.scalar_type(), "bmm", [&] {
+      AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, batch1.scalar_type(), "bmm", [&] {
           baddbmm_cpu_kernel<scalar_t, true>(self_or_result, batch1, batch2, beta, alpha);
         });
     } else {
-      AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, batch1.scalar_type(), "baddbmm", [&] {
+      AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, batch1.scalar_type(), "baddbmm", [&] {
           baddbmm_cpu_kernel<scalar_t, false>(self_or_result, batch1, batch2, beta, alpha);
         });
     }
   } else if (at::hasMKL() && ((
+            self_or_result.scalar_type() != kHalf &&
             self_or_result.scalar_type() != kBFloat16 &&
             at::native::is_floating_point(self_or_result)) ||
             at::native::is_complex(self_or_result))
 
@@ -9,6 +9,7 @@
 #include <ATen/NamedTensorUtils.h>
 
 #include <c10/core/TensorOptions.h>
+#include <c10/macros/Macros.h>
 #include <c10/util/irange.h>
 
 namespace at {
@@ -148,7 +149,7 @@ void host_softmax(
   int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
   parallel_for(
       0, outer_size * inner_size, grain_size,
-      [&](int64_t begin, int64_t end) {
+      [&](int64_t begin, int64_t end) __ubsan_ignore_float_divide_by_zero__ {
         for (const auto i : c10::irange(begin, end)) {
           int64_t outer_idx = i / inner_size;
           int64_t inner_idx = i % inner_size;
 
@@ -167,6 +167,11 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
                         elements[i][it] = std::exp(elements[i][it] - max_value[i]);
                         sum[i] += elements[i][it];
                     }
+                } else {
+                  if (!is_log_softmax) {
+                    // Masked values are treated as -infinity, and std::exp(-infinity) is 0.
+                    elements[i][it] = 0;
+                  }
                 }
             }
         }
@@ -183,16 +188,6 @@ __global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batc
         for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
             int element_index = local_idx + it * WARP_SIZE;
             if (element_index < element_count) {
-                if (is_masked) {
-                    int idx = it*WARP_SIZE;
-                    if (!is_transformer_mask) {
-                        idx += i*element_count;
-                    }
-                    if (mask[idx]) {
-                        dst[i*element_count+it*WARP_SIZE] = 0;
-                        continue;
-                    }
-                }
                 if (is_log_softmax) {
                     dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i];
                 } else {
 
@@ -41,9 +41,7 @@ void TestOnesAndDot(DeprecatedTypeProperties& type) {
   Tensor b = ones({3, 4}, type);
   ASSERT_EQ_RESOLVED((b + b).sum().item<double>(), 24);
   ASSERT_EQ_RESOLVED(b.numel(), 12);
-  if (type.backend() != Backend::CPU || type.scalarType() != kHalf) {
-    ASSERT_EQ_RESOLVED(b.view(-1).dot(b.view(-1)).item<double>(), 12);
-  }
+  ASSERT_EQ_RESOLVED(b.view(-1).dot(b.view(-1)).item<double>(), 12);
 }
 
 void TestSort(DeprecatedTypeProperties& type) {
 
@@ -59,7 +59,7 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Utilitity to generate Caffe2 benchmark models.")
+        description="Utility to generate Caffe2 benchmark models.")
     parser.add_argument("operator", help="Caffe2 operator to benchmark.")
     parser.add_argument("-b", "--blob",
                         help="Instantiate a blob --blob name=dim1,dim2,dim3",
 
@@ -1943,6 +1943,8 @@ if(BUILD_PYTHON)
   # ---[ Python.
   if(BUILD_CAFFE2)
   add_library(caffe2_pybind11_state MODULE ${Caffe2_CPU_PYTHON_SRCS})
+  target_compile_definitions(torch PRIVATE BUILD_CAFFE2)
+  target_compile_definitions(torch_python PRIVATE BUILD_CAFFE2)
   if(USE_NUMPY)
     target_compile_options(caffe2_pybind11_state PRIVATE "-DUSE_NUMPY")
     target_link_libraries(caffe2_pybind11_state  PRIVATE numpy::numpy)
 
@@ -0,0 +1,8 @@
+import warnings
+
+
+try:
+    from caffe2.proto import caffe2_pb2  # availability probe; the name is not used again in this file
+except ImportError:  # downgrade the failed import to a warning instead of raising
+    warnings.warn("Caffe2 was not built with this PyTorch build. "
+                  "Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.")
Original file line number	Diff line number	Diff line change
`@@ -165,7 +165,7 @@ Tensor dot(const Tensor &self, const Tensor &other){`
`165`	`165`	`return r;`
`166`	`166`	`}`
`167`	`167`
`168`		`- return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::BFloat16, self.scalar_type(), "dot", [&] {`
	`168`	`+ return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "dot", [&] {`
`169`	`169`	`Tensor result = at::empty({}, self.options());`
`170`	`170`	`result.fill_(dot_impl<scalar_t>(self.numel(), self.data_ptr<scalar_t>(), self.stride(0), other.data_ptr<scalar_t>(), other.stride(0)));`
`171`	`171`	`return result;`