
Commit 0924d8d

add fill_diagonal function (#21892)
Summary: Fixes #21796
Pull Request resolved: #21892
Differential Revision: D16164678
Pulled By: colesbury
fbshipit-source-id: 85df8ae9b7a6a91b6023fe7295b3a8124e4526ea
1 parent 89d6e88 commit 0924d8d

39 files changed: +596 −102 lines changed

aten/src/ATen/Declarations.cwrap

Lines changed: 26 additions & 0 deletions
@@ -6,6 +6,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 device_guard: False
 return: argument 0
 options:
@@ -45,6 +46,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 options:
 - arguments:
 - THTensor* self
@@ -63,6 +65,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 device_guard: False
 return: bool
 arguments:
@@ -172,6 +175,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 variants:
 - function
 return: argument 0
@@ -191,6 +195,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 arguments:
 - THTensor* self
 ]]
@@ -201,6 +206,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 variants:
 - function
 device_guard: False
@@ -217,6 +223,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 variants:
 - function
 return: self
@@ -330,6 +337,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 device_guard: False
 return: argument 0
 arguments:
@@ -622,6 +630,7 @@
 name: _th_lt
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 variants:
 - function
 return: argument 0
@@ -644,6 +653,7 @@
 name: _th_lt_
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 return: self
 variants: function
 options:
@@ -663,6 +673,7 @@
 name: _th_gt
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 variants:
 - function
 return: argument 0
@@ -685,6 +696,7 @@
 name: _th_gt_
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 return: self
 variants: function
 options:
@@ -704,6 +716,7 @@
 name: _th_le
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 variants:
 - function
 return: argument 0
@@ -726,6 +739,7 @@
 name: _th_le_
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 return: self
 variants: function
 options:
@@ -745,6 +759,7 @@
 name: _th_ge
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 variants:
 - function
 return: argument 0
@@ -767,6 +782,7 @@
 name: _th_ge_
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 return: self
 variants: function
 options:
@@ -786,6 +802,7 @@
 name: _th_eq
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 variants:
 - function
 return: argument 0
@@ -808,6 +825,7 @@
 name: _th_eq_
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 return: self
 variants: function
 options:
@@ -827,6 +845,7 @@
 name: _th_ne
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 variants:
 - function
 return: argument 0
@@ -849,6 +868,7 @@
 name: _th_ne_
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 return: self
 variants: function
 options:
@@ -908,6 +928,7 @@
 name: _th_max
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 variants:
 - function
 options:
@@ -928,6 +949,7 @@
 name: _th_max
 cpu_bool: True
 cuda_bool: True
+cuda_bfloat16: True
 variants: function
 options:
 - cname: max
@@ -1003,6 +1025,7 @@
 ]]
 [[
 name: _th_abs
+cuda_bfloat16: True
 cname: abs
 backends:
 - CUDA
@@ -1802,6 +1825,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 variants:
 - function
 arguments:
@@ -2779,6 +2803,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 variants:
 - function
 options:
@@ -2806,6 +2831,7 @@
 cpu_bool: True
 cuda_bool: True
 cpu_bfloat16: True
+cuda_bfloat16: True
 return: self
 arguments:
 - arg: THTensor* self
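
The cuda_bfloat16: True flags above opt the TH-backed comparison ops (_th_lt through _th_ne_), _th_max, and _th_abs into CUDA bfloat16 dispatch. The sketch below shows what this enables from Python; it is illustrative only and assumes a CUDA device and a build with bfloat16 CUDA kernels.

import torch

# Illustrative only: requires a CUDA device and bfloat16 CUDA support.
a = torch.tensor([1.0, 2.0, 3.0], dtype=torch.bfloat16, device="cuda")
b = torch.tensor([3.0, 2.0, 1.0], dtype=torch.bfloat16, device="cuda")

print(torch.lt(a, b))  # elementwise a < b, backed by _th_lt
print(torch.ge(a, b))  # elementwise a >= b, backed by _th_ge
print(a.abs())         # _th_abs is flagged for CUDA bfloat16 above
print(a.max())         # as is _th_max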

aten/src/ATen/core/Tensor.h

Lines changed: 1 addition & 0 deletions
@@ -406,6 +406,7 @@ class CAFFE2_API Tensor {
   Tensor diag_embed(int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1) const;
   Tensor diagflat(int64_t offset=0) const;
   Tensor diagonal(int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const;
+  Tensor & fill_diagonal_(Scalar fill_value, bool wrap=false);
   Tensor div(const Tensor & other) const;
   Tensor & div_(const Tensor & other);
   Tensor div(Scalar other) const;

aten/src/ATen/core/TensorMethods.h

Lines changed: 4 additions & 0 deletions
@@ -281,6 +281,10 @@ inline Tensor Tensor::diagonal(int64_t offset, int64_t dim1, int64_t dim2) const
     static auto table = globalATenDispatch().getOpTable("aten::diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a)");
     return table->getOp<Tensor (const Tensor &, int64_t, int64_t, int64_t)>(tensorTypeIdToBackend(type_id()), is_variable())(*this, offset, dim1, dim2);
 }
+inline Tensor & Tensor::fill_diagonal_(Scalar fill_value, bool wrap) {
+    static auto table = globalATenDispatch().getOpTable("aten::fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!)");
+    return table->getOp<Tensor & (Tensor &, Scalar, bool)>(tensorTypeIdToBackend(type_id()), is_variable())(*this, fill_value, wrap);
+}
 inline Tensor Tensor::div(const Tensor & other) const {
     static auto table = globalATenDispatch().getOpTable("aten::div(Tensor self, Tensor other) -> Tensor");
     return table->getOp<Tensor (const Tensor &, const Tensor &)>(tensorTypeIdToBackend(type_id()), is_variable())(*this, other);
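
The inline method above forwards to the registered aten::fill_diagonal_ schema, which surfaces in Python as Tensor.fill_diagonal_(fill_value, wrap=False). A small usage sketch; the wrap behavior is intended to mirror numpy.fill_diagonal.

import torch

x = torch.zeros(3, 3)
x.fill_diagonal_(5)             # in-place: fills the main diagonal with 5
# tensor([[5., 0., 0.],
#         [0., 5., 0.],
#         [0., 0., 5.]])

y = torch.zeros(7, 3)
y.fill_diagonal_(1, wrap=True)  # wrap=True restarts the diagonal below a tall matrix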

aten/src/ATen/core/aten_interned_strings.h

Lines changed: 1 addition & 0 deletions
@@ -284,6 +284,7 @@ _(aten, diag) \
 _(aten, diag_embed) \
 _(aten, diagflat) \
 _(aten, diagonal) \
+_(aten, fill_diagonal_) \
 _(aten, digamma) \
 _(aten, dim) \
 _(aten, dist) \

aten/src/ATen/cuda/NumericLimits.cuh

Lines changed: 8 additions & 0 deletions
@@ -86,6 +86,14 @@ struct numeric_limits<int64_t> {
 #endif
 };
 
+template <>
+struct numeric_limits<at::BFloat16> {
+  static inline __host__ __device__ at::BFloat16 lowest() { return at::BFloat16(0xFF7F, at::BFloat16::from_bits()); }
+  static inline __host__ __device__ at::BFloat16 max() { return at::BFloat16(0x7F7F, at::BFloat16::from_bits()); }
+  static inline __host__ __device__ at::BFloat16 lower_bound() { return at::BFloat16(0xFF80, at::BFloat16::from_bits()); }
+  static inline __host__ __device__ at::BFloat16 upper_bound() { return at::BFloat16(0x7F80, at::BFloat16::from_bits()); }
+};
+
 template <>
 struct numeric_limits<at::Half> {
   static inline __host__ __device__ at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits()); }
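
The bit patterns in the new numeric_limits<at::BFloat16> specialization can be checked by hand: bfloat16 reuses the float32 sign and exponent layout, so a 16-bit pattern padded with sixteen zero bits is the corresponding float32 value. A short verification sketch, not part of the commit:

import struct

def bfloat16_from_bits(bits: int) -> float:
    # bfloat16 is the upper half of a float32, so shifting the 16-bit
    # pattern into the high bits of a 32-bit word recovers the value.
    return struct.unpack(">f", struct.pack(">I", bits << 16))[0]

print(bfloat16_from_bits(0x7F7F))  # max():         ~3.3895e+38
print(bfloat16_from_bits(0xFF7F))  # lowest():      ~-3.3895e+38
print(bfloat16_from_bits(0x7F80))  # upper_bound(): inf
print(bfloat16_from_bits(0xFF80))  # lower_bound(): -inf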

aten/src/ATen/function_wrapper.py

Lines changed: 1 addition & 0 deletions
@@ -516,6 +516,7 @@ def __getitem__(self, x):
     'with_gil': bool,
     'cpu_half': bool,
     'cpu_bfloat16': bool,
+    'cuda_bfloat16': bool,
     'deprecated': bool,
     'cpu_bool': bool,
     'cuda_bool': bool,

aten/src/ATen/native/TensorFactories.cpp

Lines changed: 49 additions & 0 deletions
@@ -287,6 +287,55 @@ Tensor full_like(const Tensor& self, Scalar fill_value, const TensorOptions& opt
   return native::full(self.sizes(), fill_value, options);
 }
 
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fill diagonal ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Tensor& fill_diagonal_(Tensor& self, Scalar fill_value, bool wrap) {
+  int64_t nDims = self.dim();
+  TORCH_CHECK(nDims >= 2, "dimensions must larger than 1");
+
+  int64_t height = self.size(0);
+  int64_t width = self.size(1);
+
+  if (nDims > 2) {
+    int64_t dim1 = height;
+    for (int64_t i = 1; i < nDims; i++) {
+      if (self.size(i) != dim1) {
+        AT_ERROR("all dimensions of input must be of equal length");
+      }
+    }
+  }
+
+  int64_t storage_offset = self.storage_offset();
+  std::vector<int64_t> sizes;
+  std::vector<int64_t> strides;
+  int64_t size = std::min(height, width);
+
+  int64_t stride = 0;
+  for (int64_t i = 0; i < nDims; i++) {
+    stride += self.stride(i);
+  }
+  strides.push_back(stride);
+  sizes.push_back(size);
+
+  auto main_diag = self.as_strided(sizes, strides, storage_offset);
+  main_diag.fill_(fill_value);
+
+  if (wrap && nDims == 2 && height > width + 1) {
+    std::vector<int64_t> wrap_sizes;
+
+    int64_t step = width + 1;
+    int64_t wrap_size = ((self.numel() + step - 1) / step) - size;
+    wrap_sizes.push_back(wrap_size);
+
+    int64_t offset = self.stride(0) * (width + 1);
+
+    auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset);
+    wrap_diag.fill_(fill_value);
+  }
+
+  return self;
+}
+
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ linspace ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Tensor linspace(
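
The implementation views the diagonal as a one-dimensional strided window over the tensor's storage: summing the strides of all dimensions gives a single step that advances one position along every axis at once, and as_strided plus fill_ writes the value without copying. The rough Python translation below restates the same stride arithmetic with torch.as_strided; it is a sketch for illustration, not code from the commit.

import torch

def fill_diagonal_sketch(self: torch.Tensor, fill_value, wrap: bool = False) -> torch.Tensor:
    n_dims = self.dim()
    assert n_dims >= 2, "dimensions must be larger than 1"

    height, width = self.size(0), self.size(1)
    if n_dims > 2:
        # For more than 2 dims, every dimension must have the same length.
        assert all(self.size(i) == height for i in range(1, n_dims)), \
            "all dimensions of input must be of equal length"

    size = min(height, width)
    # One step along the summed stride moves one step along every axis,
    # i.e. it walks the main diagonal.
    stride = sum(self.stride(i) for i in range(n_dims))
    main_diag = self.as_strided([size], [stride], self.storage_offset())
    main_diag.fill_(fill_value)

    if wrap and n_dims == 2 and height > width + 1:
        # Restart the diagonal one row below the square block, as in
        # numpy.fill_diagonal(wrap=True).
        step = width + 1
        wrap_size = (self.numel() + step - 1) // step - size
        offset = self.stride(0) * (width + 1)
        wrap_diag = self.as_strided([wrap_size], [stride], self.storage_offset() + offset)
        wrap_diag.fill_(fill_value)

    return self

The real torch.Tensor.fill_diagonal_ dispatches to the C++ function above; the sketch only illustrates the stride arithmetic.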

aten/src/ATen/native/TypeProperties.cpp

Lines changed: 3 additions & 0 deletions
@@ -25,6 +25,9 @@ bool is_signed(const Tensor &self) {
   if (self.scalar_type() == ScalarType::Half) {
     return true;
   }
+  if (self.scalar_type() == ScalarType::BFloat16) {
+    return true;
+  }
   return AT_DISPATCH_ALL_TYPES(self.scalar_type(), "is_signed", [&]() -> bool {
     return std::is_signed<scalar_t>();
   });
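
With the extra branch above, is_signed reports bfloat16 as signed, like half precision, instead of falling through to the dispatch macro, which does not cover bfloat16. A quick check from Python, assuming the build can construct CPU bfloat16 tensors:

import torch

print(torch.empty(1, dtype=torch.bfloat16).is_signed())  # True
print(torch.empty(1, dtype=torch.float16).is_signed())   # True
print(torch.empty(1, dtype=torch.uint8).is_signed())     # False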

aten/src/ATen/native/cpu/IndexKernel.cpp

Lines changed: 2 additions & 2 deletions
@@ -93,7 +93,7 @@ void cpu_index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef
 }
 
 void index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride) {
-  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, iter.dtype(), "index_cpu", [&] {
+  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, iter.dtype(), "index_cpu", [&] {
     cpu_index_kernel<scalar_t>(iter, index_size, index_stride, [](char* dst, char* src, int64_t offset) {
       *(scalar_t*)dst = *(scalar_t*)(src + offset);
     });
@@ -102,7 +102,7 @@ void index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef inde
 
 void index_put_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate) {
   // NOTE: duplicate indices are only supported if accumulate is true.
-  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::Bool, iter.dtype(), "index_put", [&] {
+  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, iter.dtype(), "index_put", [&] {
     if (accumulate) {
       // TODO: investigate parallelization of the accumulate kernel. Unlike the non-accumulate case,
       // this needs to be thread-safe.
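
Switching the dispatch macro from AT_DISPATCH_ALL_TYPES_AND2 to AT_DISPATCH_ALL_TYPES_AND3 adds BFloat16 to the set of dtypes the CPU index kernels instantiate. An illustrative sketch, assuming the build can construct and convert CPU bfloat16 tensors:

import torch

x = torch.arange(6, dtype=torch.float32).to(torch.bfloat16)
idx = torch.tensor([0, 3, 5])

print(x[idx])                              # gather goes through index_kernel
x[idx] = torch.ones(3).to(torch.bfloat16)  # scatter goes through index_put_kernel
print(x)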

aten/src/ATen/native/cuda/BinaryOpsKernel.cu

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 namespace at { namespace native {
 
 void add_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) {
-  AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "add_cuda", [&]() {
+  AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "add_cuda", [&]() {
     auto alpha = alpha_scalar.to<scalar_t>();
     gpu_kernel_with_scalars(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
       return a + alpha * b;
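
Likewise, add_kernel_cuda now dispatches on kBFloat16 in addition to kHalf, so elementwise addition (including the alpha form) covers CUDA bfloat16 tensors. A sketch, assuming a CUDA device and a build with bfloat16 CUDA support:

import torch

a = torch.ones(4, dtype=torch.bfloat16, device="cuda")
b = torch.ones(4, dtype=torch.bfloat16, device="cuda")

print(torch.add(a, b, alpha=0.5))  # a + 0.5 * b -> 1.5 everywhere, routed through add_kernel_cuda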
