
Commit c790fd2

nikitaved authored and facebook-github-bot committed
ATen lu_unpack. Required for making torch.lu_solve differentiable. (#46913)
Summary: Backward methods for `torch.lu` and `torch.lu_solve` require `torch.lu_unpack`. `torch.lu` is a Python wrapper over a native function, so its gradient can be implemented with an `autograd.Function`; `torch.lu_solve`, however, is itself a native function and cannot call `torch.lu_unpack`, which is implemented in Python. This PR therefore adds a native (ATen) implementation of `lu_unpack`. With it, the gradients of `torch.lu` can also be updated so that backward+JIT is supported (`autograd.Function` is not scriptable). ~~The interface for this method is different from the original `torch.lu_unpack`, so it is decided to keep it hidden.~~

Pull Request resolved: #46913

Reviewed By: albanD

Differential Revision: D28355725

Pulled By: mruberry

fbshipit-source-id: 281260f3b6e93c15b08b2ba66d5a221314b00e78
1 parent 32acc96 commit c790fd2

17 files changed: +452 −141 lines
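
For context, a minimal usage sketch of the operator this commit adds (illustrative only; it assumes the Python-level `torch.lu_unpack` binding dispatches to the new ATen implementation):

import torch

# torch.lu returns the packed LU factorization together with the
# LAPACK-style (1-based, int32) pivots.
A = torch.randn(2, 4, 4, dtype=torch.double)
LU, pivots = A.lu()

# Unpack into an explicit permutation matrix P and the triangular factors L, U.
P, L, U = torch.lu_unpack(LU, pivots)

# The factors reconstruct the original input: A == P @ L @ U.
assert torch.allclose(A, P @ L @ U)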

aten/src/ATen/native/LinearAlgebra.cpp

Lines changed: 138 additions & 0 deletions
@@ -22,6 +22,8 @@
 #include <functional>
 #include <limits>
 #include <numeric>
+#include <ATen/NamedTensorUtils.h>
+#include <ATen/native/TensorIterator.h>

 namespace at {
 namespace native {
@@ -2722,6 +2724,142 @@ struct KronImpl final {
 };
 }

+DEFINE_DISPATCH(unpack_pivots_stub);
+
+std::tuple<Tensor, Tensor, Tensor> lu_unpack(
+    const Tensor& LU_data,
+    const Tensor& LU_pivots,
+    bool unpack_data,
+    bool unpack_pivots
+) {
+  TORCH_CHECK(LU_pivots.is_contiguous() && (LU_pivots.scalar_type() == at::kInt),
+    "lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype. "
+    "Note: this function is intended to be used with the output produced by torch{.linalg}.lu");
+
+  // trivial case
+  if (!unpack_data && !unpack_pivots) {
+    return std::make_tuple(Tensor(), Tensor(), Tensor());
+  }
+
+  Tensor L, U;
+  // In the generalized LU factorization, the following shape relations hold:
+  // A.shape[-2:] == (m, n),
+  // P.shape[-2:] == (m, m),
+  // L.shape[-2:] == (m, k),
+  // U.shape[-2:] == (k, n),
+  // where k = min(m, n).
+  int64_t m = LU_data.size(-2);
+  int64_t n = LU_data.size(-1);
+  int64_t k = std::min(m, n);
+
+  if (unpack_data) {
+    U = LU_data.triu();
+    if (m != k) {
+      U = U.narrow(-2, 0, k);
+    }
+
+    L = LU_data.tril();
+    if (k != n) {
+      L = L.narrow(-1, 0, k);
+    }
+    L.diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).fill_(1);
+  }
+
+  if (!unpack_pivots) {
+    return std::make_tuple(Tensor(), L, U);
+  }
+
+  auto unpacked_pivots_sizes = LU_pivots.sizes().vec();
+  unpacked_pivots_sizes[LU_pivots.dim() - 1] = m;
+  auto unpacked_pivots = at::empty(
+    unpacked_pivots_sizes,
+    LU_pivots.options().memory_format(at::MemoryFormat::Contiguous)
+  );
+
+  // Fill `unpacked_pivots` with the identity permutation.
+  auto id_perm = at::arange(m, LU_pivots.options());
+  unpacked_pivots.copy_(id_perm);
+
+  // WARNING: we assume that unchanged LAPACK pivots are provided.
+  // Since LAPACK relies on FORTRAN's 1-based indexing,
+  // we subtract 1 to convert the pivots to C-style 0-based indexing.
+  // This behaviour could change in the future.
+  auto LU_pivots_zero_idx = LU_pivots - 1;
+
+  auto iter = TensorIteratorConfig()
+    .set_check_mem_overlap(false)
+    .check_all_same_dtype(false)
+    .resize_outputs(false)
+    .declare_static_shape(LU_pivots.sizes(), /*squash_dim=*/LU_pivots.dim() - 1)
+    .add_output(unpacked_pivots)
+    .add_input(LU_pivots_zero_idx)
+    .build();
+
+  unpack_pivots_stub(
+    LU_pivots.device().type(),
+    iter,
+    LU_pivots.size(-1)
+  );
+
+  // The permutation matrix is converted to LU_data.dtype
+  // because `matmul` does not work with integer matrices.
+  unpacked_pivots_sizes.push_back(m);
+  auto permutation_matrix = at::zeros(
+    unpacked_pivots_sizes,
+    LU_data.options().memory_format(at::MemoryFormat::Contiguous)
+  );
+
+  // Now that we know the final permutation,
+  // scatter 1s at the proper locations.
+  permutation_matrix.scatter_(
+    -2,
+    unpacked_pivots.unsqueeze(-2).to(at::kLong),
+    at::ones({1}, permutation_matrix.options()).expand(permutation_matrix.sizes())
+  );
+
+  return std::make_tuple(permutation_matrix, L, U);
+}
+
+using TupleTensorRefs3 = std::tuple<Tensor&, Tensor&, Tensor&>;
+
+TupleTensorRefs3 lu_unpack_out(
+    const Tensor& LU_data,
+    const Tensor& LU_pivots,
+    bool unpack_data,
+    bool unpack_pivots,
+    Tensor& P,
+    Tensor& L,
+    Tensor& U
+) {
+  Tensor P_tmp, L_tmp, U_tmp;
+  std::tie(P_tmp, L_tmp, U_tmp) = at::lu_unpack(LU_data, LU_pivots, unpack_data, unpack_pivots);
+
+  if (unpack_pivots) {
+    checkSameDevice("lu_unpack", P, LU_data, "P");
+    // Note that lu_unpack returns P such that P.dtype == LU_data.dtype,
+    // because otherwise we cannot use P in matrix products (no int -> float promotion).
+    checkLinalgCompatibleDtype("lu_unpack", P, LU_data, "P");
+
+    at::native::resize_output(P, P_tmp.sizes());
+    P.copy_(P_tmp);
+  }
+
+  if (unpack_data) {
+    checkSameDevice("lu_unpack", L, LU_data, "L");
+    checkSameDevice("lu_unpack", U, LU_data, "U");
+    checkLinalgCompatibleDtype("lu_unpack", L, LU_data, "L");
+    checkLinalgCompatibleDtype("lu_unpack", U, LU_data, "U");
+
+    at::native::resize_output(L, L_tmp.sizes());
+    at::native::resize_output(U, U_tmp.sizes());
+    L.copy_(L_tmp);
+    U.copy_(U_tmp);
+  }
+
+  return TupleTensorRefs3(P, L, U);
+}
+
 /*
 Calculates the Kronecker product between two Tensors.
 */
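
As an illustration of the shape contract described in the comment above (a sketch, not part of the diff): a rectangular input with m = 5, n = 3 gives k = 3, so P is (5, 5), L is (5, 3) with a unit diagonal, and U is (3, 3):

import torch

m, n = 5, 3                                   # k = min(m, n) = 3
A = torch.randn(m, n, dtype=torch.double)
LU, pivots = A.lu()

P, L, U = torch.lu_unpack(LU, pivots)
print(P.shape, L.shape, U.shape)              # (5, 5), (5, 3), (3, 3)

# L carries the implicit unit diagonal filled in by lu_unpack.
assert torch.allclose(A, P @ L @ U)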

aten/src/ATen/native/LinearAlgebra.h

Lines changed: 7 additions & 0 deletions
@@ -13,4 +13,11 @@ DECLARE_DISPATCH(addr_fn, addr_stub);
 using linalg_vector_norm_fn = void(*)(TensorIterator &, Scalar);
 DECLARE_DISPATCH(linalg_vector_norm_fn, linalg_vector_norm_stub);

+using unpack_pivots_fn = void(*)(
+  TensorIterator& iter,
+  int64_t dim_size
+);
+DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub);
+
+
 }} // namespace at::native

aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp

Lines changed: 35 additions & 0 deletions
@@ -123,11 +123,46 @@ static void linalg_vector_norm_kernel_cpu(TensorIterator& iter, Scalar ord) {
   });
 }

+void unpack_pivots_cpu_kernel(
+  TensorIterator& iter,
+  int64_t dim_size
+) {
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  auto loop = [&](char** data, const int64_t* strides, int64_t nelems) {
+    auto* unpacked_pivots_ptr = data[0];
+    const auto* pivots_ptr = data[1];
+
+    for (int64_t elem = 0; elem < nelems; ++elem) {
+      // WARNING: torch.lu returns int32 pivots,
+      // this behavior could change in the future.
+      auto* unpacked_pivots_data = reinterpret_cast<int32_t*>(unpacked_pivots_ptr);
+      auto* pivots_data = reinterpret_cast<const int32_t*>(pivots_ptr);
+
+      for (int64_t i = 0; i < dim_size; ++i) {
+        std::swap(
+          unpacked_pivots_data[i],
+          unpacked_pivots_data[pivots_data[i]]
+        );
+      }
+
+      unpacked_pivots_ptr += strides[0];
+      pivots_ptr += strides[1];
+    }
+  };
+
+  iter.for_each(loop);
+}
+
 } // anonymous namespace

 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 REGISTER_DISPATCH(addr_stub, &addr_kernel);
 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 REGISTER_DISPATCH(linalg_vector_norm_stub, &linalg_vector_norm_kernel_cpu);
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+REGISTER_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel);

 }} // namespace at::native
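
The CPU kernel above turns LAPACK-style row swaps into a full permutation by applying them sequentially to an identity permutation. A pure-Python reference of that loop (a hypothetical helper for illustration; it expects pivots already shifted to 0-based indexing, as `lu_unpack` does before calling the stub):

def unpack_pivots_reference(pivots, m):
    # Start from the identity permutation of m rows.
    perm = list(range(m))
    # Apply each recorded swap in order: row i was exchanged with row pivots[i].
    for i, p in enumerate(pivots):
        perm[i], perm[p] = perm[p], perm[i]
    return perm

# Example: 0-based pivots [2, 2, 2] for m = 3 rows give the permutation [2, 0, 1].
print(unpack_pivots_reference([2, 2, 2], 3))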

aten/src/ATen/native/cuda/LinearAlgebra.cu

Lines changed: 79 additions & 0 deletions
@@ -575,9 +575,88 @@ static void linalg_vector_norm_kernel_cuda(TensorIterator& iter, Scalar ord) {
   });
 }

+template <int n_threads, int n_elems_per_thread, typename func_t>
+C10_LAUNCH_BOUNDS_2(n_threads, n_elems_per_thread)
+__global__ void _elementwise_kernel(int total_n_elems, func_t f) {
+  constexpr int total_work_block = n_threads * n_elems_per_thread;
+  int idx = total_work_block * blockIdx.x + threadIdx.x;
+
+  #pragma unroll
+  for (int i = 0; i < n_elems_per_thread; ++i) {
+    if (idx < total_n_elems) {
+      f(idx);
+      idx += n_threads;
+    }
+  }
+}
+
+template <int n_threads, int n_elems_per_thread, typename func_t>
+static void _launch_kernel(int total_n_elems, func_t f) {
+  TORCH_INTERNAL_ASSERT(
+    total_n_elems >= 0 && total_n_elems <= std::numeric_limits<int32_t>::max()
+  );
+
+  dim3 block(n_threads);
+  constexpr int total_work_block = n_threads * n_elems_per_thread;
+  dim3 grid((total_n_elems + total_work_block - 1) / total_work_block);
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+  _elementwise_kernel<n_threads, n_elems_per_thread, func_t>
+    <<<grid, block, 0, stream>>>(total_n_elems, f);
+  AT_CUDA_CHECK(cudaGetLastError());
+}
+
+void _unpack_pivots_internal_kernel(
+  TensorIterator& iter,
+  int64_t dim_size
+) {
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      _unpack_pivots_internal_kernel(sub_iter, dim_size);
+    }
+    return;
+  }
+
+  auto offset_calculator = make_offset_calculator<2>(iter);
+
+  char* unpacked_pivots_ptr = reinterpret_cast<char*>(iter.data_ptr(0));
+  const char* const __restrict__ pivots_ptr = reinterpret_cast<const char*>(iter.data_ptr(1));
+
+  auto loop = [=]C10_DEVICE(int i) {
+    auto offsets = offset_calculator.get(i);
+
+    auto* unpacked_pivots_data = reinterpret_cast<int32_t*>(
+      unpacked_pivots_ptr + offsets[0]);
+    const auto* const __restrict__ pivots_data = reinterpret_cast<const int32_t*>(
+      pivots_ptr + offsets[1]);
+
+    // QUESTION: can we mix 64bit offsets with 32bit Iterator indexing?
+    for (int64_t i = 0; i < dim_size; ++i) {
+      thrust::swap(
+        unpacked_pivots_data[i],
+        unpacked_pivots_data[pivots_data[i]]
+      );
+    }
+  };
+
+  _launch_kernel<num_threads, thread_work_size>(iter.numel(), loop);
+}
+
+void unpack_pivots_cuda_kernel(
+  TensorIterator& iter,
+  int64_t dim_size
+) {
+  _unpack_pivots_internal_kernel(iter, dim_size);
+}
+
 } // anonymous namespace

 REGISTER_DISPATCH(addr_stub, &addr_kernel_cuda);
 REGISTER_DISPATCH(linalg_vector_norm_stub, &linalg_vector_norm_kernel_cuda);
+REGISTER_DISPATCH(unpack_pivots_stub, &unpack_pivots_cuda_kernel);

 }}

aten/src/ATen/native/native_functions.yaml

Lines changed: 10 additions & 0 deletions
@@ -6417,6 +6417,16 @@
   dispatch:
     CompositeExplicitAutograd: lu_solve

+- func: lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U)
+  variants: function
+  dispatch:
+    CPU, CUDA: lu_unpack
+
+- func: lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
+  variants: function
+  dispatch:
+    CPU, CUDA: lu_unpack_out
+
 # TODO: remove dispatch section when porting TH CUDA to ATen
 - func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
   dispatch:
test/test_autograd.py

Lines changed: 0 additions & 26 deletions
@@ -8234,32 +8234,6 @@ def test_logcumsumexp_large_value(self, device):
         gradcheck(lambda x: x.logcumsumexp(2), a)
         gradgradcheck(lambda x: x.logcumsumexp(2), a)

-    @slowTest
-    def test_lu_backward(self, device):
-        def run_test(*sizes):
-            x = torch.rand(*sizes, device=device, dtype=torch.double).requires_grad_(True)
-
-            gradcheck(lambda x: x.lu(get_infos=True), x)
-            gradgradcheck(lambda x: x.lu(get_infos=True), x)
-
-            gradcheck(lambda x: x.lu(get_infos=False), x)
-            gradgradcheck(lambda x: x.lu(get_infos=False), x)
-
-            # there is no pivot-less LU factorization on CPU
-            if x.device.type == 'cuda':
-                gradcheck(lambda x: x.lu(pivot=False, get_infos=True), x)
-                gradgradcheck(lambda x: x.lu(pivot=False, get_infos=True), x)
-
-                gradcheck(lambda x: x.lu(pivot=False, get_infos=False), x)
-                gradgradcheck(lambda x: x.lu(pivot=False, get_infos=False), x)
-
-        run_test(3, 3)
-        run_test(3, 3, 3)
-        run_test(3, 3, 3, 3)
-        run_test(5, 5)
-        run_test(3, 5, 5)
-        run_test(3, 3, 5, 5)
-
     def test_strided_leaf_grad_layout(self, device):
         # (1) If leaf is non-overlapping and dense, grad's layout should match its leaf.
         for fmt_a in (torch.contiguous_format, torch.channels_last):
