Add half tensor support to CUDA randperm

Will Feng · Will Feng · commit a656bd0eb09a · 2018-05-31T14:45:40.000-04:00
diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu
@@ -1,5 +1,6 @@
 #include "ATen/ATen.h"
 #include "ATen/NativeFunctions.h"
+#include "ATen/cuda/CUDATypeConversion.cuh"
 
 #include <THC/THCGeneral.h>
 #include <THC/THCThrustAllocator.cuh>
@@ -42,27 +43,44 @@ Tensor& randperm_out_cuda(Tensor& result, int64_t n, Generator* generator) {
     throw std::runtime_error(oss.str());
   }
 
-  result.resize_({n});
+  auto result_tmp = result;
+  if (result.type().scalarType() == at::ScalarType::Half) {
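+    // Check that n is representable as Half (presumably toHalf() rejects out-of-range n - TODO confirm). NOTE(review): assert is compiled out under NDEBUG.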
+    assert(Scalar(n).toHalf());
+    result_tmp = CUDA(kFloat).tensor();
+  }
+  result_tmp.resize_({n});
 
   if (n < 30000) {  // For small inputs, we offload it to CPU instead.
-    auto result_cpu = result.type().toBackend(kCPU).tensor({n});
+    auto result_cpu = result_tmp.type().toBackend(kCPU).tensor({n});
     randperm_out(result_cpu, n, generator);
-    result = result.type().copy(result_cpu);
+    result_tmp = result_tmp.type().copy(result_cpu);
   } else {
     // Generate random values for the keys array
-    auto keys = result.type().tensor(result.sizes()).random_(generator);
+    AT_DISPATCH_ALL_TYPES(
+      result_tmp.type(), "randperm_out_cuda", [&] {
+        using cuda_scalar_t = cuda::type<scalar_t>;
+
+        auto keys = result_tmp.type().tensor(result_tmp.sizes()).random_(generator);
 
-    auto result_data = thrust::device_ptr<int64_t>(result.data<int64_t>());
-    auto keys_data = thrust::device_ptr<int64_t>(keys.data<int64_t>());
+        auto result_data = thrust::device_ptr<cuda_scalar_t>(result_tmp.data<cuda_scalar_t>());
+        auto keys_data = thrust::device_ptr<cuda_scalar_t>(keys.data<cuda_scalar_t>());
 
-    auto state = globalContext().getTHCState();
-    THCThrustAllocator thrustAlloc(state);
-    auto policy = thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state));
+        auto state = globalContext().getTHCState();
+        THCThrustAllocator thrustAlloc(state);
+        auto policy = thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state));
 
-    thrust::sequence(policy, result_data, result_data + n);
+        thrust::sequence(policy, result_data, result_data + n);
+
+        // Use the sorted order of keys to rearrange the result array
+        thrust::sort_by_key(policy, keys_data, keys_data + n, result_data);
+      }
+    );
+  }
 
-    // Use the sorted order of keys to rearrange the result array
-    thrust::sort_by_key(policy, keys_data, keys_data + n, result_data);
+  if (result.type().scalarType() == at::ScalarType::Half) {
+    result.resize_({n});
+    result.copy_(result_tmp);
   }
 
   return result;
diff --git a/test/test_cuda.py b/test/test_cuda.py
@@ -1652,6 +1652,18 @@ def test_randperm_cuda(self):
         torch.randperm(100000, out=res2, device=cuda)
         self.assertEqual(res1, res2, 0)
 
+        with torch.random.fork_rng(devices=[0]):
+            res1 = torch.randperm(100, dtype=torch.half, device=cuda)
+        res2 = torch.cuda.HalfTensor()
+        torch.randperm(100, out=res2, device=cuda)
+        self.assertEqual(res1, res2, 0)
+
+        with torch.random.fork_rng(devices=[0]):
+            res1 = torch.randperm(50000, dtype=torch.half, device=cuda)
+        res2 = torch.cuda.HalfTensor()
+        torch.randperm(50000, out=res2, device=cuda)
+        self.assertEqual(res1, res2, 0)
+
         # randperm of 0 elements is an empty tensor
         res1 = torch.randperm(0, device=cuda)
         res2 = torch.cuda.LongTensor(5)