Merge commit 'aec182ae72d51dad0f46cdfe7ff9a41380d7da35'

soumith · soumith · commit 8b61ee522e63 · 2017-03-04T08:58:21.000-08:00
diff --git a/torch/lib/THC/THCReduceAll.cuh b/torch/lib/THC/THCReduceAll.cuh
@@ -331,7 +331,7 @@ bool THC_reduceAll(THCState* state,
   // If our destination is not on the device, copy the value back to
   // the host (synchronous!)
   if (!outOnDevice) {
-    cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost);
+    THCudaCheck(cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost));
   }
 
   if (freeDevOut) {
diff --git a/torch/lib/THC/THCScanUtils.cuh b/torch/lib/THC/THCScanUtils.cuh
@@ -6,8 +6,8 @@
 // Collection of in-kernel scan / prefix sum utilities
 
 // Inclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency>
-__device__ void inclusivePrefixSum(T* smem, T in, T* out) {
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void inclusivePrefixScan(T* smem, T in, T* out, BinaryFunction binop) {
   // FIXME: this is a slow, simple implementation; need up/down sweep,
   // prevent smem conflicts
   smem[threadIdx.x] = in;
@@ -18,7 +18,7 @@ __device__ void inclusivePrefixSum(T* smem, T in, T* out) {
     T val = 0;
 
     if (threadIdx.x >= offset) {
-      val = smem[threadIdx.x - offset] + smem[threadIdx.x];
+      val = binop(smem[threadIdx.x - offset], smem[threadIdx.x]);
     }
 
     __syncthreads();
@@ -38,11 +38,11 @@ __device__ void inclusivePrefixSum(T* smem, T in, T* out) {
 }
 
 // Exclusive prefix sum using shared memory
-template <typename T, bool KillWARDependency>
-__device__ void exclusivePrefixSum(T* smem, T in, T* out, T* carry) {
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction binop) {
   // FIXME: crappy implementation
   // We kill write-after-read dependencies separately below, hence the `false`
-  inclusivePrefixSum<T, false>(smem, in, out);
+  inclusivePrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
 
   *out -= in;
   *carry = smem[blockDim.x - 1];
@@ -55,8 +55,8 @@ __device__ void exclusivePrefixSum(T* smem, T in, T* out, T* carry) {
 
 // Inclusive prefix sum for binary vars using intra-warp voting +
 // shared memory
-template <typename T, bool KillWARDependency>
-__device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) {
   // Within-warp, we use warp voting.
   T vote = __ballot(in);
   T index = __popc(getLaneMaskLe() & vote);
@@ -77,16 +77,16 @@ __device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
     int current = 0;
     for (int i = 0; i < blockDim.x / 32; ++i) {
       T v = smem[i];
-      smem[i] += current;
-      current += v;
+      smem[i] = binop(smem[i], current);
+      current = binop(current, v);
     }
   }
 
   __syncthreads();
 
   // load the carry from the preceding warp
   if (warp >= 1) {
-    index += smem[warp - 1];
+    index = binop(index, smem[warp - 1]);
   }
 
   *out = index;
@@ -98,9 +98,9 @@ __device__ void inclusiveBinaryPrefixSum(T* smem, bool in, T* out) {
 
 // Exclusive prefix sum for binary vars using intra-warp voting +
 // shared memory
-template <typename T, bool KillWARDependency>
-__device__ void exclusiveBinaryPrefixSum(T* smem, bool in, T* out, T* carry) {
-  inclusiveBinaryPrefixSum<T, false>(smem, in, out);
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, BinaryFunction binop) {
+  inclusiveBinaryPrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
 
   // Inclusive to exclusive
   *out -= (T) in;
diff --git a/torch/lib/THC/THCTensorTopK.cu b/torch/lib/THC/THCTensorTopK.cu
@@ -5,6 +5,7 @@
 #include "THCAsmUtils.cuh"
 #include "THCScanUtils.cuh"
 #include "THCTensorTypeUtils.cuh"
+#include "THCTensorMathReduce.cuh"
 #include <algorithm> // for std::min
 
 #if CUDA_VERSION >= 7000
@@ -322,7 +323,7 @@ __global__ void gatherTopK(TensorInfo<float, IndexType> input,
 
     int index;
     int carry;
-    exclusiveBinaryPrefixSum<int, true>(smem, hasTopK, &index, &carry);
+    exclusiveBinaryPrefixScan<int, true>(smem, hasTopK, &index, &carry, AddOp<int>());
 
     if (hasTopK) {
       int writeIndex = writeIndexStart + index;
@@ -354,7 +355,7 @@ __global__ void gatherTopK(TensorInfo<float, IndexType> input,
 
     int index;
     int carry;
-    exclusiveBinaryPrefixSum<int, true>(smem, hasTopK, &index, &carry);
+    exclusiveBinaryPrefixScan<int, true>(smem, hasTopK, &index, &carry, AddOp<int>());
 
     if (hasTopK && index < topKRemaining) {
       int writeIndex = writeIndexStart + index;
diff --git a/torch/lib/THC/generic/THCTensorMath.cu b/torch/lib/THC/generic/THCTensorMath.cu
@@ -87,8 +87,8 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
   // loop below will overwrite the value
   int maxDim = dimension + 1;
 
-  // ldimension is the actual dimension we cat along (minus 1, for 0-based indexing)
-  int ldimension = dimension;
+  // cat_dimension is the actual dimension we cat along
+  int cat_dimension = dimension;
 
   for (i = 0; i < numInputs; i++)
   {
@@ -100,13 +100,13 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
   // In the event that the user specified -1 as the concat dimension, then
   // we want to pick the maxDim  as dimension to cat along (and thus maxDim - 1 as the
   // value due to 0-based indexing). If the maxDim is // 0 (i.e. we are catting all
-  // empty tensors), then we set ldimension to be 0
+  // empty tensors), then we set cat_dimension to be 0
   if (dimension + TH_INDEX_BASE == -1) {
-    ldimension = maxDim ? (maxDim - 1) : 0;
+    cat_dimension = maxDim ? (maxDim - 1) : 0;
   }
 
   THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs);
-  THArgCheck(ldimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
+  THArgCheck(cat_dimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE);
 
   size = THLongStorage_newWithSize(maxDim);
   for(i = 0; i < maxDim; i++)
@@ -115,7 +115,7 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
     long dimSize = i < THCTensor_(nDimension)(state, inputs[0])
                        ? THCTensor_(size)(state, inputs[0], i)
                        : THMin(THCTensor_(nDimension)(state, inputs[0]), 1);
-    if (i == ldimension)
+    if (i == cat_dimension)
     {
       for (j = 1; j < numInputs; j++)
       {
@@ -203,15 +203,15 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
 
     // Template Declarations for dim = 1, 2, 3, 4
 #define HANDLE_CASE(DIMS) \
-  CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock>>>(data, d_inputs, param, ldimension, param.outputStride[dimension]);
+  CatArrayBatchedCopy<real, unsigned int, DIMS><<<applyGrid, applyBlock>>>(data, d_inputs, param, cat_dimension, param.outputStride[cat_dimension]);
 
     // Now we loop
     offset = 0;
     for (i = 0; i < numInputs; i += CAT_ARRAY_BATCH_SIZE) {
       cohortMax = 0;
       for (j = 0; j < CAT_ARRAY_BATCH_SIZE && (i+j) < numInputs; ++j) {
-        long dimSize = ldimension < THCTensor_(nDimension)(state, inputs[i+j])
-          ? THCTensor_(size)(state, inputs[i+j], ldimension)
+        long dimSize = cat_dimension < THCTensor_(nDimension)(state, inputs[i+j])
+          ? THCTensor_(size)(state, inputs[i+j], cat_dimension)
           : 1;
 
         stackInputs[j].input = THCTensor_(data)(state, inputs[i+j]);
@@ -223,7 +223,7 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
         // update offset
         offset += dimSize;
       }
-      cudaMemcpy(d_inputs, stackInputs, j * sizeof(CatArrInputTensor<real, unsigned int>), cudaMemcpyHostToDevice);
+      THCudaCheck(cudaMemcpy(d_inputs, stackInputs, j * sizeof(CatArrInputTensor<real, unsigned int>), cudaMemcpyHostToDevice));
 
       // Next, let's consider how we set our kernel launch parameters.
       // We borrow from THCApply, which the kernel's internal indexing
@@ -267,12 +267,12 @@ void THCTensor_(catArray)(THCState *state, THCTensor *result,
       // No reason to copy when input is empty
       if (!THCTensor_(nDimension)(state, inputs[j])) continue;
 
-      long dimSize = ldimension < THCTensor_(nDimension)(state, inputs[j])
-               ? THCTensor_(size)(state, inputs[j], ldimension)
+      long dimSize = cat_dimension < THCTensor_(nDimension)(state, inputs[j])
+               ? THCTensor_(size)(state, inputs[j], cat_dimension)
                : 1;
 
       THCTensor *nt = THCTensor_(newWithTensor)(state, result);
-      THCTensor_(narrow)(state, nt, NULL, ldimension, offset, dimSize);
+      THCTensor_(narrow)(state, nt, NULL, cat_dimension, offset, dimSize);
       THCTensor_(copy)(state, nt, inputs[j]);
       THCTensor_(free)(state, nt);
       offset += dimSize;
diff --git a/torch/lib/THC/generic/THCTensorMathBlas.cu b/torch/lib/THC/generic/THCTensorMathBlas.cu
@@ -430,7 +430,7 @@ __global__ void createBatchGemmBuffer(const real** buffer, real* data,
 THC_API void
 THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
                     real alpha, THCTensor *batch1, THCTensor *batch2) {
-#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
   THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
   THArgCheck(THCTensor_(nDimension)(state, t) == 3, 4, "expected 3D tensor");
   THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor");
@@ -522,8 +522,10 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
     ldb = batch2_->stride[1];
   }
 
-  // Compute pointers to matrices in each batch.
   long num_batches = result_->size[0];
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+  // Compute pointers to matrices in each batch.
   size_t matrices_size = num_batches * sizeof(real*);
 
   // Copy pointers to device.
@@ -580,6 +582,24 @@ THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
   THCudaFree(state, d_matrices2);
   THCudaFree(state, d_result_matrices);
 
+#elif defined(THC_REAL_IS_HALF)
+  // cuBLAS currently has no HgemmBatched, so issue one Hgemm call per batch
+  for (long i = 0; i < num_batches; ++i) {
+    THCudaBlas_Hgemm(
+        state,
+        transpose_batch1,
+        transpose_batch2,
+        result_->size[transpose_result ? 2 : 1],
+        result_->size[transpose_result ? 1 : 2],
+        batch1_->size[transpose_result ? 1 : 2],
+        alpha,
+        THCTensor_(data)(state, batch1_) + i * batch1_->stride[0], lda,
+        THCTensor_(data)(state, batch2_) + i * batch2_->stride[0], ldb,
+        beta,
+        THCTensor_(data)(state, result_) + i * result_->stride[0], ldc);
+  }
+#endif
+
   if (batch1_ != batch1) {
     THCTensor_(free)(state, batch1_);
   }

Original file line number	Diff line number	Diff line change
`@@ -331,7 +331,7 @@ bool THC_reduceAll(THCState* state,`
`331`	`331`	`// If our destination is not on the device, copy the value back to`
`332`	`332`	`// the host (synchronous!)`
`333`	`333`	`if (!outOnDevice) {`
`334`		`- cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost);`
	`334`	`+ THCudaCheck(cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost));`
`335`	`335`	`}`
`336`	`336`
`337`	`337`	`if (freeDevOut) {`