Rebase + don't allocate the max_indices tensor when not in max mode

EthanSteinberg · EthanSteinberg · commit 48b0d0734b09 · 2018-03-15T14:14:23.000-07:00
diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp
@@ -177,7 +177,7 @@ embedding_bag_cpu(const Tensor &weight, const Tensor &indices__,
     }
     make_bag_size(offsets, indices, mode, bag_size);
     auto ret = apply_bag_size(offsets, indices, mode, output, bag_size);
-    return std::tuple<Tensor, Tensor, Tensor>(ret, offset2bag, bag_size);
+    return std::tuple<Tensor, Tensor, Tensor, Tensor>(ret, offset2bag, bag_size, bag_size);
   } else { // MODE_MAX
     return AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       weight.type(), "embedding_bag_cpu_max", [&]() {
@@ -296,37 +296,19 @@ Tensor embedding_bag_backward_cpu(const Tensor &grad_, const Tensor &indices__,
           if (grad.type().scalarType() == kFloat) {
             auto igwd = index_grad_weight.data<float>();
             auto gd = grad.data<float>();
-            THFloatBlas_axpy(ddim, (float)scale, gd + ddim * source, 1,
-                             igwd + ddim * index, 1);
+            axpy<float>(ddim, (float)scale, gd + ddim * source, 1,
+                        igwd + ddim * index, 1);
           } else if (grad.type().scalarType() == kDouble) {
             auto igwd = index_grad_weight.data<double>();
             auto gd = grad.data<double>();
-            THDoubleBlas_axpy(ddim, (double)scale, gd + ddim * source, 1,
-                              igwd + ddim * index, 1);
-          } else {
-            index_grad_weight[index].add_(grad[source], scale);
+            axpy<double>(ddim, (double)scale, gd + ddim * source, 1,
+                         igwd + ddim * index, 1);
           }
         }
-<<<<<<< HEAD
-      }
-      int64_t ddim = grad.sizes()[1];
-      if (grad.type().scalarType() == kFloat) {
-        auto igwd = index_grad_weight.data<float>();
-        auto gd = grad.data<float>();
-        axpy<float>(ddim, (float)scale, gd + ddim * source, 1,
-                    igwd + ddim * index, 1);
-      } else if (grad.type().scalarType() == kDouble) {
-        auto igwd = index_grad_weight.data<double>();
-        auto gd = grad.data<double>();
-        axpy<double>(ddim, (double)scale, gd + ddim * source, 1,
-                     igwd + ddim * index, 1);
-      }
-=======
       } 
   } else if (mode == MODE_MAX) {
     for (int64_t dim = 0; dim < grad.sizes()[1]; dim++) {
       index_grad_weight.select(1, dim).index_add_(0, max_indices_.select(1, dim), grad_.select(1, dim));
->>>>>>> Add max mode support to EmbeddingBag
     }
   }
 
diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu
@@ -330,7 +330,12 @@ embedding_bag_cuda(const Tensor &weight, const Tensor &indices,
   cudaStream_t stream = globalContext().getCurrentCUDAStream();
 
   auto output = at::zeros(weight.type(), {offsets.sizes()[0], weight.sizes()[1]});
-  auto max_indices = at::zeros(indices.type(), {offsets.sizes()[0], weight.sizes()[1]});
+  
+  Tensor max_indices;
+  
+  if (mode == MODE_MAX) {
+    max_indices = at::zeros(indices.type(), {offsets.sizes()[0], weight.sizes()[1]});
+  }
 
   dim3 block = dim3(32, 8);
   int grid = 1024;
@@ -340,7 +345,7 @@ embedding_bag_cuda(const Tensor &weight, const Tensor &indices,
         indices.data<int64_t>(), offsets.data<int64_t>(),
         weight.data<cuda_scalar_t>(), output.data<cuda_scalar_t>(),
         offset2bag.data<int64_t>(), numIndices, numBags, stride, mode,
-        bag_size.data<int64_t>(), max_indices.data<int64_t>());
+        bag_size.data<int64_t>(), mode == MODE_MAX ? max_indices.data<int64_t>() : NULL);
   });
 
   THCudaCheck(cudaGetLastError());