Perform weight re-init for embedding table in sparse_lookup.py (#22348)

Alyssa Wang · facebook-github-bot · commit d9e15bccb0a1 · 2019-07-03T10:33:40.000-07:00
Summary: Pull Request resolved: #22348 This is the last step of LRU hash eviction weight re-init. This diff checks whether there are evicted values in sparse_lookup; if so, it calls the op created in D15709866 to re-init the values for the indices in evicted_values. It also creates a gradient op for the operator. The gradient op just passes the output gradient through as the input gradient. Reviewed By: itomatik Differential Revision: D16044736 fbshipit-source-id: 9afb85209b0de1038c5153bcb7dfc5f52e0b2abb
diff --git a/caffe2/operators/copy_rows_to_tensor_op.cc b/caffe2/operators/copy_rows_to_tensor_op.cc
@@ -2,7 +2,11 @@
 
 namespace caffe2 {
 namespace {
+
 REGISTER_CPU_OPERATOR(CopyRowsToTensor, CopyRowsToTensorOp<CPUContext>);
+REGISTER_CPU_GRADIENT_OPERATOR(
+    CopyRowsToTensorGradient,
+    CopyRowsToTensorGradientOp<CPUContext>);
 
 OPERATOR_SCHEMA(CopyRowsToTensor)
     .NumInputs(3)
@@ -30,5 +34,36 @@ OPERATOR_SCHEMA(CopyRowsToTensor)
       return out;
     });
 
+// Schema for the gradient op of CopyRowsToTensor: exactly one input and one
+// output, and output 0 is allowed to run in place on input 0.
+GRADIENT_OPERATOR_SCHEMA(CopyRowsToTensorGradient)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .AllowInplace({{0, 0}});
+
+// Gradient maker for CopyRowsToTensor. It emits CopyRowsToTensorGradient ops
+// that forward the gradient of output 0 to the gradient of input 0.
+class GetCopyRowsToTensorGradient : public GradientMakerBase {
+  using GradientMakerBase::GradientMakerBase;
+  vector<OperatorDef> GetGradientDefs() override {
+    if (g_output_[0].IsDense()) {
+      // Dense output gradient: a single op mapping GO(0) -> GI(0).
+      return SingleGradientDef(
+          "CopyRowsToTensorGradient",
+          "",
+          vector<string>{GO(0)},
+          vector<string>{GI(0)});
+    } else {
+      // Non-dense output gradient: forward the indices blob and the values
+      // blob with two separate ops (GO_I -> GI_I and GO_V -> GI_V).
+      return vector<OperatorDef>{CreateOperatorDef(
+                                     "CopyRowsToTensorGradient",
+                                     "",
+                                     std::vector<string>{GO_I(0)},
+                                     std::vector<string>{GI_I(0)}),
+                                 CreateOperatorDef(
+                                     "CopyRowsToTensorGradient",
+                                     "",
+                                     std::vector<string>{GO_V(0)},
+                                     std::vector<string>{GI_V(0)})};
+    }
+  }
+};
+
+REGISTER_GRADIENT(CopyRowsToTensor, GetCopyRowsToTensorGradient);
+
 } // namespace
 } // namespace caffe2
diff --git a/caffe2/operators/copy_rows_to_tensor_op.h b/caffe2/operators/copy_rows_to_tensor_op.h
@@ -53,4 +53,30 @@ class CopyRowsToTensorOp : public Operator<Context> {
  protected:
   INPUT_TAGS(INPUT_TENSOR, INDICES, ROW);
 };
+
+// Gradient operator for CopyRowsToTensor: copies its single input to its
+// single output unchanged, so the output gradient is passed through as the
+// input gradient.
+template <class Context>
+class CopyRowsToTensorGradientOp : public Operator<Context> {
+ public:
+  USE_OPERATOR_CONTEXT_FUNCTIONS;
+  CopyRowsToTensorGradientOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<Context>(operator_def, ws) {}
+
+  // Dispatches to DoRunWithType<T> based on the element type of Input(0).
+  bool RunOnDevice() override {
+    return DispatchHelper<
+        TensorTypes<at::Half, float, double, int32_t, int64_t>>::
+        call(this, Input(0));
+  }
+
+  // Resizes Output(0) to the shape of Input(0) and copies every element.
+  // Always returns true.
+  template <typename T>
+  bool DoRunWithType() {
+    auto& input = Input(0);
+    auto* output = Output(0);
+    output->ResizeLike(input);
+    auto* output_data = output->template mutable_data<T>();
+    const auto* input_data = input.template data<T>();
+    // Copy the whole tensor: numel() counts all elements, whereas size(0) is
+    // only the extent of the first dimension and would truncate the copy for
+    // any tensor of rank > 1.
+    // The op may run in place, and std::memcpy must not be given overlapping
+    // ranges, so skip the copy when input and output share a buffer.
+    if (output_data != input_data && input.numel() > 0) {
+      std::memcpy(output_data, input_data, input.numel() * sizeof(T));
+    }
+
+    return true;
+  }
+};
+
 } // namespace caffe2
diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py
@@ -125,6 +125,12 @@ def __init__(self, model, input_record, inner_shape, reducer,
 
         self.weight_init = weight_init or default_init_op
 
+        self.evicted_values = None
+        if schema.equal_schemas(self.input_record, IdListWithEvicted) or \
+            schema.equal_schemas(self.input_record, IdScoreListWithEvicted,
+                                 check_field_types=False):
+            self.evicted_values = self.input_record._evicted_values
+
         # If fp16 is used, make sure fp16 init op is used
         if self.trainer_version == "fp16":
             assert self.reducer in self._fp16_compatible_reducers, (
@@ -169,6 +175,14 @@ def __init__(self, model, input_record, inner_shape, reducer,
                 average_length=avg_length),
             regularizer=regularizer
         )
+        if self.evicted_values:
+            self.reinit_vec = self.create_param(
+                param_name="reinit_vec",
+                shape=inner_shape,
+                initializer=self.weight_init,
+                optimizer=model.NoOptim,
+                regularizer=None,
+            )
 
         self.scale_bias_init = ('ConstantFill', {'value': 0.0})
 
@@ -407,6 +421,9 @@ def _add_ops_id_score_list(self, net, version):
                 "Trying to create with {}".format(self.reducer)
 
     def _add_ops(self, net, version='fp32'):
+        if self.evicted_values:
+            net.CopyRowsToTensor(
+                [self.w, self.evicted_values.get(), self.reinit_vec], [self.w])
         if _is_id_list(self.input_record):
             self._add_ops_id_list(net, version=version)
         elif _is_id_score_list(self.input_record):
diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py
@@ -31,7 +31,6 @@
     is_request_only_scalar,
     get_key,
 )
-
 import logging
 logger = logging.getLogger(__name__)
 
@@ -231,6 +230,46 @@ def testFCwithAxis2(self):
 
         train_init_net, train_net = self.get_training_nets()
 
+    def testSparseLookupSumPoolingWithEviction(self):
+        # Create test embedding table of 1 row
+        record = schema.NewRecord(self.model.net, schema.Struct(
+            ('sparse', schema.Struct(
+                ('sparse_feature_0', schema.ListWithEvicted(
+                    schema.Scalar(np.int64,
+                                  metadata=schema.Metadata(categorical_limit=1)),)),)),
+        ))
+        embedding_dim = 8
+        lengths_blob = record.sparse.sparse_feature_0.lengths.get()
+        values_blob = record.sparse.sparse_feature_0.items.get()
+        evicted_values_blob = record.sparse.sparse_feature_0._evicted_values.get()
+        lengths = np.array([1]).astype(np.int32)
+        values = np.array([0]).astype(np.int64)
+        # Need to reset row 0
+        evicted_values = np.array([0]).astype(np.int64)
+        workspace.FeedBlob(lengths_blob, lengths)
+        workspace.FeedBlob(values_blob, values)
+        workspace.FeedBlob(evicted_values_blob, evicted_values)
+
+        embedding_after_pooling = self.model.SparseLookup(
+            record.sparse.sparse_feature_0, [embedding_dim], 'Sum', weight_init=("ConstantFill", {"value": 1.0}))
+
+        self.model.output_schema = schema.Struct()
+        self.assertEqual(
+            schema.Scalar((np.float32, (embedding_dim, ))),
+            embedding_after_pooling
+        )
+        train_init_net, train_net = self.get_training_nets()
+        workspace.RunNetOnce(train_init_net)
+        embedding_after_init = workspace.FetchBlob("sparse_lookup/w")
+        # Change row 0's value before reset
+        new_values = np.array([[2, 2, 2, 2, 2, 2, 2, 2]]).astype(np.float32)
+        workspace.FeedBlob("sparse_lookup/w", new_values)
+        workspace.RunNetOnce(train_net.Proto())
+        embedding_after_training = workspace.FetchBlob("sparse_lookup/w")
+        # Verify row 0's value does not change after reset
+        self.assertEquals(embedding_after_training.all(), embedding_after_init.all())
+
+
 
     def testSparseLookupSumPooling(self):
         record = schema.NewRecord(self.model.net, schema.Struct(
diff --git a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py b/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py
@@ -15,8 +15,6 @@ def get_input_tensors():
     height = np.random.randint(1, 10)
     width = np.random.randint(1, 10)
     dtype = np.float32
-    print("height", height)
-    print("width", width)
     input_tensor = hu.arrays(
         dims=[height, width],
         dtype=dtype,
@@ -43,12 +41,12 @@ def ref(input_tensor, indices, row):
             for idx in indices:
                 input_tensor[idx] = row
             return [input_tensor]
-
+        op = core.CreateOperator(
+            "CopyRowsToTensor", ["input_tensor", "indices", "row"], ["input_tensor"]
+        )
         self.assertReferenceChecks(
             device_option=gc,
-            op=core.CreateOperator(
-                "CopyRowsToTensor", ["input_tensor", "indices", "row"], ["input_tensor"]
-            ),
+            op=op,
             inputs=[input_tensor, indices, row],
             reference=ref,
         )