
Commit 85541a3

Update on "[quant][graphmode][fix] cloning schema in insert_observers"
Summary: Previously we didn't clone the schema, so the default schema was used instead; this was causing issues for some models.

Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:

Differential Revision: [D22259519](https://our.internmc.facebook.com/intern/diff/D22259519)

[ghstack-poisoned]
2 parents b6d3eee + 1cbfe28 commit 85541a3
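
For context, the insert_observers pass named in the commit title runs during script-mode (graph-mode) quantization. Below is a minimal sketch of that flow; the module and qconfig choices are illustrative assumptions and are not part of this commit:

```python
import torch

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 3, 3)

    def forward(self, x):
        return self.conv(x)

# Script the model and run the prepare step; prepare_jit invokes the
# insert_observers pass referenced in the commit title.
scripted = torch.jit.script(M().eval())
qconfig_dict = {"": torch.quantization.get_default_qconfig("fbgemm")}
prepared = torch.quantization.prepare_jit(scripted, qconfig_dict)
```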

5 files changed: +46 −36 lines changed


aten/src/ATen/test/extension_backend_test.cpp

Lines changed: 5 additions & 10 deletions
```diff
@@ -29,12 +29,12 @@ Tensor add_override(const Tensor & a, const Tensor & b , Scalar c) {
   return a;
 }
 
+TORCH_LIBRARY_IMPL(aten, MSNPU, m) {
+  m.impl_UNBOXED("aten::empty.memory_format", empty_override);
+  m.impl_UNBOXED("aten::add.Tensor", add_override);
+}
+
 TEST(BackendExtensionTest, TestRegisterOp) {
-  EXPECT_ANY_THROW(empty({5, 5}, at::kMSNPU));
-  auto registry1 = torch::RegisterOperators()
-    .op(torch::RegisterOperators::options()
-      .schema("aten::empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor")
-      .impl_unboxedOnlyKernel<decltype(empty_override), &empty_override>(DispatchKey::MSNPU));
   Tensor a = empty({5, 5}, at::kMSNPU);
   ASSERT_EQ(a.device().type(), at::kMSNPU);
   ASSERT_EQ(a.device().index(), 1);
@@ -46,11 +46,6 @@ TEST(BackendExtensionTest, TestRegisterOp) {
   ASSERT_EQ(b.device().index(), 1);
   ASSERT_EQ(b.dtype(), caffe2::TypeMeta::Make<float>());
 
-  EXPECT_ANY_THROW(add(a, b));
-  auto registry2 = torch::RegisterOperators()
-    .op(torch::RegisterOperators::options()
-      .schema("aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor")
-      .impl_unboxedOnlyKernel<decltype(add_override), &add_override>(DispatchKey::MSNPU));
   add(a, b);
   ASSERT_EQ(test_int, 2);
 
```
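
The hunk above moves the test from run-time torch::RegisterOperators registration to a static TORCH_LIBRARY_IMPL block. Below is a minimal standalone sketch of that pattern for an out-of-tree backend; the kernel name is hypothetical, and the schema it targets is the one the removed lines spelled out:

```cpp
#include <torch/library.h>
#include <ATen/ATen.h>

// Hypothetical backend kernel; a real backend would launch device work here.
// Signature matches "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor".
at::Tensor msnpu_add(const at::Tensor& self, const at::Tensor& other, at::Scalar alpha) {
  return self;
}

// The kernel attaches to the MSNPU dispatch key at static-initialization time,
// which is why the test body no longer performs any explicit registration
// before calling add(a, b).
TORCH_LIBRARY_IMPL(aten, MSNPU, m) {
  // impl_UNBOXED registers a kernel reachable only via the unboxed calling path.
  m.impl_UNBOXED("aten::add.Tensor", msnpu_add);
}
```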

caffe2/python/layers/feature_sparse_to_dense.py

Lines changed: 16 additions & 7 deletions
```diff
@@ -3,14 +3,21 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 from collections import defaultdict
+
 import numpy as np
 from caffe2.python import schema
-from caffe2.python.layers.layers import ModelLayer, AccessedFeatures
+from caffe2.python.layers.layers import AccessedFeatures, ModelLayer
 
 
 class FeatureSparseToDense(ModelLayer):
     def __init__(
-        self, model, input_record, input_specs, name="feature_sparse_to_dense", **kwargs
+        self,
+        model,
+        input_record,
+        input_specs,
+        name="feature_sparse_to_dense",
+        default_dense_value=None,
+        **kwargs
     ):
         """
         `input_specs` follows the format of FeatureSpec from schema. To be more
@@ -20,6 +27,11 @@ def __init__(
         super(FeatureSparseToDense, self).__init__(model, name, input_record, **kwargs)
 
         self.input_specs = input_specs
+        model.maybe_add_global_constant(
+            "DEFAULT_FLOAT_FEATURE_VALUE", float(default_dense_value or 0.0)
+        )
+        self.default_float_value = model.global_constants["DEFAULT_FLOAT_FEATURE_VALUE"]
+        self.zero_range = model.global_constants["ZERO_RANGE"]
 
         outputs = []
         for field, feature_specs in self.input_specs:
@@ -158,8 +170,6 @@ def __init__(
             schema.attach_metadata_to_scalars(
                 self.output_schema[field], schema.Metadata(feature_specs=feature_specs)
             )
-        self.zero = model.global_constants["ZERO"]
-        self.zero_range = model.global_constants["ZERO_RANGE"]
 
     # Add operators to all types that need to be densified
     def add_ops(self, net):
@@ -170,7 +180,7 @@ def add_ops(self, net):
                 [
                     record[field].keys(),
                     record[field].values(),
-                    self.zero,
+                    self.default_float_value,
                     record[field].lengths(),
                 ],
                 [self.output_schema[field]()],
@@ -304,8 +314,7 @@ def get_accessed_features(self):
         for field, feature_specs in self.input_specs:
             accessed_features[field].append(
                 AccessedFeatures(
-                    feature_specs.feature_type,
-                    set(feature_specs.feature_ids)
+                    feature_specs.feature_type, set(feature_specs.feature_ids)
                 )
             )
 
```
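
A usage sketch for the new default_dense_value argument; the model, input_record, and input_specs objects below are placeholders standing in for whatever an existing call site already has:

```python
from caffe2.python.layers.feature_sparse_to_dense import FeatureSparseToDense

# Densify missing float features to -1.0 instead of the previous hard-coded 0.0.
layer = FeatureSparseToDense(
    model,
    input_record,
    input_specs,
    default_dense_value=-1.0,
)
# Leaving default_dense_value=None keeps the old behavior: float(None or 0.0)
# evaluates to 0.0, so missing values are still filled with zero.
```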

caffe2/sgd/adagrad_fused_op_gpu.cuh

Lines changed: 20 additions & 5 deletions
```diff
@@ -16,6 +16,21 @@
 
 namespace caffe2 {
 
+template <typename srcType, typename dstType>
+inline __device__ dstType convertPrecisionToPrecision(srcType param) {
+  return param;
+}
+
+template <>
+inline __device__ float convertPrecisionToPrecision<at::Half, float>(at::Half param) {
+  return __half2float(param);
+}
+
+template <>
+inline __device__ at::Half convertPrecisionToPrecision<float, at::Half>(float param) {
+  return __float2half(param);
+}
+
 
 static inline __device__ void gpuAtomicAdd(float* address, float val) {
   atomicAdd(address, val);
@@ -89,7 +104,7 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_kernel(
 
       // post == blockDim.x
       const size_t paramIdx = index * post + threadIdx.x; // index for param
-      const float x_ij = grad[gradIdx] + weight_decay * param[paramIdx];
+      const float x_ij = grad[gradIdx] + weight_decay * convertPrecisionToPrecision<TParam, float>(param[paramIdx]);
       sum_squares += x_ij * x_ij;
 
       // Return the warp-wide sums to each lane0 (threads 0, 32, 64, 96, ...)
@@ -106,7 +121,7 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_kernel(
 
       // update param
       float step = LR / (sqrtf(param_mom[index]) + epsilon);
-      param[paramIdx] = param[paramIdx] + x_ij * step;
+      param[paramIdx] = convertPrecisionToPrecision<float, TParam>(convertPrecisionToPrecision<TParam, float>(param[paramIdx]) + x_ij * step);
     }
   } else {
     // TODO: Tuning NumThreads for sum_squares
@@ -123,7 +138,7 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_kernel(
     for (int i = threadIdx.x; i < post; i += blockDim.x) {
       // i: index in the embedding dimension
       const float x_ij =
-          grad[group * post + i] + weight_decay * param[index * post + i];
+          grad[group * post + i] + weight_decay * convertPrecisionToPrecision<TParam, float>(param[index * post + i]);
       sum_squares += x_ij * x_ij;
     }
     float reduce_result = BlockReduce(temp_storage).Sum(sum_squares, valid);
@@ -140,10 +155,10 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_kernel(
     for (int i = threadIdx.x; i < post; i += blockDim.x) {
       size_t paramIdx = index * post + i; // index for param
       float x_ij = grad[group * post + i] + weight_decay * param[paramIdx];
-      float param_new = param[paramIdx] + x_ij * step;
+      float param_new = convertPrecisionToPrecision<TParam, float>(param[paramIdx]) + x_ij * step;
       // float param_new1 = param[paramIdx];
       // printf("step %f, x_ij %f", step, x_ij);
-      param[paramIdx] = param_new;
+      param[paramIdx] = convertPrecisionToPrecision<float, TParam>(param_new);
     }
   }
 }
```
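
The recurring pattern in this hunk is: widen the (possibly half-precision) parameter to float, do the Adagrad math in fp32, then narrow the result back on the store. Below is a standalone sketch of that round trip using a hypothetical kernel, where TParam would be at::Half or float as in the fused op:

```cuda
// Hypothetical kernel illustrating the widen-compute-narrow round trip that
// convertPrecisionToPrecision enables; it is not part of the diff above.
template <typename TParam>
__global__ void scaled_update_kernel(TParam* param, const float* grad, float step, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Load the stored parameter and widen it to float (a no-op when TParam is float).
    float p = convertPrecisionToPrecision<TParam, float>(param[i]);
    // All arithmetic happens in fp32 regardless of the storage type.
    p += grad[i] * step;
    // Narrow back to the storage type on the write.
    param[i] = convertPrecisionToPrecision<float, TParam>(p);
  }
}
```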

test/cpp_extensions/rng_extension.cpp

Lines changed: 5 additions & 13 deletions
```diff
@@ -1,9 +1,9 @@
 #include <torch/extension.h>
+#include <torch/library.h>
 #include <ATen/Generator.h>
 #include <ATen/Tensor.h>
 #include <ATen/native/DistributionTemplates.h>
 #include <ATen/native/cpu/DistributionTemplates.h>
-#include <ATen/core/op_registration/op_registration.h>
 #include <memory>
 
 using namespace at;
@@ -53,21 +53,13 @@ size_t getInstanceCount() {
   return instance_count;
 }
 
-void registerOps() {
-  static auto registry = torch::RegisterOperators()
-    .op(torch::RegisterOperators::options()
-      .schema("aten::random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)")
-      .impl_unboxedOnlyKernel<decltype(random_from_to), &random_from_to>(DispatchKey::CustomRNGKeyId))
-    .op(torch::RegisterOperators::options()
-      .schema("aten::random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)")
-      .impl_unboxedOnlyKernel<decltype(random_to), &random_to>(DispatchKey::CustomRNGKeyId))
-    .op(torch::RegisterOperators::options()
-      .schema("aten::random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)")
-      .impl_unboxedOnlyKernel<decltype(random_), &random_>(DispatchKey::CustomRNGKeyId));
+TORCH_LIBRARY_IMPL(aten, CustomRNGKeyId, m) {
+  m.impl_UNBOXED("aten::random_.from", random_from_to);
+  m.impl_UNBOXED("aten::random_.to", random_to);
+  m.impl_UNBOXED("aten::random_", random_);
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("registerOps", &registerOps);
   m.def("createTestCPUGenerator", &createTestCPUGenerator);
   m.def("getInstanceCount", &getInstanceCount);
   m.def("identity", &identity);
```

test/test_cpp_extensions_aot.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -141,7 +141,6 @@ class TestRNGExtension(common.TestCase):
 
     def setUp(self):
        super(TestRNGExtension, self).setUp()
-        rng_extension.registerOps()
 
     def test_rng(self):
        fourty_two = torch.full((10,), 42, dtype=torch.int64)
```
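
With the rng_extension change above, the three aten::random_ overloads are registered by TORCH_LIBRARY_IMPL as soon as the extension's shared library is loaded, so setUp no longer needs to call rng_extension.registerOps(). A sketch of how the test exercises the custom kernels afterwards, mirroring the existing test_rng flow; the import name and exact calls here are assumptions, not lines from this diff:

```python
import torch
import rng_extension  # assumed import name for the built test extension

fourty_two = torch.full((10,), 42, dtype=torch.int64)
gen = rng_extension.createTestCPUGenerator(42)  # exposed by the PYBIND11_MODULE above
t = torch.empty(10, dtype=torch.int64).random_(generator=gen)  # dispatches to the CustomRNGKeyId kernel
assert torch.all(t == fourty_two)
```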
