
Commit 85541a3

Update on "[quant][graphmode][fix] cloning schema in insert_observers"
Summary: Previously we didn't clone the schema, so the default schema was used instead; this was causing issues for some models.

Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:

Differential Revision: [D22259519](https://our.internmc.facebook.com/intern/diff/D22259519)

[ghstack-poisoned]
2 parents b6d3eee + 1cbfe28 commit 85541a3
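
For context, the insert_observers pass named in the commit title runs during script-mode (graph-mode) quantization. Below is a minimal sketch of that flow; the module and qconfig choices are illustrative assumptions and are not part of this commit:

```python
import torch

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 3, 3)

    def forward(self, x):
        return self.conv(x)

# Script the model and run the prepare step; prepare_jit invokes the
# insert_observers pass referenced in the commit title.
scripted = torch.jit.script(M().eval())
qconfig_dict = {"": torch.quantization.get_default_qconfig("fbgemm")}
prepared = torch.quantization.prepare_jit(scripted, qconfig_dict)
```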

5 files changed: +46 −36 lines changed


aten/src/ATen/test/extension_backend_test.cpp

Lines changed: 5 additions & 10 deletions
```diff
@@ -29,12 +29,12 @@ Tensor add_override(const Tensor & a, const Tensor & b , Scalar c) {
   return a;
 }
 
+TORCH_LIBRARY_IMPL(aten, MSNPU, m) {
+  m.impl_UNBOXED("aten::empty.memory_format", empty_override);
+  m.impl_UNBOXED("aten::add.Tensor", add_override);
+}
+
 TEST(BackendExtensionTest, TestRegisterOp) {
-  EXPECT_ANY_THROW(empty({5, 5}, at::kMSNPU));
-  auto registry1 = torch::RegisterOperators()
-    .op(torch::RegisterOperators::options()
-      .schema("aten::empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor")
-      .impl_unboxedOnlyKernel<decltype(empty_override), &empty_override>(DispatchKey::MSNPU));
   Tensor a = empty({5, 5}, at::kMSNPU);
   ASSERT_EQ(a.device().type(), at::kMSNPU);
   ASSERT_EQ(a.device().index(), 1);
@@ -46,11 +46,6 @@ TEST(BackendExtensionTest, TestRegisterOp) {
   ASSERT_EQ(b.device().index(), 1);
   ASSERT_EQ(b.dtype(), caffe2::TypeMeta::Make<float>());
 
-  EXPECT_ANY_THROW(add(a, b));
-  auto registry2 = torch::RegisterOperators()
-    .op(torch::RegisterOperators::options()
-      .schema("aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor")
-      .impl_unboxedOnlyKernel<decltype(add_override), &add_override>(DispatchKey::MSNPU));
   add(a, b);
   ASSERT_EQ(test_int, 2);
 
```
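
The hunk above moves the test from run-time torch::RegisterOperators registration to a static TORCH_LIBRARY_IMPL block. Below is a minimal standalone sketch of that pattern for an out-of-tree backend; the kernel name is hypothetical, and the schema it targets is the one the removed lines spelled out:

```cpp
#include <torch/library.h>
#include <ATen/ATen.h>

// Hypothetical backend kernel; a real backend would launch device work here.
// Signature matches "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor".
at::Tensor msnpu_add(const at::Tensor& self, const at::Tensor& other, at::Scalar alpha) {
  return self;
}

// The kernel attaches to the MSNPU dispatch key at static-initialization time,
// which is why the test body no longer performs any explicit registration
// before calling add(a, b).
TORCH_LIBRARY_IMPL(aten, MSNPU, m) {
  // impl_UNBOXED registers a kernel reachable only via the unboxed calling path.
  m.impl_UNBOXED("aten::add.Tensor", msnpu_add);
}
```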

caffe2/python/layers/feature_sparse_to_dense.py

Lines changed: 16 additions & 7 deletions
```diff
@@ -3,14 +3,21 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 from collections import defaultdict
+
 import numpy as np
 from caffe2.python import schema
-from caffe2.python.layers.layers import ModelLayer, AccessedFeatures
+from caffe2.python.layers.layers import AccessedFeatures, ModelLayer
 
 
 class FeatureSparseToDense(ModelLayer):
     def __init__(
-        self, model, input_record, input_specs, name="feature_sparse_to_dense", **kwargs
+        self,
+        model,
+        input_record,
+        input_specs,
+        name="feature_sparse_to_dense",
+        default_dense_value=None,
+        **kwargs
     ):
         """
         `input_specs` follows the format of FeatureSpec from schema. To be more
@@ -20,6 +27,11 @@ def __init__(
         super(FeatureSparseToDense, self).__init__(model, name, input_record, **kwargs)
 
         self.input_specs = input_specs
+        model.maybe_add_global_constant(
+            "DEFAULT_FLOAT_FEATURE_VALUE", float(default_dense_value or 0.0)
+        )
+        self.default_float_value = model.global_constants["DEFAULT_FLOAT_FEATURE_VALUE"]
+        self.zero_range = model.global_constants["ZERO_RANGE"]
 
         outputs = []
         for field, feature_specs in self.input_specs:
@@ -158,8 +170,6 @@ def __init__(
             schema.attach_metadata_to_scalars(
                 self.output_schema[field], schema.Metadata(feature_specs=feature_specs)
             )
-        self.zero = model.global_constants["ZERO"]
-        self.zero_range = model.global_constants["ZERO_RANGE"]
 
     # Add operators to all types that need to be densified
     def add_ops(self, net):
@@ -170,7 +180,7 @@ def add_ops(self, net):
                 [
                     record[field].keys(),
                     record[field].values(),
-                    self.zero,
+                    self.default_float_value,
                     record[field].lengths(),
                 ],
                 [self.output_schema[field]()],
@@ -304,8 +314,7 @@ def get_accessed_features(self):
         for field, feature_specs in self.input_specs:
             accessed_features[field].append(
                 AccessedFeatures(
-                    feature_specs.feature_type,
-                    set(feature_specs.feature_ids)
+                    feature_specs.feature_type, set(feature_specs.feature_ids)
                 )
             )
 
```
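
A usage sketch for the new default_dense_value argument; the model, input_record, and input_specs objects below are placeholders standing in for whatever an existing call site already has:

```python
from caffe2.python.layers.feature_sparse_to_dense import FeatureSparseToDense

# Densify missing float features to -1.0 instead of the previous hard-coded 0.0.
layer = FeatureSparseToDense(
    model,
    input_record,
    input_specs,
    default_dense_value=-1.0,
)
# Leaving default_dense_value=None keeps the old behavior: float(None or 0.0)
# evaluates to 0.0, so missing values are still filled with zero.
```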

caffe2/sgd/adagrad_fused_op_gpu.cuh

Lines changed: 20 additions & 5 deletions
```diff
@@ -16,6 +16,21 @@
 
 namespace caffe2 {
 
+template <typename srcType, typename dstType>
+inline __device__ dstType convertPrecisionToPrecision(srcType param) {
+  return param;
+}
+
+template <>
+inline __device__ float convertPrecisionToPrecision<at::Half, float>(at::Half param) {
+  return __half2float(param);
+}
+
+template <>
+inline __device__ at::Half convertPrecisionToPrecision<float, at::Half>(float param) {
+  return __float2half(param);
+}
+
 
 static inline __device__ void gpuAtomicAdd(float* address, float val) {
   atomicAdd(address, val);
@@ -89,7 +104,7 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_kernel(
 
       // post == blockDim.x
       const size_t paramIdx = index * post + threadIdx.x; // index for param
-      const float x_ij = grad[gradIdx] + weight_decay * param[paramIdx];
+      const float x_ij = grad[gradIdx] + weight_decay * convertPrecisionToPrecision<TParam, float>(param[paramIdx]);
       sum_squares += x_ij * x_ij;
 
       // Return the warp-wide sums to each lane0 (threads 0, 32, 64, 96, ...)
@@ -106,7 +121,7 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_kernel(
 
       // update param
       float step = LR / (sqrtf(param_mom[index]) + epsilon);
-      param[paramIdx] = param[paramIdx] + x_ij * step;
+      param[paramIdx] = convertPrecisionToPrecision<float, TParam>(convertPrecisionToPrecision<TParam, float>(param[paramIdx]) + x_ij * step);
     }
   } else {
     // TODO: Tuning NumThreads for sum_squares
@@ -123,7 +138,7 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_kernel(
     for (int i = threadIdx.x; i < post; i += blockDim.x) {
       // i: index in the embedding dimension
       const float x_ij =
-          grad[group * post + i] + weight_decay * param[index * post + i];
+          grad[group * post + i] + weight_decay * convertPrecisionToPrecision<TParam, float>(param[index * post + i]);
       sum_squares += x_ij * x_ij;
     }
     float reduce_result = BlockReduce(temp_storage).Sum(sum_squares, valid);
@@ -140,10 +155,10 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_kernel(
     for (int i = threadIdx.x; i < post; i += blockDim.x) {
       size_t paramIdx = index * post + i; // index for param
       float x_ij = grad[group * post + i] + weight_decay * param[paramIdx];
-      float param_new = param[paramIdx] + x_ij * step;
+      float param_new = convertPrecisionToPrecision<TParam, float>(param[paramIdx]) + x_ij * step;
       // float param_new1 = param[paramIdx];
       // printf("step %f, x_ij %f", step, x_ij);
-      param[paramIdx] = param_new;
+      param[paramIdx] = convertPrecisionToPrecision<float, TParam>(param_new);
     }
   }
 }
```
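
The recurring pattern in this hunk is: widen the (possibly half-precision) parameter to float, do the Adagrad math in fp32, then narrow the result back on the store. Below is a standalone sketch of that round trip using a hypothetical kernel, where TParam would be at::Half or float as in the fused op:

```cuda
// Hypothetical kernel illustrating the widen-compute-narrow round trip that
// convertPrecisionToPrecision enables; it is not part of the diff above.
template <typename TParam>
__global__ void scaled_update_kernel(TParam* param, const float* grad, float step, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Load the stored parameter and widen it to float (a no-op when TParam is float).
    float p = convertPrecisionToPrecision<TParam, float>(param[i]);
    // All arithmetic happens in fp32 regardless of the storage type.
    p += grad[i] * step;
    // Narrow back to the storage type on the write.
    param[i] = convertPrecisionToPrecision<float, TParam>(p);
  }
}
```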

test/cpp_extensions/rng_extension.cpp

Lines changed: 5 additions & 13 deletions
```diff
@@ -1,9 +1,9 @@
 #include <torch/extension.h>
+#include <torch/library.h>
 #include <ATen/Generator.h>
 #include <ATen/Tensor.h>
 #include <ATen/native/DistributionTemplates.h>
 #include <ATen/native/cpu/DistributionTemplates.h>
-#include <ATen/core/op_registration/op_registration.h>
 #include <memory>
 
 using namespace at;
@@ -53,21 +53,13 @@ size_t getInstanceCount() {
   return instance_count;
 }
 
-void registerOps() {
-  static auto registry = torch::RegisterOperators()
-    .op(torch::RegisterOperators::options()
-      .schema("aten::random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)")
-      .impl_unboxedOnlyKernel<decltype(random_from_to), &random_from_to>(DispatchKey::CustomRNGKeyId))
-    .op(torch::RegisterOperators::options()
-      .schema("aten::random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)")
-      .impl_unboxedOnlyKernel<decltype(random_to), &random_to>(DispatchKey::CustomRNGKeyId))
-    .op(torch::RegisterOperators::options()
-      .schema("aten::random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)")
-      .impl_unboxedOnlyKernel<decltype(random_), &random_>(DispatchKey::CustomRNGKeyId));
+TORCH_LIBRARY_IMPL(aten, CustomRNGKeyId, m) {
+  m.impl_UNBOXED("aten::random_.from", random_from_to);
+  m.impl_UNBOXED("aten::random_.to", random_to);
+  m.impl_UNBOXED("aten::random_", random_);
 }
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("registerOps", &registerOps);
   m.def("createTestCPUGenerator", &createTestCPUGenerator);
   m.def("getInstanceCount", &getInstanceCount);
   m.def("identity", &identity);
```

test/test_cpp_extensions_aot.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -141,7 +141,6 @@ class TestRNGExtension(common.TestCase):
 
     def setUp(self):
        super(TestRNGExtension, self).setUp()
-        rng_extension.registerOps()
 
     def test_rng(self):
        fourty_two = torch.full((10,), 42, dtype=torch.int64)
```
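
With the rng_extension change above, the three aten::random_ overloads are registered by TORCH_LIBRARY_IMPL as soon as the extension's shared library is loaded, so setUp no longer needs to call rng_extension.registerOps(). A sketch of how the test exercises the custom kernels afterwards, mirroring the existing test_rng flow; the import name and exact calls here are assumptions, not lines from this diff:

```python
import torch
import rng_extension  # assumed import name for the built test extension

fourty_two = torch.full((10,), 42, dtype=torch.int64)
gen = rng_extension.createTestCPUGenerator(42)  # exposed by the PYBIND11_MODULE above
t = torch.empty(10, dtype=torch.int64).random_(generator=gen)  # dispatches to the CustomRNGKeyId kernel
assert torch.all(t == fourty_two)
```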
