Skip to content

Commit 06d43dc

Browse files
Hector Yuen and facebook-github-bot
authored and committed
default ice-ref to c-step (#4812)
Summary: Pull Request resolved: pytorch/glow#4812 if no compilation options are passed, default to c-step fixed the FC and batchmatmul implementations to match C-step fixed the fakelowp map calling to make sure we use the fp32 substitution of operators updated the accumulator test to make it pass with fp32 Test Plan: fakelowp tests glow/test/numerics net_runner Reviewed By: jfix71 Differential Revision: D23086534 fbshipit-source-id: 3fbb8c4055bb190becb39ce8cdff6671f8558734
1 parent fa6b34b commit 06d43dc

File tree

3 files changed

+86
-82
lines changed

3 files changed

+86
-82
lines changed

caffe2/contrib/fakelowp/fp16_gemm_utils.cc

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
#include "caffe2/core/context.h"
77
#include "caffe2/utils/math.h"
88

9+
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
10+
911
namespace caffe2 {
1012

1113
// dimA(before transpose) = M x N, dimA (after transpose) = N x M.
@@ -26,12 +28,6 @@ void custom_fp16_gemm_with_trans(
2628
float* C,
2729
const bool use_acc_fp16,
2830
const bool use_temp_accumulator) {
29-
if (!use_acc_fp16 && !use_temp_accumulator) {
30-
math::Gemm<float, CPUContext>(
31-
trans_A, trans_B, m, n, k, 1.0f, A, B, beta, C, nullptr);
32-
return;
33-
}
34-
3531
switch (trans_A) {
3632
case CblasNoTrans: {
3733
switch (trans_B) {
@@ -126,22 +122,6 @@ void custom_fp16_gemm(
126122
float* C,
127123
const bool use_acc_fp16,
128124
const bool use_temp_accumulator) {
129-
if (!use_acc_fp16 && !use_temp_accumulator) {
130-
math::Gemm<float, CPUContext>(
131-
CblasNoTrans,
132-
CblasNoTrans,
133-
m,
134-
n,
135-
k,
136-
1.0f,
137-
A_fp16,
138-
B_fp16,
139-
beta,
140-
C,
141-
nullptr);
142-
return;
143-
}
144-
145125
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_PERFORMANCE_LOG
146126
clock_t begin = clock();
147127
#endif
@@ -300,6 +280,29 @@ void custom_fp16_gemm(
300280
}
301281
}
302282
}
283+
284+
if (!use_acc_fp16) {
285+
constexpr int kSize=8;
286+
int i = 0;
287+
for (; i + kSize <= C_size; i+= kSize) {
288+
__m256 mC = _mm256_loadu_ps(C + i);
289+
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
290+
_mm256_storeu_ps(C + i, mC);
291+
}
292+
if (i < C_size){
293+
vector<float> tmp(8);
294+
for (int kk =0; kk + i < C_size; kk++) {
295+
tmp[kk] = C[i + kk];
296+
}
297+
__m256 mC = _mm256_loadu_ps(tmp.data());
298+
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
299+
_mm256_storeu_ps(tmp.data(), mC);
300+
for (int kk =0; kk + i < C_size; kk++) {
301+
C[i + kk] = tmp[kk];
302+
}
303+
}
304+
}
305+
303306
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_PERFORMANCE_LOG
304307
clock_t end = clock();
305308
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;

caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@
2121
class TestBatchMatMul(serial.SerializedTestCase):
2222
@given(
2323
# C=0, #st.integers(min_value=0, max_value=3), # number of batch dims
24-
M=st.integers(min_value=1, max_value=10),
25-
K=st.integers(min_value=1, max_value=10),
26-
N=st.integers(min_value=1, max_value=10),
24+
M=st.integers(min_value=1, max_value=50),
25+
K=st.integers(min_value=1, max_value=50),
26+
N=st.integers(min_value=1, max_value=50),
2727
rand_seed=st.integers(0, 65534),
2828
trans_a=st.booleans(),
2929
trans_b=st.booleans(),
@@ -65,7 +65,9 @@ def test_batch_matmul(self, M, K, N, rand_seed, trans_a, trans_b, run_ints):
6565
)
6666

6767
pred_net_ref = core.Net("pred_net_ref")
68-
pred_net_ref.BatchMatMulFP16Acc16Fake(
68+
69+
# Reference updated to fp16 with fp32 accumulation
70+
pred_net_ref.BatchMatMulFP16Acc32Fake(
6971
["X", "Y"], ['out'], trans_a=trans_a, trans_b=trans_b)
7072

7173
print("dims", batch_dims, X.shape, Y.shape)
@@ -98,7 +100,10 @@ def test_batch_matmul(self, M, K, N, rand_seed, trans_a, trans_b, run_ints):
98100
print_test_debug_info("bmm", {
99101
"seed": rand_seed,
100102
"m": M, "k": K,
101-
"n": N, "X": X, "Y": Y,
103+
"n": N, "X": X.shape, "Y": Y.shape,
104+
"trans_a": trans_a,
105+
"trans_b": trans_b,
106+
"run_ints": run_ints,
102107
"out_glow": out_glow,
103108
"out_c2_fakefp16": out_c2_fakefp16,
104109
"diff": diff

caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py

Lines changed: 51 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,18 @@ def test_clip(self, seed):
7676
Y_glow = workspace.FetchBlob("Y")
7777
np.testing.assert_allclose(Y_glow, np.full((m, n), 65504.0, dtype))
7878

79-
@given(seed=st.integers(0, 65534))
79+
@given(
80+
m=st.integers(4, 50),
81+
k=st.integers(4, 50),
82+
n=st.integers(4, 50),
83+
seed=st.integers(0, 65534)
84+
)
8085
@settings(deadline=None)
81-
def test_fc_exercise(self, seed):
86+
def test_fc_exercise(self, m, k, n, seed):
8287
""" Test that the matmul engine is working, this doesn't test
8388
precision
8489
"""
8590
np.random.seed(seed)
86-
m = np.random.randint(low=4, high=50)
87-
k = np.random.randint(low=4, high=50)
88-
n = np.random.randint(low=4, high=50)
8991
dtype = np.float32
9092
pred_net = caffe2_pb2.NetDef()
9193
pred_net.name = "pred"
@@ -144,13 +146,11 @@ def test_fc_exercise(self, seed):
144146
"diff": np.abs((Y_c2 - Y_glow) / Y_c2)})
145147
assert(0)
146148

147-
@given(seed=st.integers(0, 65534))
148-
@settings(deadline=None)
149-
def test_fc_numeric_cases(self, seed):
149+
@settings(deadline=None, max_examples=1)
150+
def test_fc_numeric_cases(self):
150151
""" Test numerics, use examples found from the unit test.
151152
Use Fp16FCAcc16NNPI as a reference.
152153
"""
153-
np.random.seed(seed)
154154
m = 1
155155
k = 20
156156
n = 1
@@ -172,7 +172,7 @@ def test_fc_numeric_cases(self, seed):
172172
pred_net_ref.external_output.append("Y")
173173
pred_net_ref.op.add().CopyFrom(
174174
core.CreateOperator(
175-
"Fp16FCAcc16NNPI",
175+
"Fp16FCAcc32NNPI",
176176
["X", "W0", "b0"],
177177
["Y"],
178178
)
@@ -203,11 +203,6 @@ def test_fc_numeric_cases(self, seed):
203203
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
204204
np.testing.assert_equal(num_onnxified_ops, 1)
205205

206-
X0 = np.random.rand(m, k).astype(dtype) - 0.5
207-
workspace.FeedBlob("X", X0)
208-
workspace.CreateNet(pred_net_onnxified)
209-
workspace.CreateNet(pred_net_ref)
210-
211206
X_inputs = [
212207
np.array([[
213208
-2.94921875e-01, -3.58642578e-01, -1.92871094e-01,
@@ -235,6 +230,11 @@ def test_fc_numeric_cases(self, seed):
235230
0.24389648, -0.23486328]], dtype=np.float32)
236231
]
237232

233+
# keep onnxifi happy by feeding something with a shape
234+
workspace.FeedBlob("X", X_inputs[0])
235+
workspace.CreateNet(pred_net_onnxified)
236+
workspace.CreateNet(pred_net_ref)
237+
238238
for i in range(len(X_inputs)):
239239
workspace.FeedBlob("X", X_inputs[i])
240240
# Run Glow net
@@ -263,17 +263,18 @@ def test_fc_numeric_cases(self, seed):
263263
"rowdiff": rowdiff})
264264
assert(0)
265265

266-
@settings(max_examples=5, deadline=None)
267-
@given(seed=st.integers(0, 65535))
268-
def test_fc_num0(self, seed):
266+
@settings(deadline=None)
267+
@given(
268+
m=st.integers(1, 50),
269+
k=st.integers(1, 1000),
270+
n=st.integers(1, 50),
271+
seed=st.integers(0, 65534),
272+
use_packed=st.integers(0, 2)
273+
)
274+
def test_fc_num0(self, seed, m, k, n, use_packed):
269275
""" Test numerics, fix a dimension and determine the ranges of error.
270276
Use Fp16FCAcc16 as a reference.
271277
"""
272-
np.random.seed(seed)
273-
m = np.random.randint(low=4, high=50)
274-
k = np.random.randint(low=4, high=1000)
275-
n = np.random.randint(low=4, high=50)
276-
use_packed = np.random.randint(2)
277278
W = "W_packed" if use_packed else "W0"
278279
dtype = np.float32
279280
pred_net = caffe2_pb2.NetDef()
@@ -293,7 +294,7 @@ def test_fc_num0(self, seed):
293294
pred_net_ref.external_output.append("Y")
294295
pred_net_ref.op.add().CopyFrom(
295296
core.CreateOperator(
296-
"Fp16FCAcc16NNPI",
297+
"Fp16FCAcc32NNPI",
297298
["X", W, "b0"],
298299
["Y"],
299300
)
@@ -329,37 +330,32 @@ def test_fc_num0(self, seed):
329330
workspace.CreateNet(pred_net_onnxified)
330331
workspace.CreateNet(pred_net_ref)
331332

332-
num_iterations = 10
333-
for _ in range(num_iterations):
334-
X0 = 100 * (np.random.rand(m, k) - 0.5).\
335-
astype(np.float16).astype(np.float32)
336-
workspace.FeedBlob("X", X0)
337-
# Run Glow net
338-
workspace.RunNet(pred_net_onnxified.name)
339-
Y_glow = workspace.FetchBlob('Y')
340-
# Run caffe2 net
341-
workspace.RunNet(pred_net_ref.name)
342-
Y_c2 = workspace.FetchBlob('Y')
343-
344-
diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
345-
rowdiff = np.max(diff, axis=1)
346-
347-
n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
348-
if n_offenders > 0:
349-
print_test_debug_info("fc", {
350-
"seed": seed,
351-
"iter": _,
352-
"m": m,
353-
"k": k,
354-
"n": n,
355-
"X": X0,
356-
"W0": W0,
357-
"b0": b0,
358-
"Y_glow": Y_glow,
359-
"Y_c2": Y_c2,
360-
"diff": diff,
361-
"rowdiff": rowdiff})
362-
assert(0)
333+
workspace.RunNet(pred_net_onnxified.name)
334+
Y_glow = workspace.FetchBlob('Y')
335+
336+
# Run caffe2 net
337+
workspace.RunNet(pred_net_ref.name)
338+
Y_c2 = workspace.FetchBlob('Y')
339+
340+
diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
341+
rowdiff = np.max(diff, axis=1)
342+
343+
n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
344+
if n_offenders > 0:
345+
print_test_debug_info("fc", {
346+
"seed": seed,
347+
"use_packed": use_packed,
348+
"m": m,
349+
"k": k,
350+
"n": n,
351+
"X": X0.shape,
352+
"W0": W0.shape,
353+
"b0": b0.shape,
354+
"Y_glow": Y_glow,
355+
"Y_c2": Y_c2,
356+
"diff": diff,
357+
"rowdiff": rowdiff})
358+
assert(0)
363359

364360
if __name__ == '__main__':
365361
unittest.main()

0 commit comments

Comments
 (0)