
Commit c543034

mingzhe09088 authored and facebook-github-bot committed
add cuda sync when ops running on gpu (#29936)
Summary: Pull Request resolved: #29936

This diff adds synchronization after op execution to ensure all the CUDA streams complete.

Test Plan:

```
buck run mode/opt //caffe2/benchmarks/operator_benchmark:benchmark_all_test -- --iterations 1

# ----------------------------------------
# PyTorch/Caffe2 Operator Micro-benchmarks
# ----------------------------------------
# Tag : short

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M64_N64_K64_cpu
# Input: M: 64, N: 64, K: 64, device: cpu
Forward Execution Time (us) : 154.412

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M64_N64_K64_cuda
# Input: M: 64, N: 64, K: 64, device: cuda
Forward Execution Time (us) : 101.115
...
```

Reviewed By: hl475

Differential Revision: D18542732

fbshipit-source-id: b979d26a174f488e971074dc1e16b00e17179c80
1 parent f1860ae commit c543034
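For context (not part of the commit): CUDA kernels are launched asynchronously, so a host-side timer can stop before the op has actually finished on the GPU. The minimal sketch below, with a hypothetical `time_matmul` helper, illustrates why a synchronize call is needed before reading the timer when benchmarking GPU ops.

```python
import time

import torch

def time_matmul(device, sync):
    # Hypothetical helper, for illustration only (not from the benchmark suite).
    a = torch.randn(1024, 1024, device=device)
    b = torch.randn(1024, 1024, device=device)
    start = time.time()
    c = torch.matmul(a, b)  # on CUDA this returns as soon as the kernel is queued
    if sync:
        # Block until all streams on the current device have finished.
        torch.cuda.synchronize(torch.cuda.current_device())
    return (time.time() - start) * 1e3  # milliseconds

if torch.cuda.is_available():
    print(time_matmul("cuda", sync=False))  # mostly launch overhead, misleadingly small
    print(time_matmul("cuda", sync=True))   # includes actual kernel execution time
```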

File tree: 2 files changed (+9, −4 lines)


benchmarks/operator_benchmark/benchmark_core.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -249,17 +249,18 @@ def _iteration_result_is_significant(self, iters, run_time_sec, curr_test_total_
     def _launch_forward(self, test_case, iters, print_per_iter):
         """ Use Python's timeit module to measure execution time (unit: second).
         """
+        cuda_sync = True if 'cuda' in test_case.test_config.test_name else False
         func = test_case.run_forward
         if self.use_jit:
             func = test_case.run_jit_forward
-        forward_time = timeit.timeit(functools.partial(func, iters, print_per_iter), number=1)
+        forward_time = timeit.timeit(functools.partial(func, iters, print_per_iter, cuda_sync), number=1)
         return forward_time
 
     def _launch_backward(self, test_case, iters, print_per_iter=False):
         """ This function runs forward path of an op to get an output. Then the backward path is executed
         and the execution time is reported
         """
-        test_case.run_forward(num_runs=1, print_per_iter=False)
+        test_case.run_forward(num_runs=1, print_per_iter=False, cuda_sync=False)
         if test_case.framework == "PyTorch":
             test_case._output_mean()
         backward_time = timeit.timeit(functools.partial(test_case.run_backward, iters,
```
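A small sketch of the pattern used above (names are illustrative, not from the benchmark): `timeit.timeit` expects a zero-argument callable, so the new `cuda_sync` flag is bound as an extra positional argument with `functools.partial` before the call is timed.

```python
import functools
import timeit

def run_forward(num_runs, print_per_iter, cuda_sync):
    # Stand-in for the benchmark's forward runner.
    for _ in range(num_runs):
        pass  # the op under test would run here

# The flag is derived from the test name and forwarded positionally.
cuda_sync = 'cuda' in "add_M64_N64_K64_cuda"
forward_time = timeit.timeit(
    functools.partial(run_forward, 100, False, cuda_sync), number=1)
print("Forward Execution Time (s):", forward_time)
```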

benchmarks/operator_benchmark/benchmark_pytorch.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -127,7 +127,7 @@ def __init__(self, op_bench, test_config):
         self.framework = "PyTorch"
         self.time_series = []
 
-    def run_jit_forward(self, num_runs, print_per_iter=False):
+    def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
         """ Run the forward path of an op with JIT mode
         """
         if self.op_bench._jit_forward is None:
@@ -147,18 +147,22 @@ def _print_per_iter(self):
                 }
             ))
 
-    def run_forward(self, num_runs, print_per_iter):
+    def run_forward(self, num_runs, print_per_iter, cuda_sync):
         """ Run the forward path of an op with eager mode
         """
         if print_per_iter:
             for _ in range(num_runs):
                 start_time = time.time()
                 self.output = self.op_bench.forward()
+                if cuda_sync:
+                    torch.cuda.synchronize(torch.cuda.current_device())
                 end_time = time.time()
                 self.time_series.append((end_time - start_time) * 1e3)
         else:
             for _ in range(num_runs):
                 self.output = self.op_bench.forward()
+            if cuda_sync:
+                torch.cuda.synchronize(torch.cuda.current_device())
 
     def _output_mean(self):
         """ TODO (mingzhe): it is not necessary to sum up everything by myself,
```
