
Commit c543034

mingzhe09088 authored and facebook-github-bot committed
add cuda sync when ops running on gpu (#29936)
Summary: Pull Request resolved: #29936

This diff adds synchronization after op execution to ensure all the CUDA streams complete.

Test Plan:

```
buck run mode/opt //caffe2/benchmarks/operator_benchmark:benchmark_all_test -- --iterations 1

# ----------------------------------------
# PyTorch/Caffe2 Operator Micro-benchmarks
# ----------------------------------------
# Tag : short

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M64_N64_K64_cpu
# Input: M: 64, N: 64, K: 64, device: cpu
Forward Execution Time (us) : 154.412

# Benchmarking PyTorch: add
# Mode: Eager
# Name: add_M64_N64_K64_cuda
# Input: M: 64, N: 64, K: 64, device: cuda
Forward Execution Time (us) : 101.115
...
```

Reviewed By: hl475

Differential Revision: D18542732

fbshipit-source-id: b979d26a174f488e971074dc1e16b00e17179c80
1 parent f1860ae commit c543034
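For context (not part of the commit): CUDA kernels are launched asynchronously, so a host-side timer can stop before the op has actually finished on the GPU. The minimal sketch below, with a hypothetical `time_matmul` helper, illustrates why a synchronize call is needed before reading the timer when benchmarking GPU ops.

```python
import time

import torch

def time_matmul(device, sync):
    # Hypothetical helper, for illustration only (not from the benchmark suite).
    a = torch.randn(1024, 1024, device=device)
    b = torch.randn(1024, 1024, device=device)
    start = time.time()
    c = torch.matmul(a, b)  # on CUDA this returns as soon as the kernel is queued
    if sync:
        # Block until all streams on the current device have finished.
        torch.cuda.synchronize(torch.cuda.current_device())
    return (time.time() - start) * 1e3  # milliseconds

if torch.cuda.is_available():
    print(time_matmul("cuda", sync=False))  # mostly launch overhead, misleadingly small
    print(time_matmul("cuda", sync=True))   # includes actual kernel execution time
```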

File tree: 2 files changed (+9, −4 lines)


benchmarks/operator_benchmark/benchmark_core.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -249,17 +249,18 @@ def _iteration_result_is_significant(self, iters, run_time_sec, curr_test_total_
     def _launch_forward(self, test_case, iters, print_per_iter):
         """ Use Python's timeit module to measure execution time (unit: second).
         """
+        cuda_sync = True if 'cuda' in test_case.test_config.test_name else False
         func = test_case.run_forward
         if self.use_jit:
             func = test_case.run_jit_forward
-        forward_time = timeit.timeit(functools.partial(func, iters, print_per_iter), number=1)
+        forward_time = timeit.timeit(functools.partial(func, iters, print_per_iter, cuda_sync), number=1)
         return forward_time
 
     def _launch_backward(self, test_case, iters, print_per_iter=False):
         """ This function runs forward path of an op to get an output. Then the backward path is executed
         and the execution time is reported
         """
-        test_case.run_forward(num_runs=1, print_per_iter=False)
+        test_case.run_forward(num_runs=1, print_per_iter=False, cuda_sync=False)
         if test_case.framework == "PyTorch":
             test_case._output_mean()
         backward_time = timeit.timeit(functools.partial(test_case.run_backward, iters,
```
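A small sketch of the pattern used above (names are illustrative, not from the benchmark): `timeit.timeit` expects a zero-argument callable, so the new `cuda_sync` flag is bound as an extra positional argument with `functools.partial` before the call is timed.

```python
import functools
import timeit

def run_forward(num_runs, print_per_iter, cuda_sync):
    # Stand-in for the benchmark's forward runner.
    for _ in range(num_runs):
        pass  # the op under test would run here

# The flag is derived from the test name and forwarded positionally.
cuda_sync = 'cuda' in "add_M64_N64_K64_cuda"
forward_time = timeit.timeit(
    functools.partial(run_forward, 100, False, cuda_sync), number=1)
print("Forward Execution Time (s):", forward_time)
```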

benchmarks/operator_benchmark/benchmark_pytorch.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -127,7 +127,7 @@ def __init__(self, op_bench, test_config):
         self.framework = "PyTorch"
         self.time_series = []
 
-    def run_jit_forward(self, num_runs, print_per_iter=False):
+    def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
         """ Run the forward path of an op with JIT mode
         """
         if self.op_bench._jit_forward is None:
@@ -147,18 +147,22 @@ def _print_per_iter(self):
                 }
             ))
 
-    def run_forward(self, num_runs, print_per_iter):
+    def run_forward(self, num_runs, print_per_iter, cuda_sync):
         """ Run the forward path of an op with eager mode
         """
         if print_per_iter:
             for _ in range(num_runs):
                 start_time = time.time()
                 self.output = self.op_bench.forward()
+                if cuda_sync:
+                    torch.cuda.synchronize(torch.cuda.current_device())
                 end_time = time.time()
                 self.time_series.append((end_time - start_time) * 1e3)
         else:
             for _ in range(num_runs):
                 self.output = self.op_bench.forward()
+            if cuda_sync:
+                torch.cuda.synchronize(torch.cuda.current_device())
 
     def _output_mean(self):
         """ TODO (mingzhe): it is not necessary to sum up everything by myself,
```
