
Commit 1f44c8a

Update on "Delegate Python ~ (invert operator) to Tensor.bitwise_not()."
Delegate Python ~ (invert operator) to Tensor.bitwise_not().

Close #20024, close #22246, close #22262. Related: #22324.

gh-metadata: pytorch pytorch 22326 gh/xuhdev/8/head
2 parents 857ab08 + 9ca2457 commit 1f44c8a
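
For orientation before the per-file diffs: this stack makes Python's unary ~ on a tensor dispatch to Tensor.bitwise_not(). A minimal sketch of the resulting behavior (the bool values mirror the test_bitwise_not change below; the integer identity ~x == -x - 1 is ordinary two's-complement arithmetic):

import torch

# Bool tensors: ~ flips each element (see the test_torch.py hunk below).
a = torch.tensor([True, False])
assert torch.equal(~a, torch.tensor([False, True]))
assert torch.equal(~a, a.bitwise_not())  # ~ delegates to bitwise_not()

# Integer tensors: bitwise NOT, i.e. ~x == -x - 1 in two's complement.
b = torch.arange(5, dtype=torch.int32)
assert torch.equal(~b, -b - 1)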

File tree: 11 files changed, +119 -16 lines


test/test_mkldnn.py
Lines changed: 18 additions & 0 deletions

@@ -389,6 +389,24 @@ def test_zero_(self):
             x2.zero_().to_dense(),
         )
 
+    def test_is_mkldnn(self):
+        x = torch.randn(1, dtype=torch.float32)
+        self.assertFalse(x.is_mkldnn)
+        self.assertTrue(x.to_mkldnn().is_mkldnn)
+
+    def test_is_mkldnn_jit(self):
+        class EnsureMkldnn(torch.jit.ScriptModule):
+            @torch.jit.script_method
+            def forward(self, x):
+                if not x.is_mkldnn:
+                    x = x.to_mkldnn()
+                return x
+
+        m = EnsureMkldnn()
+        x = torch.randn(1, dtype=torch.float32)
+        self.assertTrue(m(x).is_mkldnn)
+        self.assertTrue(m(x.to_mkldnn()).is_mkldnn)
+
     def _test_imagenet_model(self, model):
         model = model.train(False).float()
         mkldnn_model = mkldnn_utils.to_mkldnn(copy.deepcopy(model))

test/test_throughput_benchmark.py
Lines changed: 1 addition & 2 deletions

@@ -65,8 +65,7 @@ def linear_test(self, Module):
             num_iters=1000,
         )
 
-        print("Avg latency (ms): {}".format(stats.latency_avg_ms))
-        print("Number of iterations: {}".format(stats.num_iters))
+        print(stats)
 
 
     def test_script_module(self):
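
The rewritten test leans on BenchmarkExecutionStats printing as a readable summary, so one print(stats) replaces the per-field prints. A hedged sketch of driving the benchmark from Python, assuming the torch.utils.ThroughputBenchmark wrapper; the Linear module and input shape are illustrative, not taken from the diff:

import torch
from torch.utils import ThroughputBenchmark

module = torch.nn.Linear(4, 4)  # stand-in for the test's module

bench = ThroughputBenchmark(module)
bench.add_input(torch.randn(4))
stats = bench.benchmark(
    num_calling_threads=4,
    num_warmup_iters=100,
    num_iters=1000,
)
print(stats)                 # one-shot summary, as in the updated test
print(stats.latency_avg_ms)  # individual fields remain accessible
print(stats.num_iters)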

test/test_torch.py
Lines changed: 1 addition & 1 deletion

@@ -1754,7 +1754,7 @@ def test_bitwise_not(self):
                           torch.ByteTensor, torch.LongTensor, torch.IntTensor, torch.ShortTensor, torch.CharTensor):
             if t == torch.BoolTensor:
                 a = torch.tensor([True, False])
-                expected_res = torch.BoolTensor([False, True])
+                expected_res = torch.tensor([False, True])
             else:
                 a = torch.arange(127, dtype=t.dtype)
                 expected_res = res.type(t)

tools/autograd/gen_variable_factories.py
Lines changed: 4 additions & 1 deletion

@@ -11,7 +11,10 @@
 FUNCTION_TEMPLATE = CodeTemplate("""\
 inline at::Tensor ${name}(${formals}) {
   ${pre_record_trace}
-  at::Tensor tensor = at::${name}(${actuals});
+  at::Tensor tensor = ([&]() {
+    at::AutoNonVariableTypeMode non_var_type_mode(true);
+    return at::${name}(${actuals});
+  })();
   at::Tensor result =
       autograd::make_variable_consuming(std::move(tensor), /*requires_grad=*/${requires_grad});
   ${post_record_trace}

tools/autograd/templates/variable_factories.h
Lines changed: 12 additions & 6 deletions

@@ -22,8 +22,10 @@ namespace torch {
 #define TENSOR(T, S, _1) \
   inline at::Tensor tensor( \
       at::ArrayRef<T> values, const at::TensorOptions& options) { \
-    at::Tensor result = \
-        at::tensor(values, at::TensorOptions(options).is_variable(false)); \
+    at::Tensor result = ([&]() { \
+      at::AutoNonVariableTypeMode non_var_type_mode(true); \
+      return at::tensor(values, at::TensorOptions(options).is_variable(false)); \
+    })(); \
     return autograd::make_variable(result, options.requires_grad()); \
   } \
   inline at::Tensor tensor( \

@@ -62,8 +64,10 @@ inline at::Tensor from_blob(
     at::IntArrayRef strides,
     const Deleter& deleter,
     const at::TensorOptions& options = at::TensorOptions()) {
-  at::Tensor tensor =
-      at::from_blob(data, sizes, strides, deleter, options.is_variable(false));
+  at::Tensor tensor = ([&]() {
+    at::AutoNonVariableTypeMode non_var_type_mode(true);
+    return at::from_blob(data, sizes, strides, deleter, options.is_variable(false));
+  })();
   return autograd::make_variable(tensor, options.requires_grad());
 }
 

@@ -96,8 +100,10 @@ inline at::Tensor from_blob(
     at::IntArrayRef sizes,
     const Deleter& deleter,
     const at::TensorOptions& options = at::TensorOptions()) {
-  at::Tensor tensor =
-      at::from_blob(data, sizes, deleter, options.is_variable(false));
+  at::Tensor tensor = ([&]() {
+    at::AutoNonVariableTypeMode non_var_type_mode(true);
+    return at::from_blob(data, sizes, deleter, options.is_variable(false));
+  })();
   return autograd::make_variable(tensor, options.requires_grad());
 }
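
The codegen and header changes above share one pattern: the raw factory call now runs inside an immediately-invoked lambda with at::AutoNonVariableTypeMode set, so the plain tensor is created with variable dispatch disabled and is wrapped as a variable exactly once by make_variable. The Python-visible contract of the factories is unchanged; a quick illustration:

import torch

# Factories return a fresh leaf variable whose requires_grad flag comes
# from the options, mirroring make_variable(result, options.requires_grad()).
t = torch.tensor([1.0, 2.0], requires_grad=True)
assert t.is_leaf and t.requires_grad
assert t.grad_fn is None  # freshly created, no autograd history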

torch/csrc/autograd/python_variable.cpp
Lines changed: 9 additions & 0 deletions

@@ -420,6 +420,14 @@ PyObject *THPVariable_is_sparse(THPVariable *self)
   END_HANDLE_TH_ERRORS
 }
 
+PyObject *THPVariable_is_mkldnn(THPVariable *self)
+{
+  HANDLE_TH_ERRORS
+  auto& self_ = self->cdata;
+  return torch::autograd::utils::wrap(self_.is_mkldnn());
+  END_HANDLE_TH_ERRORS
+}
+
 PyObject *THPVariable_is_quantized(THPVariable *self)
 {
   HANDLE_TH_ERRORS

@@ -468,6 +476,7 @@ static struct PyGetSetDef THPVariable_properties[] = {
   {"shape", (getter)THPVariable_get_shape, nullptr, nullptr, nullptr},
   {"is_cuda", (getter)THPVariable_is_cuda, nullptr, nullptr, nullptr},
   {"is_sparse", (getter)THPVariable_is_sparse, nullptr, nullptr, nullptr},
+  {"is_mkldnn", (getter)THPVariable_is_mkldnn, nullptr, nullptr, nullptr},
   {"is_quantized", (getter)THPVariable_is_quantized, nullptr, nullptr, nullptr},
   {"dtype", (getter)THPVariable_dtype, nullptr, nullptr, nullptr},
   {"layout", (getter)THPVariable_layout, nullptr, nullptr, nullptr},

torch/csrc/jit/register_prim_ops.cpp
Lines changed: 8 additions & 0 deletions

@@ -500,6 +500,14 @@ RegisterOperators reg(
           push(stack, a.is_cuda());
           return 0;
         }),
+    Operator(
+        "prim::is_mkldnn(Tensor a) -> bool",
+        [](Stack& stack) {
+          at::Tensor a;
+          pop(stack, a);
+          push(stack, a.is_mkldnn());
+          return 0;
+        }),
     Operator(
         "aten::cpu(Tensor(a) self) -> Tensor(a|b)",
         [](Stack& stack) {

torch/csrc/jit/script/sugared_value.cpp
Lines changed: 1 addition & 0 deletions

@@ -73,6 +73,7 @@ std::shared_ptr<SugaredValue> SimpleValue::attr(
       "device",
       "shape",
       "is_cuda",
+      "is_mkldnn",
       "requires_grad",
   };
   if (fields.count(field)) {
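
Together with the prim::is_mkldnn operator registered above, whitelisting the field in SimpleValue::attr lets TorchScript branch on it. A sketch using a scripted free function; the new test covers the equivalent ScriptModule form with @torch.jit.script_method:

import torch

@torch.jit.script
def ensure_mkldnn(x):
    # x.is_mkldnn compiles to the prim::is_mkldnn operator above.
    if not x.is_mkldnn:
        x = x.to_mkldnn()
    return x

assert ensure_mkldnn(torch.randn(1, dtype=torch.float32)).is_mkldnn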

torch/csrc/utils/throughput_benchmark-inl.h
Lines changed: 7 additions & 4 deletions

@@ -50,7 +50,7 @@ BenchmarkExecutionStats BenchmarkHelper<Input, Output, Model>::benchmark(
   int64_t initialized{0};
   int64_t finished{0};
   bool start{false};
-  std::atomic<int64_t> num_forwards{0};
+  std::atomic<int64_t> num_attempted_iters{0};
   std::vector<std::thread> callers;
 
   for (auto thread_id = 0; thread_id < config.num_calling_threads;

@@ -71,7 +71,7 @@ BenchmarkExecutionStats BenchmarkHelper<Input, Output, Model>::benchmark(
       }
     }
     LOG(INFO) << "Starting forward thread " << thread_id;
-    while (num_forwards.fetch_add(1) < config.num_iters) {
+    while (num_attempted_iters.fetch_add(1) < config.num_iters) {
       runOnce(std::move(thread_inputs[thread_id][input_iters[thread_id]]));
       ++input_iters[thread_id];
     }

@@ -115,9 +115,12 @@ BenchmarkExecutionStats BenchmarkHelper<Input, Output, Model>::benchmark(
           end_time - start_time)
           .count() /
       1000.0 / 1000.0;
+  // We use config.num_iters instead of num_attempted_iters as it is
+  // representative of the real work done. The last attempted iteration on
+  // each calling thread doesn't represent real work (i.e. running the model).
   stats.latency_avg_ms =
-      total_time_ms * config.num_calling_threads / num_forwards;
-  stats.num_iters = num_forwards;
+      total_time_ms * config.num_calling_threads / config.num_iters;
+  stats.num_iters = config.num_iters;
 
   for (auto& t : callers) {
     t.join();
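
A worked example of the corrected average-latency formula, with made-up numbers: fetch_add hands out tickets 0 through num_iters - 1, so exactly num_iters forwards run, while each calling thread's final (failing) check still bumps the counter. Dividing by the old num_forwards therefore slightly understated latency:

# Hypothetical timings, purely to show the arithmetic of the fix.
total_time_ms = 2500.0
num_calling_threads = 4
num_iters = 1000

num_attempted_iters = num_iters + num_calling_threads  # 1004 attempts

old_latency = total_time_ms * num_calling_threads / num_attempted_iters  # ~9.96 ms
new_latency = total_time_ms * num_calling_threads / num_iters            # 10.0 ms
print(old_latency, new_latency)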
