Skip to content

Commit e84ac97

Browse files
committed
Update on "Simplify copy kernel"
Using the new type promotion and dynamic casting added to `TensorIterator`, the copy kernels could be greatly simplified. For benchmarks, see #28352 (comment) [ghstack-poisoned]
2 parents 0cf634c + 66826d1 commit e84ac97

File tree

112 files changed

+1736
-1178
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

112 files changed

+1736
-1178
lines changed

.jenkins/caffe2/test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
135135
# default pip version is too old(9.0.2), unable to support tag `manylinux2010`.
136136
# Fix the pip error: Couldn't find a version that satisfies the requirement
137137
sudo pip install --upgrade pip
138-
pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==0.5.0.dev1012
138+
pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==0.5.0.dev1020
139139
fi
140140
"$ROOT_DIR/scripts/onnx/test.sh"
141141
fi

android/pytorch_android/src/main/cpp/pytorch_jni.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -615,7 +615,6 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
615615
}
616616
auto output = [&]() {
617617
torch::autograd::AutoGradMode guard(false);
618-
at::AutoNonVariableTypeMode non_var_type_mode(true);
619618
return module_.forward(std::move(inputs));
620619
}();
621620
return JIValue::newJIValueFromAtIValue(output);
@@ -638,7 +637,6 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
638637
if (auto method = module_.find_method(methodName)) {
639638
auto output = [&]() {
640639
torch::autograd::AutoGradMode guard(false);
641-
at::AutoNonVariableTypeMode non_var_type_mode(true);
642640
return (*method)(std::move(inputs));
643641
}();
644642
return JIValue::newJIValueFromAtIValue(output);

aten/src/ATen/Declarations.cwrap

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -762,20 +762,6 @@
762762
output: True
763763
- THTensor* self
764764
]]
765-
[[
766-
name: _th_log1p
767-
cname: log1p
768-
types:
769-
- floating_point
770-
backends:
771-
- CUDA
772-
variants: function
773-
return: argument 0
774-
arguments:
775-
- arg: THTensor* result
776-
output: True
777-
- THTensor* self
778-
]]
779765
[[
780766
name: _th_exp
781767
cname: exp
@@ -944,20 +930,6 @@
944930
output: True
945931
- THTensor* self
946932
]]
947-
[[
948-
name: _th_sqrt
949-
cname: sqrt
950-
types:
951-
- floating_point
952-
backends:
953-
- CUDA
954-
variants: function
955-
return: argument 0
956-
arguments:
957-
- arg: THTensor* result
958-
output: True
959-
- THTensor* self
960-
]]
961933
[[
962934
name: _th_frac_
963935
types:

aten/src/ATen/core/ATenDispatch.h

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <mutex>
1515
#include <ATen/core/interned_strings.h>
1616
#include <ATen/core/stack.h>
17+
#include <torch/csrc/jit/script/function_schema_parser.h>
1718

1819
// TODO: Rewrite this comment
1920
//
@@ -74,7 +75,7 @@ namespace detail {
7475
}
7576
}
7677

77-
using FallbackBoxedFunction = void(const char* schema, torch::jit::Stack*);
78+
using FallbackBoxedFunction = void(const c10::FunctionSchema& schema, torch::jit::Stack*);
7879

7980
// Assume T is decayed
8081
template <typename T>
@@ -129,9 +130,19 @@ class CAFFE2_API ATenOpTable {
129130

130131
C10_NORETURN void reportError(TensorTypeId tid) const;
131132

133+
const FunctionSchema& function_schema() const {
134+
std::lock_guard<std::mutex> lock(mutex_);
135+
if (!parsed_schema_.has_value()) {
136+
parsed_schema_ = torch::jit::parseSchema(schema_);
137+
}
138+
return *parsed_schema_;
139+
}
140+
132141
friend class ATenDispatch;
133142

134143
std::string schema_;
144+
mutable c10::optional<c10::FunctionSchema> parsed_schema_ = c10::nullopt;
145+
mutable std::mutex mutex_;
135146
void* function_table_[static_cast<int64_t>(TensorTypeId::NumTensorIds)] = {nullptr};
136147
};
137148

@@ -141,9 +152,9 @@ class CAFFE2_API ATenDispatch {
141152
ATenDispatch& registerOp(TensorTypeId id, const char* schema, FuncType* fn) {
142153
std::lock_guard<std::mutex> lock(mutex_);
143154
if (op_tables_.find(schema) == op_tables_.end()) {
144-
op_tables_.insert(std::make_pair(schema, ATenOpTable(schema)));
155+
op_tables_.insert(std::make_pair(schema, c10::guts::make_unique<ATenOpTable>(schema)));
145156
}
146-
op_tables_.at(schema).registerOp(id, reinterpret_cast<void*>(fn));
157+
op_tables_.at(schema)->registerOp(id, reinterpret_cast<void*>(fn));
147158
return *this;
148159
}
149160

@@ -157,23 +168,23 @@ class CAFFE2_API ATenDispatch {
157168
auto iter = op_tables_.find(schema);
158169
TORCH_CHECK(iter != op_tables_.end(),
159170
"No functions are registered for schema ", schema);
160-
return &iter->second;
171+
return iter->second.get();
161172
}
162173

163174
FallbackBoxedFunction* getFallbackBoxedOp(TensorTypeId tid) const {
164175
return boxed_fallback_table_[static_cast<size_t>(tid)];
165176
}
166177

167178
private:
168-
std::unordered_map<std::string, ATenOpTable> op_tables_;
179+
std::unordered_map<std::string, std::unique_ptr<ATenOpTable>> op_tables_;
169180
FallbackBoxedFunction* boxed_fallback_table_[static_cast<int64_t>(TensorTypeId::NumTensorIds)] = {nullptr};
170181
std::mutex mutex_;
171182
};
172183

173184
CAFFE2_API ATenDispatch& globalATenDispatch();
174185

175186
template<class Result, class... Args>
176-
Result callBoxedFallback(const char* schema, FallbackBoxedFunction* boxed_fallback_fn, Args&&... args,
187+
Result callBoxedFallback(const c10::FunctionSchema& schema, FallbackBoxedFunction* boxed_fallback_fn, Args&&... args,
177188
// NB: enable_if must occur in function parameter, because MSVC
178189
// doesn't like it when it's a template argument next to
179190
// a parameter pack
@@ -189,7 +200,7 @@ Result callBoxedFallback(const char* schema, FallbackBoxedFunction* boxed_fallba
189200

190201
template<
191202
class Result, class... Args>
192-
Result callBoxedFallback(const char* schema, FallbackBoxedFunction* boxed_fallback_fn, Args&&... args,
203+
Result callBoxedFallback(const c10::FunctionSchema& schema, FallbackBoxedFunction* boxed_fallback_fn, Args&&... args,
193204
typename c10::guts::enable_if_t<
194205
supports_boxed_fallback<Result, Args...>::value,
195206
std::nullptr_t
@@ -232,7 +243,7 @@ Result ATenOpTable::callUnboxed(Args... args) const {
232243
auto* boxed_fallback_fn = globalATenDispatch().getFallbackBoxedOp(tid);
233244
if (C10_UNLIKELY(boxed_fallback_fn)) {
234245
if (supports_boxed_fallback<Result, Args...>::value) {
235-
return callBoxedFallback<Result, Args...>(schema_.c_str(), boxed_fallback_fn, std::forward<Args>(args)...);
246+
return callBoxedFallback<Result, Args...>(function_schema(), boxed_fallback_fn, std::forward<Args>(args)...);
236247
} else {
237248
TORCH_INTERNAL_ASSERT(0, schema_, " does not support boxed fallback, but boxed fallback for ", tid, " was available");
238249
}

aten/src/ATen/cpu/vec256/vec256.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,17 +145,27 @@ inline interleave2<double>(const Vec256<double>& a, const Vec256<double>& b) {
145145
// swap lanes:
146146
// a_swapped = {a0, a1, b0, b1}
147147
// b_swapped = {a2, a3, b2, b3}
148+
#if __cpp_binary_literals >= 201304L
149+
auto a_swapped = _mm256_permute2f128_pd(a, b, 0b0100000);
150+
auto b_swapped = _mm256_permute2f128_pd(a, b, 0b0110001);
151+
#else // TODO Remove else case once switch to C++14 is finished
148152
static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart
149153
static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart
150154
auto a_swapped = _mm256_permute2f128_pd(a, b, swap_ctrl_a);
151155
auto b_swapped = _mm256_permute2f128_pd(a, b, swap_ctrl_b);
156+
#endif
152157

153158
// group cols crossing lanes:
154159
// return {a0, b0, a1, b1}
155160
// {a2, b2, a3, b3}
161+
#if __cpp_binary_literals >= 201304L
162+
return std::make_pair(_mm256_permute4x64_pd(a_swapped, 0b11011000),
163+
_mm256_permute4x64_pd(b_swapped, 0b11011000));
164+
#else // TODO Remove else case once switch to C++14 is finished
156165
static constexpr int group_ctrl = 0 | (2 << 2) | (1 << 4) | (3 << 6); // 0, 2, 1, 3
157166
return std::make_pair(_mm256_permute4x64_pd(a_swapped, group_ctrl),
158167
_mm256_permute4x64_pd(b_swapped, group_ctrl));
168+
#endif
159169
}
160170

161171
template <>
@@ -169,10 +179,15 @@ inline interleave2<float>(const Vec256<float>& a, const Vec256<float>& b) {
169179
// a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3}
170180
// b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7}
171181
// TODO: can we support caching this?
182+
#if __cpp_binary_literals >= 201304L
183+
auto a_swapped = _mm256_permute2f128_ps(a, b, 0b0100000);
184+
auto b_swapped = _mm256_permute2f128_ps(a, b, 0b0110001);
185+
#else // TODO Remove else case once switch to C++14 is finished
172186
static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart
173187
static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart
174188
auto a_swapped = _mm256_permute2f128_ps(a, b, swap_ctrl_a);
175189
auto b_swapped = _mm256_permute2f128_ps(a, b, swap_ctrl_b);
190+
#endif
176191

177192
// group cols crossing lanes:
178193
// return {a0, b0, a1, b1, a2, b2, a3, b3}
@@ -194,17 +209,27 @@ inline deinterleave2<double>(const Vec256<double>& a, const Vec256<double>& b) {
194209
// group cols crossing lanes:
195210
// a_grouped = {a0, a1, b0, b1}
196211
// b_grouped = {a2, a3, b2, b3}
212+
#if __cpp_binary_literals >= 201304L
213+
auto a_grouped = _mm256_permute4x64_pd(a, 0b11011000);
214+
auto b_grouped = _mm256_permute4x64_pd(b, 0b11011000);
215+
#else // TODO Remove else case once switch to C++14 is finished
197216
static constexpr int group_ctrl = 0 | (2 << 2) | (1 << 4) | (3 << 6); // 0, 2, 1, 3
198217
auto a_grouped = _mm256_permute4x64_pd(a, group_ctrl);
199218
auto b_grouped = _mm256_permute4x64_pd(b, group_ctrl);
219+
#endif
200220

201221
// swap lanes:
202222
// return {a0, a1, a2, a3}
203223
// {b0, b1, b2, b3}
224+
#if __cpp_binary_literals >= 201304L
225+
return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, 0b0100000),
226+
_mm256_permute2f128_pd(a_grouped, b_grouped, 0b0110001));
227+
#else // TODO Remove else case once switch to C++14 is finished
204228
static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart
205229
static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart
206230
return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, swap_ctrl_a),
207231
_mm256_permute2f128_pd(a_grouped, b_grouped, swap_ctrl_b));
232+
#endif
208233
}
209234

210235
template <>
@@ -225,10 +250,15 @@ inline deinterleave2<float>(const Vec256<float>& a, const Vec256<float>& b) {
225250
// swap lanes:
226251
// return {a0, a1, a2, a3, a4, a5, a6, a7}
227252
// {b0, b1, b2, b3, b4, b5, b6, b7}
253+
#if __cpp_binary_literals >= 201304L
254+
return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, 0b0100000),
255+
_mm256_permute2f128_ps(a_grouped, b_grouped, 0b0110001));
256+
#else // TODO Remove else case once switch to C++14 is finished
228257
static constexpr int swap_ctrl_a = 0 | (2 << 4); // 0, 2. 4 bits apart
229258
static constexpr int swap_ctrl_b = 1 | (3 << 4); // 1, 3. 4 bits apart
230259
return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, swap_ctrl_a),
231260
_mm256_permute2f128_ps(a_grouped, b_grouped, swap_ctrl_b));
261+
#endif
232262
}
233263

234264
#endif // defined(__AVX2__)

aten/src/ATen/function_wrapper.py

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def TypedDict(name, attrs, total=True): # type: ignore
174174
#ifdef USE_STATIC_DISPATCH
175175
${static_dispatch_method_body}
176176
#else
177-
static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::${name}", "${overload_name}"}).value();
177+
static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::${operator_name}", "${overload_name}"}).value();
178178
return c10::Dispatcher::singleton().callUnboxedOnly<${formals_types_with_return}>(
179179
op, impl::dispatchTypeId(${inferred_type_set})${method_actuals_with_comma_prefix});
180180
#endif
@@ -185,7 +185,7 @@ def TypedDict(name, attrs, total=True): # type: ignore
185185
#ifdef USE_STATIC_DISPATCH
186186
${static_dispatch_method_body}
187187
#else
188-
static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::${name}", "${overload_name}"}).value();
188+
static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::${operator_name}", "${overload_name}"}).value();
189189
return c10::Dispatcher::singleton().callUnboxed<${formals_types_with_return}>(
190190
op, impl::dispatchTypeId(${inferred_type_set})${method_actuals_with_comma_prefix});
191191
#endif
@@ -217,7 +217,7 @@ def TypedDict(name, attrs, total=True): # type: ignore
217217
${static_dispatch_function_body}
218218
#else
219219
static c10::OperatorHandle op = c10::Dispatcher::singleton()
220-
.findSchema({"aten::${name}", "${overload_name}"}).value();
220+
.findSchema({"aten::${operator_name}", "${overload_name}"}).value();
221221
return c10::Dispatcher::singleton().callUnboxedOnly<${formals_types_with_return}>(
222222
op, impl::dispatchTypeId(${inferred_type_set})${native_actuals_with_comma_prefix});
223223
#endif
@@ -230,7 +230,7 @@ def TypedDict(name, attrs, total=True): # type: ignore
230230
${static_dispatch_function_body}
231231
#else
232232
static c10::OperatorHandle op = c10::Dispatcher::singleton()
233-
.findSchema({"aten::${name}", "${overload_name}"}).value();
233+
.findSchema({"aten::${operator_name}", "${overload_name}"}).value();
234234
return c10::Dispatcher::singleton().callUnboxed<${formals_types_with_return}>(
235235
op, impl::dispatchTypeId(${inferred_type_set})${native_actuals_with_comma_prefix});
236236
#endif
@@ -239,10 +239,17 @@ def TypedDict(name, attrs, total=True): # type: ignore
239239

240240
# In order to rely on the linker to strip unused ops, it requires us to dispatch statically
241241
# in Functions.h and TensorMethods.h.
242+
#
243+
# NB: The default body also needs to apply a variable guard, as in some
244+
# situations what we think is a default body actually does have an
245+
# explicit derivative, and thereby would have gotten unwrapped by
246+
# the time you get to the implementation.
242247
STATIC_DISPATCH_FUNCTION_DEFAULT_BODY = CodeTemplate("""\
248+
at::AutoNonVariableTypeMode _var_guard(true);
243249
${return_call} TypeDefault::${native_type_method_dispatch}(${native_arguments});
244250
""")
245251
STATIC_DISPATCH_FUNCTION_SWITCH_BODY = CodeTemplate("""\
252+
at::AutoNonVariableTypeMode _var_guard(true);
246253
switch(tensorTypeIdToBackend(impl::dispatchTypeId(${type_set}))) {
247254
${static_dispatch_function_switches}
248255
default:
@@ -272,6 +279,32 @@ def TypedDict(name, attrs, total=True): # type: ignore
272279
#endif
273280
}
274281
""")
282+
C10_UNBOXEDONLY_FACTORY_DEFINITION = CodeTemplate("""\
283+
static inline ${return_type} ${api_name}(${formals}) {
284+
#ifdef USE_STATIC_DISPATCH
285+
${static_dispatch_function_body}
286+
#else
287+
globalLegacyTypeDispatch().initForTensorTypeSet(${inferred_type_set});
288+
static c10::OperatorHandle op = c10::Dispatcher::singleton()
289+
.findSchema({"aten::${operator_name}", "${overload_name}"}).value();
290+
return c10::Dispatcher::singleton().callUnboxedOnly<${formals_types_with_return}>(
291+
op, impl::dispatchTypeId(${inferred_type_set})${native_actuals_with_comma_prefix});
292+
#endif
293+
}
294+
""")
295+
C10_FACTORY_DEFINITION = CodeTemplate("""\
296+
static inline ${return_type} ${api_name}(${formals}) {
297+
#ifdef USE_STATIC_DISPATCH
298+
${static_dispatch_function_body}
299+
#else
300+
globalLegacyTypeDispatch().initForTensorTypeSet(${inferred_type_set});
301+
static c10::OperatorHandle op = c10::Dispatcher::singleton()
302+
.findSchema({"aten::${operator_name}", "${overload_name}"}).value();
303+
return c10::Dispatcher::singleton().callUnboxed<${formals_types_with_return}>(
304+
op, impl::dispatchTypeId(${inferred_type_set})${native_actuals_with_comma_prefix});
305+
#endif
306+
}
307+
""")
275308

276309
ZERO_DIM_CHECK = CodeTemplate("""\
277310
if (${check_name}.dim() == 0) {
@@ -880,7 +913,9 @@ def get_return_types(option):
880913

881914
def format_return_type(return_types):
882915
# type: (List[ReturnType]) -> str
883-
if len(return_types) == 1:
916+
if len(return_types) == 0:
917+
return "void"
918+
elif len(return_types) == 1:
884919
return return_types[0]['type']
885920
return "std::tuple<{}>".format(','.join(r['type'] for r in return_types))
886921

@@ -1109,9 +1144,6 @@ def native_get_return_types(option):
11091144
if isinstance(t_raw, string_type):
11101145
t = t_raw
11111146
name = None
1112-
elif t_raw is None:
1113-
t = 'void'
1114-
name = None
11151147
else:
11161148
t = t_raw['type']
11171149
name = t_raw['name']
@@ -1142,7 +1174,6 @@ def process_native(option):
11421174
assert option['python_module'] == '' or option['python_module'] == 'nn', \
11431175
"Found python_module of {} for decl {}, but only \'\' string or \'nn\' are supported".format(
11441176
option['python_module'], option['name'])
1145-
11461177
formals = native_get_formals(option)
11471178
option['formals_list'] = formals
11481179
option['formals'] = [format_formal(f) for f in formals]
@@ -1263,8 +1294,16 @@ def gen_namespace_function(option, multidispatch_tensors):
12631294
option, native_arguments=option['native_actuals'])
12641295

12651296
if is_factory_method:
1266-
fn_definition = FACTORY_DEFINITION.substitute(
1267-
option, static_dispatch_function_body=static_dispatch_function_body)
1297+
if option['use_c10_dispatcher'] == 'no':
1298+
fn_definition = FACTORY_DEFINITION.substitute(
1299+
option, static_dispatch_function_body=static_dispatch_function_body)
1300+
elif option['use_c10_dispatcher'] == 'unboxed_only':
1301+
fn_definition = C10_UNBOXEDONLY_FACTORY_DEFINITION.substitute(
1302+
option, static_dispatch_function_body=static_dispatch_function_body)
1303+
else:
1304+
assert option['use_c10_dispatcher'] == 'full'
1305+
fn_definition = C10_FACTORY_DEFINITION.substitute(
1306+
option, static_dispatch_function_body=static_dispatch_function_body)
12681307
else:
12691308
if option['use_c10_dispatcher'] == 'no':
12701309
fn_definition = FUNCTION_DEFINITION.substitute(

0 commit comments

Comments
 (0)