Commit 87b1623

Merge branch 'master' of github.com:pytorch/pytorch into bfloat-i0
2 parents: 3450bba + a011b86

35 files changed: +609 -337 lines

.jenkins/pytorch/test.sh

Lines changed: 2 additions & 0 deletions
@@ -340,6 +340,8 @@ test_benchmarks() {
     mkdir -p ${BENCHMARK_DATA}
     pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_legacy_old.json --fuser=old --executor=legacy
     python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_legacy_old.json
+    pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_profiling_te.json --fuser=te --executor=profiling
+    python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_profiling_te.json
     assert_git_not_dirty
   fi
 }

CODEOWNERS

Lines changed: 11 additions & 13 deletions
@@ -4,10 +4,6 @@
 /docs/cpp @goldsborough @ebetica @yf225
 /torch/csrc/api/ @ebetica @goldsborough @yf225
 /test/cpp/api/ @ebetica @goldsborough @yf225
-/torch/lib/c10d/ @pietern @mrshenli @zhaojuanmao
-/torch/csrc/distributed/ @pietern @mrshenli @zhaojuanmao
-/torch/distributed/ @apaszke @pietern @mrshenli @zhaojuanmao
-/test/test_c10d.py @pietern @mrshenli @zhaojuanmao
 /torch/utils/cpp_extension.py @goldsborough @fmassa @soumith @ezyang
 
 # Not there to strictly require the approval, but to be tagged as a reviewer
@@ -20,17 +16,19 @@
 /torch/jit/ @apaszke
 /torch/utils/data/ @apaszke
 
-# Distributed RPC Framework.
-/torch/csrc/distributed/rpc @mrshenli @pritamdamania87 @zhaojuanmao
-/torch/csrc/distributed/autograd @mrshenli @pritamdamania87 @zhaojuanmao
-/torch/distributed/rpc @mrshenli @pritamdamania87 @zhaojuanmao
-/torch/distributed/autograd @mrshenli @pritamdamania87 @zhaojuanmao
-/torch/distributed/optim @mrshenli @pritamdamania87 @zhaojuanmao @aazzolini
-
 # Tensorpipe RPC Agent.
 /torch/csrc/distributed/rpc/tensorpipe_agent.cpp @jiayisuse @osalpekar @lw @beauby
 /torch/csrc/distributed/rpc/tensorpipe_agent.h @jiayisuse @osalpekar @lw @beauby
 
+# Distributed package
+# This list is mostly if you'd like to be tagged as reviewer, feel free to add
+# or remove yourself from it.
+/torch/lib/c10d/ @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma
+/torch/csrc/distributed/ @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma
+/torch/distributed/ @apaszke @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma
+
 # Distributed tests
-/test/distributed @mrshenli @pritamdamania87 @zhaojuanmao
-/torch/testing/_internal/distributed @mrshenli @pritamdamania87 @zhaojuanmao
+# This list is mostly if you'd like to be tagged as reviewer, feel free to add
+# or remove yourself from it.
+/test/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma
+/torch/testing/_internal/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma

CONTRIBUTING.md

Lines changed: 3 additions & 2 deletions
@@ -825,8 +825,9 @@ static_assert(std::is_same(A*, decltype(A::singleton()))::value, "hmm");
 
 [Clang-Tidy](https://clang.llvm.org/extra/clang-tidy/index.html) is a C++
 linter and static analysis tool based on the clang compiler. We run clang-tidy
-in our CI to make sure that new C++ code is safe, sane and efficient. See our
-[.travis.yml](https://github.com/pytorch/pytorch/blob/master/.travis.yml) file
+in our CI to make sure that new C++ code is safe, sane and efficient. See the
+[`clang-tidy` job in our GitHub Workflow's
+lint.yml file](https://github.com/pytorch/pytorch/blob/master/.github/workflows/lint.yml)
 for the simple commands we use for this.
 
 To run clang-tidy locally, follow these steps:

aten/src/ATen/native/cuda/CompareEQKernel.cu

Lines changed: 2 additions & 4 deletions
@@ -12,10 +12,8 @@ namespace at { namespace native {
 
 void eq_kernel_cuda(TensorIterator& iter) {
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "eq_cuda", [&]() {
-    AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "eq_cuda", [&] {
-      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
-        return a == b;
-      });
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
+      return a == b;
     });
   });
 }

aten/src/ATen/native/cuda/CompareGEKernel.cu

Lines changed: 2 additions & 4 deletions
@@ -12,10 +12,8 @@ namespace at { namespace native {
 
 void ge_kernel_cuda(TensorIterator& iter) {
   AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "ge_cuda", [&]() {
-    AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "ge_cuda", [&] {
-      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
-        return a >= b;
-      });
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
+      return a >= b;
     });
   });
 }

aten/src/ATen/native/cuda/CompareGTKernel.cu

Lines changed: 2 additions & 4 deletions
@@ -12,10 +12,8 @@ namespace at { namespace native {
 
 void gt_kernel_cuda(TensorIterator& iter) {
   AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "gt_cuda", [&]() {
-    AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "gt_cuda", [&] {
-      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
-        return a > b;
-      });
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
+      return a > b;
     });
   });
 }

aten/src/ATen/native/cuda/CompareLEKernel.cu

Lines changed: 2 additions & 4 deletions
@@ -12,10 +12,8 @@ namespace at { namespace native {
 
 void le_kernel_cuda(TensorIterator& iter) {
   AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "le_cuda", [&]() {
-    AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "le_cuda", [&] {
-      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
-        return a <= b;
-      });
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
+      return a <= b;
     });
   });
 }

aten/src/ATen/native/cuda/CompareLTKernel.cu

Lines changed: 2 additions & 4 deletions
@@ -12,10 +12,8 @@ namespace at { namespace native {
 
 void lt_kernel_cuda(TensorIterator& iter) {
   AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "lt_cuda", [&]() {
-    AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "lt_cuda", [&] {
-      gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
-        return a < b;
-      });
+    gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
+      return a < b;
     });
   });
 }
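
All five comparison kernels above receive the same mechanical edit: the AT_SKIP_BFLOAT16_IF_NOT_ROCM wrapper is deleted so the gpu_kernel_with_scalars lambda runs for every dtype the AT_DISPATCH_* macro names, bfloat16 included, on CUDA builds as well as ROCm. A simplified sketch of what such a guard plausibly looked like (illustrative stand-ins only, not the verbatim ATen macro) shows why removing it lifts the restriction:

```cpp
#include <stdexcept>
#include <type_traits>

// Illustrative stand-ins; the real at::BFloat16 and AT_ERROR live in c10/ATen.
namespace at { struct BFloat16 {}; }
#define AT_ERROR(...) throw std::runtime_error("not implemented")

// Simplified sketch of the removed guard: on ROCm builds it just runs the
// wrapped body; on plain CUDA builds it rejects BFloat16 before the body runs.
#if defined(__HIP_PLATFORM_HCC__)
#define AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, NAME, ...) __VA_ARGS__()
#else
#define AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, NAME, ...) \
  if (std::is_same<scalar_t, at::BFloat16>::value) {      \
    AT_ERROR(NAME, " not implemented for 'BFloat16'");    \
  } else {                                                \
    __VA_ARGS__();                                        \
  }
#endif

int main() {
  bool ran = false;
  AT_SKIP_BFLOAT16_IF_NOT_ROCM(float, "eq_cuda", [&] { ran = true; })
  return ran ? 0 : 1;  // float passes the guard; at::BFloat16 would throw
}
```

With the guard gone, the kBFloat16 entry in the dispatch macro is the only gate, so the bfloat16 instantiation compiles and dispatches unconditionally.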

aten/src/ATen/native/cuda/PowKernel.cu

Lines changed: 4 additions & 8 deletions
@@ -110,10 +110,8 @@ void pow_tensor_tensor_kernel(TensorIterator& iter) {
     });
   } else if (isFloatingType(iter.dtype())) {
     AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "pow_cuda", [&]() {
-      AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "pow_cuda", [&] {
-        gpu_kernel(iter, []GPU_LAMBDA(scalar_t base, scalar_t exp) -> scalar_t {
-          return pow_(base, exp);
-        });
+      gpu_kernel(iter, []GPU_LAMBDA(scalar_t base, scalar_t exp) -> scalar_t {
+        return pow_(base, exp);
       });
     });
   } else {
@@ -170,10 +168,8 @@ void pow_tensor_scalar_kernel(TensorIterator& iter, Scalar exp_scalar) {
     });
   } else if (isFloatingType(iter.dtype()) || exp_scalar.isIntegral(false)) {
     AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "pow_cuda", [&]() {
-      AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "pow_cuda", [&] {
-        const auto exp = exp_scalar.to<scalar_t>();
-        pow_tensor_scalar_kernel_impl<scalar_t>(iter, exp);
-      });
+      const auto exp = exp_scalar.to<scalar_t>();
+      pow_tensor_scalar_kernel_impl<scalar_t>(iter, exp);
     });
   } else {
     const auto exp = exp_scalar.to<float>();
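
The two pow kernels get the identical unwrapping. The second hunk also shows the scalar-exponent path: the dtype-erased Scalar is converted to the dispatched scalar_t inside the dispatch lambda, and a single templated impl then serves every dtype, bfloat16 now included. A minimal stand-alone analog of that flow (Scalar and pow_impl here are hypothetical stand-ins for at::Scalar and pow_tensor_scalar_kernel_impl, not the real ATen types):

```cpp
#include <cmath>
#include <iostream>

// Stand-in for at::Scalar: a dtype-erased value, converted on demand.
struct Scalar {
  double v;
  template <typename T>
  T to() const { return static_cast<T>(v); }
};

// Stand-in for pow_tensor_scalar_kernel_impl<scalar_t>: one template
// instantiated per dispatched dtype.
template <typename scalar_t>
scalar_t pow_impl(scalar_t base, scalar_t exp) {
  return static_cast<scalar_t>(
      std::pow(static_cast<double>(base), static_cast<double>(exp)));
}

int main() {
  Scalar exp_scalar{2.0};
  const auto exp = exp_scalar.to<float>();  // mirrors exp_scalar.to<scalar_t>()
  std::cout << pow_impl<float>(3.0f, exp) << "\n";  // prints 9
  return 0;
}
```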

aten/src/ATen/record_function.cpp

Lines changed: 25 additions & 12 deletions
@@ -92,11 +92,14 @@ class CallbackManager {
     bool found_needs_ids = false;
     auto init_handles = [
         scope, &found_active_cb, &found_needs_inputs, &found_needs_ids](
-        CallbackHandles& handles, RecordFunctionCallbacks& cbs) {
+        CallbackHandles& handles, RecordFunctionCallbacks& cbs, ObserverContextList& ctx_list) {
       handles.clear();
+
+      size_t num_callbacks = 0;
       for (const auto& cb : cbs) {
         if (cb.first.shouldRun(scope)) {
           handles.push_back(cb.second);
+          ++num_callbacks;
           found_active_cb = true;
           if (cb.first.needsInputs()) {
             found_needs_inputs = true;
@@ -106,10 +109,12 @@
           }
         }
       }
+      // Pre-allocate observer context list with nullptr.
+      ctx_list.resize(num_callbacks);
     };
 
-    init_handles(rec_fn.sorted_active_tls_handles_, sorted_tls_callbacks_);
-    init_handles(rec_fn.sorted_active_global_handles_, sorted_global_callbacks_);
+    init_handles(rec_fn.sorted_active_tls_handles_, sorted_tls_callbacks_, rec_fn.tls_ctx_);
+    init_handles(rec_fn.sorted_active_global_handles_, sorted_global_callbacks_, rec_fn.global_ctx_);
     rec_fn.active = found_active_cb;
     rec_fn.needs_inputs = found_needs_inputs;
     if (found_needs_ids && found_active_cb) {
@@ -121,11 +126,13 @@
     mergeRunCallbacks(
         sorted_global_callbacks_,
         rf.sorted_active_global_handles_,
+        rf.global_ctx_,
         /* is_start */ true,
         rf);
     mergeRunCallbacks(
         sorted_tls_callbacks_,
         rf.sorted_active_tls_handles_,
+        rf.tls_ctx_,
         /* is_start */ true,
         rf);
     rf.called_start_callbacks_ = true;
@@ -135,21 +142,30 @@
     mergeRunCallbacks(
         sorted_global_callbacks_,
         rf.sorted_active_global_handles_,
+        rf.global_ctx_,
         /* is_start */ false,
         rf);
     mergeRunCallbacks(
         sorted_tls_callbacks_,
         rf.sorted_active_tls_handles_,
+        rf.tls_ctx_,
         /* is_start */ false,
         rf);
   }
 
  private:
   bool tryRunCallback(
-      const std::function<void(const RecordFunction&)>& fn,
-      RecordFunction& rf) {
+      const RecordFunctionCallback& rfcb,
+      RecordFunction& rf,
+      std::unique_ptr<ObserverContext>& ctx,
+      bool is_start) {
     try {
-      fn(rf);
+      if (is_start) {
+        ctx = rfcb.start()(rf);
+      }
+      else {
+        rfcb.end()(rf, ctx.get());
+      }
       return true;
     } catch (const std::exception &e) {
       LOG(WARNING) << "Exception in RecordFunction callback: "
@@ -165,11 +181,12 @@
   void mergeRunCallbacks(
       const RecordFunctionCallbacks& sorted_callbacks,
       const CallbackHandles& sorted_handles,
+      ObserverContextList& ctx_list,
      bool is_start,
       RecordFunction& rf) {
     size_t num_executed = 0;
     size_t idx_c = 0;
-    for (size_t idx_h = 0; idx_h < sorted_handles.size(); ++idx_h) {
+    for (size_t idx_h = 0; idx_h < sorted_handles.size() && idx_h < ctx_list.size(); ++idx_h) {
       while (idx_c < sorted_callbacks.size() &&
              sorted_callbacks[idx_c].second < sorted_handles[idx_h]) {
         ++idx_c;
@@ -178,11 +195,7 @@
         break;
       }
       if (sorted_callbacks[idx_c].second == sorted_handles[idx_h]) {
-        if (is_start) {
-          tryRunCallback(sorted_callbacks[idx_c].first.start(), rf);
-        } else {
-          tryRunCallback(sorted_callbacks[idx_c].first.end(), rf);
-        }
+        tryRunCallback(sorted_callbacks[idx_c].first, rf, ctx_list[idx_h], is_start);
         ++num_executed;
       }
     }
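
The substance of this change: a start callback can now return a per-invocation ObserverContext that is stored positionally and handed back to the matching end callback. init_handles sizes one context slot per active handle, and tryRunCallback takes the whole RecordFunctionCallback plus an is_start flag so it can either populate or consume that slot. A minimal self-contained sketch of the pattern (every type here is an illustrative stand-in for the real ones in ATen/record_function.h):

```cpp
#include <functional>
#include <iostream>
#include <memory>
#include <vector>

// Illustrative stand-ins for the ATen types.
struct RecordFunction {};

struct ObserverContext {
  virtual ~ObserverContext() = default;
};

struct TimerContext : ObserverContext {
  long start_ns = 0;
};

struct Callback {
  // A start callback may allocate per-invocation state...
  std::function<std::unique_ptr<ObserverContext>(const RecordFunction&)> start;
  // ...which the matching end callback receives back as a raw pointer.
  std::function<void(const RecordFunction&, ObserverContext*)> end;
};

// One context slot per active callback, so start() results can be routed to
// the corresponding end() even with several observers registered at once.
using ObserverContextList = std::vector<std::unique_ptr<ObserverContext>>;

int main() {
  Callback cb{
      [](const RecordFunction&) {
        auto ctx = std::make_unique<TimerContext>();
        ctx->start_ns = 42;  // e.g. a clock read at op start
        return std::unique_ptr<ObserverContext>(std::move(ctx));
      },
      [](const RecordFunction&, ObserverContext* ctx) {
        auto* t = static_cast<TimerContext*>(ctx);
        std::cout << "started at " << t->start_ns << "\n";
      }};

  RecordFunction rf;
  ObserverContextList ctx_list(1);  // pre-sized, like ctx_list.resize(...)
  ctx_list[0] = cb.start(rf);       // the is_start == true path
  cb.end(rf, ctx_list[0].get());    // the is_start == false path
  return 0;
}
```

Bounding the merge loop by ctx_list.size() as well as sorted_handles.size() keeps the positional lookup ctx_list[idx_h] in range even if the two lists ever disagree in length.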
