Commit 29b1ab2

[WIP][DeviceMesh] Use a shared_state to cache pg per layout, root_mesh and rank_map
ghstack-source-id: 49b401d
Pull Request resolved: #166010
1 parent 35dce00 commit 29b1ab2


7 files changed (+430, -207 lines)


test/distributed/test_device_mesh.py

Lines changed: 3 additions & 0 deletions
@@ -1000,6 +1000,9 @@ def test_unflatten_mesh_3d(self):
         )
         non_ep_mesh = global_mesh._unflatten(0, (2, 2, 2), ("dp", "cp", "tp"))
         ep_mesh = global_mesh._unflatten(0, (2, 2, 2), ("dp", "ep", "ep_tp"))
+        # test pg caching when unflatten into same layout.
+        self.assertEqual(non_ep_mesh["dp"].get_group(), ep_mesh["dp"].get_group())
+        self.assertEqual(non_ep_mesh["tp"].get_group(), ep_mesh["ep_tp"].get_group())
         self.assertEqual(non_ep_mesh["cp"].mesh, ep_mesh["ep"].mesh)
         self.assertEqual(non_ep_mesh["tp"].mesh, ep_mesh["ep_tp"].mesh)
         mesh_3d = global_mesh._unflatten(0, (4, 2, 1), ("dp", "cp", "tp"))

torch/csrc/distributed/c10d/Backend.hpp

Lines changed: 28 additions & 0 deletions
@@ -1,5 +1,6 @@
 #pragma once

+#include <functional>
 #include <memory>
 #include <utility>
 #include <vector>
@@ -48,6 +49,12 @@ class TORCH_API Backend : public torch::CustomClassHolder {
     const std::string backend;
     std::string group_name;
     std::vector<uint64_t> global_ranks_in_group;
+
+    bool operator==(const Options& other) const noexcept {
+      return timeout == other.timeout && backend == other.backend &&
+          group_name == other.group_name &&
+          global_ranks_in_group == other.global_ranks_in_group;
+    }
   };

   explicit Backend(int rank, int size);
@@ -511,3 +518,24 @@ class TORCH_API Backend : public torch::CustomClassHolder {
 };

 } // namespace c10d
+
+// small helper
+inline void hash_combine(std::size_t& seed, std::size_t value) noexcept {
+  seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
+}
+
+namespace std {
+
+template <>
+struct hash<c10d::Backend::Options> {
+  std::size_t operator()(const c10d::Backend::Options& o) const noexcept {
+    std::size_t h = 0;
+    hash_combine(h, std::hash<long long>{}(o.timeout.count()));
+    hash_combine(h, std::hash<std::string>{}(o.backend));
+    hash_combine(h, std::hash<std::string>{}(o.group_name));
+    for (auto x : o.global_ranks_in_group)
+      hash_combine(h, std::hash<uint64_t>{}(x));
+    return h;
+  }
+};
+} // namespace std
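
Together, the added operator== and the std::hash<c10d::Backend::Options> specialization make an Options value usable as a key in a hashed container, which is what lets the shared state look up an already-created process group instead of building a new one for an identical configuration. The snippet below is a minimal, self-contained sketch of that pattern; the simplified Options struct and the string-valued cache are illustrative stand-ins, not the actual c10d types.

#include <chrono>
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Simplified stand-in for c10d::Backend::Options (illustrative only).
struct Options {
  std::chrono::milliseconds timeout{0};
  std::string backend;
  std::string group_name;
  std::vector<uint64_t> global_ranks_in_group;

  bool operator==(const Options& other) const noexcept {
    return timeout == other.timeout && backend == other.backend &&
        group_name == other.group_name &&
        global_ranks_in_group == other.global_ranks_in_group;
  }
};

// Boost-style combiner, mirroring the helper added to Backend.hpp.
inline void hash_combine(std::size_t& seed, std::size_t value) noexcept {
  seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
}

namespace std {
template <>
struct hash<Options> {
  std::size_t operator()(const Options& o) const noexcept {
    std::size_t h = 0;
    hash_combine(h, std::hash<long long>{}(o.timeout.count()));
    hash_combine(h, std::hash<std::string>{}(o.backend));
    hash_combine(h, std::hash<std::string>{}(o.group_name));
    for (auto x : o.global_ranks_in_group)
      hash_combine(h, std::hash<uint64_t>{}(x));
    return h;
  }
};
} // namespace std

int main() {
  // Cache keyed by Options: two equal configurations hit the same entry.
  std::unordered_map<Options, std::string> pg_cache;
  Options a;
  a.timeout = std::chrono::milliseconds(1000);
  a.backend = "nccl";
  a.group_name = "dp";
  a.global_ranks_in_group = {0, 1, 2, 3};
  Options b = a;
  pg_cache.emplace(a, "cached process group");
  std::cout << (pg_cache.count(b) == 1) << '\n'; // prints 1: cache hit
  return 0;
}

The 0x9e3779b97f4a7c15 value is the 64-bit golden-ratio constant used by the usual boost-style hash_combine recipe; any reasonable mixer would serve the same purpose.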

torch/csrc/distributed/c10d/ProcessGroupGloo.hpp

Lines changed: 37 additions & 0 deletions
@@ -260,6 +260,23 @@ class TORCH_API ProcessGroupGloo : public Backend {

     std::vector<std::shared_ptr<::gloo::transport::Device>> devices;
     int threads;
+
+    bool operator==(const Options& other) const noexcept {
+      // 1) compare base first
+      if (!static_cast<const Backend::Options&>(*this).operator==(other))
+        return false;
+
+      // 2) compare devices by identity
+      if (devices.size() != other.devices.size())
+        return false;
+      for (size_t i = 0; i < devices.size(); ++i) {
+        if (devices[i].get() != other.devices[i].get()) // pointer identity
+          return false;
+      }
+
+      // 3) compare added scalar fields
+      return threads == other.threads;
+    }
   };

   const std::string getBackendName() const override {
@@ -494,4 +511,24 @@ class TORCH_API ProcessGroupGloo : public Backend {

 } // namespace c10d

+namespace std {
+template <>
+struct hash<c10d::ProcessGroupGloo::Options> {
+  std::size_t operator()(
+      const c10d::ProcessGroupGloo::Options& o) const noexcept {
+    std::size_t h = 0;
+    // reuse base hash
+    hash_combine(
+        h,
+        std::hash<c10d::Backend::Options>{}(
+            static_cast<const c10d::Backend::Options&>(o)));
+    // add derived fields
+    for (auto const& dev : o.devices)
+      hash_combine(h, std::hash<const void*>{}(dev.get()));
+    hash_combine(h, std::hash<int>{}(o.threads));
+    return h;
+  }
+};
+} // namespace std
+
 #endif // USE_C10D_GLOO
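
The Gloo options add two wrinkles to the same pattern: the derived operator== first delegates to the Backend::Options comparison via a static_cast, and the std::shared_ptr<::gloo::transport::Device> entries are compared and hashed by pointer identity rather than by value. A compact sketch of that base-plus-derived idiom, using hypothetical BaseOptions/DerivedOptions types in place of the c10d classes:

#include <functional>
#include <memory>
#include <string>

inline void hash_combine(std::size_t& seed, std::size_t value) noexcept {
  seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
}

// Hypothetical base, mirroring the role of c10d::Backend::Options.
struct BaseOptions {
  std::string group_name;
  bool operator==(const BaseOptions& other) const noexcept {
    return group_name == other.group_name;
  }
};

namespace std {
template <>
struct hash<BaseOptions> {
  std::size_t operator()(const BaseOptions& o) const noexcept {
    return std::hash<std::string>{}(o.group_name);
  }
};
} // namespace std

struct Device {}; // stand-in for ::gloo::transport::Device

// Hypothetical derived options, mirroring ProcessGroupGloo::Options.
struct DerivedOptions : BaseOptions {
  std::shared_ptr<Device> device;
  int threads{2};

  bool operator==(const DerivedOptions& other) const noexcept {
    // Base fields first; the device is compared by identity, not by value.
    return static_cast<const BaseOptions&>(*this) == other &&
        device.get() == other.device.get() && threads == other.threads;
  }
};

namespace std {
template <>
struct hash<DerivedOptions> {
  std::size_t operator()(const DerivedOptions& o) const noexcept {
    // Reuse the base hash, then fold in the derived fields.
    std::size_t h =
        std::hash<BaseOptions>{}(static_cast<const BaseOptions&>(o));
    hash_combine(h, std::hash<const void*>{}(o.device.get()));
    hash_combine(h, std::hash<int>{}(o.threads));
    return h;
  }
};
} // namespace std

Pointer identity is a conservative choice for a cache key: two option objects that reach the same device through different shared_ptr instances will not compare equal, so the cache may create a second group in that case, but it can never conflate two genuinely different device sets.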

torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp

Lines changed: 69 additions & 0 deletions
@@ -550,6 +550,33 @@ class TORCH_API ProcessGroupNCCL : public Backend {
     // the int value of `NCCL_SPLIT_NOCOLOR` (-1) instead.
     int split_color{-2};
 #endif
+
+    bool operator==(const Options& other) const noexcept {
+      // 1) compare base first
+      if (!static_cast<const Backend::Options&>(*this).operator==(other))
+        return false;
+
+      // 2) simple fields
+      if (is_high_priority_stream != other.is_high_priority_stream) {
+        return false;
+      }
+      if (split_color != other.split_color) {
+        return false;
+      }
+
+      // 3) split_from: compare by identity
+      if (split_from.get() != other.split_from.get()) {
+        return false;
+      }
+
+#ifdef NCCL_HAS_CONFIG
+      // 4) config
+      if (std::memcmp(&config, &other.config, sizeof(ncclConfig_t)) != 0) {
+        return false;
+      }
+#endif
+      return true;
+    }
   };

   // Helper class related to TORCH_NCCL_DESYNC_DEBUG
@@ -1504,4 +1531,46 @@ typedef bool (*gil_checker_t)();
 TORCH_API gil_checker_t& get_gil_checker();
 } // namespace c10d

+#ifdef NCCL_HAS_CONFIG
+inline std::size_t hash_nccl_config(const ncclConfig_t& cfg) noexcept {
+  const unsigned char* p = reinterpret_cast<const unsigned char*>(&cfg);
+  std::size_t h = 0;
+  for (std::size_t i = 0; i < sizeof(cfg); ++i) {
+    hash_combine(h, static_cast<std::size_t>(p[i]));
+  }
+  return h;
+}
+#endif
+
+namespace std {
+
+template <>
+struct hash<c10d::ProcessGroupNCCL::Options> {
+  std::size_t operator()(
+      const c10d::ProcessGroupNCCL::Options& o) const noexcept {
+    std::size_t h = 0;
+
+    // 1) base
+    hash_combine(
+        h,
+        std::hash<c10d::Backend::Options>{}(
+            static_cast<const c10d::Backend::Options&>(o)));
+
+    // 2) trivial extras
+    hash_combine(h, std::hash<bool>{}(o.is_high_priority_stream));
+    hash_combine(h, std::hash<int>{}(o.split_color));
+
+    // 3) pointer identity for split_from
+    hash_combine(h, std::hash<const void*>{}(o.split_from.get()));
+
+#ifdef NCCL_HAS_CONFIG
+    // 4) config — option A: hash bytes
+    hash_combine(h, hash_nccl_config(o.config));
+#endif
+    return h;
+  }
+};
+
+} // namespace std
+
 #endif // USE_C10D_NCCL
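
For ncclConfig_t the diff treats the struct as an opaque blob: memcmp for equality and a byte-wise hash via hash_nccl_config. The sketch below shows that byte-hashing approach on a hypothetical Config struct, not the real ncclConfig_t; one caveat is that padding bytes participate in both the comparison and the hash, so the structs should be initialized consistently (e.g. zero-initialized) before fields are set.

#include <cstddef>
#include <cstring>
#include <functional>

inline void hash_combine(std::size_t& seed, std::size_t value) noexcept {
  seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
}

// Hypothetical trivially copyable config, standing in for ncclConfig_t.
struct Config {
  int blocking;
  int min_ctas;
  int max_ctas;
};

// Equality as raw memory comparison, mirroring the std::memcmp check in
// ProcessGroupNCCL::Options::operator==.
inline bool config_equal(const Config& a, const Config& b) noexcept {
  return std::memcmp(&a, &b, sizeof(Config)) == 0;
}

// Hash every byte of the struct, mirroring hash_nccl_config.
inline std::size_t config_hash(const Config& cfg) noexcept {
  const unsigned char* p = reinterpret_cast<const unsigned char*>(&cfg);
  std::size_t h = 0;
  for (std::size_t i = 0; i < sizeof(cfg); ++i) {
    hash_combine(h, static_cast<std::size_t>(p[i]));
  }
  return h;
}

The same identity-versus-value trade-off noted for the Gloo devices applies to split_from here: options split from different parent groups hash differently even if the resulting layouts happen to match.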

torch/csrc/distributed/c10d/init.cpp

Lines changed: 25 additions & 2 deletions
@@ -3107,7 +3107,14 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
       .def_readwrite(
           "global_ranks_in_group",
           &::c10d::Backend::Options::global_ranks_in_group)
-      .def_readwrite("group_name", &::c10d::Backend::Options::group_name);
+      .def_readwrite("group_name", &::c10d::Backend::Options::group_name)
+      .def(
+          "__eq__",
+          [](const ::c10d::Backend::Options& a,
+             const ::c10d::Backend::Options& b) { return a == b; })
+      .def("__hash__", [](const ::c10d::Backend::Options& a) {
+        return std::hash<::c10d::Backend::Options>{}(a);
+      });

 #ifdef USE_C10D_GLOO
   auto processGroupGloo =
@@ -3121,7 +3128,14 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
           processGroupGloo, "_Options", backendOptions)
       .def(py::init<>())
       .def_readwrite("_devices", &::c10d::ProcessGroupGloo::Options::devices)
-      .def_readwrite("_threads", &::c10d::ProcessGroupGloo::Options::threads);
+      .def_readwrite("_threads", &::c10d::ProcessGroupGloo::Options::threads)
+      .def(
+          "__eq__",
+          [](const ::c10d::ProcessGroupGloo::Options& a,
+             const ::c10d::ProcessGroupGloo::Options& b) { return a == b; })
+      .def("__hash__", [](const ::c10d::ProcessGroupGloo::Options& a) {
+        return std::hash<::c10d::ProcessGroupGloo::Options>{}(a);
+      });

   processGroupGloo
       .def_static(
@@ -3481,6 +3495,15 @@ Example::
           "split_from", &::c10d::ProcessGroupNCCL::Options::split_from)
       .def_readwrite(
           "split_color", &::c10d::ProcessGroupNCCL::Options::split_color)
+      .def(
+          "__eq__",
+          [](const ::c10d::ProcessGroupNCCL::Options& a,
+             const ::c10d::ProcessGroupNCCL::Options& b) { return a == b; })
+      .def(
+          "__hash__",
+          [](const ::c10d::ProcessGroupNCCL::Options& a) {
+            return std::hash<::c10d::ProcessGroupNCCL::Options>{}(a);
+          })
       .def(
           "__copy__",
           [](const ::c10d::ProcessGroupNCCL::Options& self) {
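
Binding __eq__ and __hash__ is what propagates the C++ equality and hash semantics to Python, so option objects can key a Python dict such as the per-layout process-group cache. A minimal pybind11 sketch of the same idea, using a hypothetical ToyOptions type rather than the actual c10d bindings:

#include <pybind11/pybind11.h>

#include <cstddef>
#include <functional>
#include <string>

namespace py = pybind11;

// Hypothetical options type; the c10d bindings follow the same shape.
struct ToyOptions {
  std::string group_name;
  bool operator==(const ToyOptions& other) const noexcept {
    return group_name == other.group_name;
  }
};

namespace std {
template <>
struct hash<ToyOptions> {
  std::size_t operator()(const ToyOptions& o) const noexcept {
    return std::hash<std::string>{}(o.group_name);
  }
};
} // namespace std

PYBIND11_MODULE(toy_options, m) {
  py::class_<ToyOptions>(m, "ToyOptions")
      .def(py::init<>())
      .def_readwrite("group_name", &ToyOptions::group_name)
      // With __eq__ and __hash__ defined consistently, instances can be
      // used as dict keys or set members on the Python side.
      .def("__eq__",
           [](const ToyOptions& a, const ToyOptions& b) { return a == b; })
      .def("__hash__", [](const ToyOptions& a) {
        return std::hash<ToyOptions>{}(a);
      });
}

From Python, opts1 == opts2 and hash(opts1) == hash(opts2) then hold whenever the underlying fields match, which is exactly what a dict-based cache needs.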

torch/distributed/_local_tensor/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -951,7 +951,9 @@ def get_coordinate(self: DeviceMesh) -> Optional[list[int] | None]:

     coords: list[dict[int, int]] = [{} for _ in range(self.ndim)]
     for r in lm.ranks:
-        rank_tensor = self._layout.remap_to_tensor(self._rank_map)
+        rank_tensor = self._layout.remap_to_tensor(
+            self._shared_state.get_rank_map()
+        )
         rank_coords = (rank_tensor == r).nonzero().tolist()
         assert len(rank_coords) == 1
         for d, c in enumerate(rank_coords[0][1:]):
