Collective dispatching from Process Group #91257
Conversation
[ghstack-poisoned]
🔗 Helpful Links: 🧪 See artifacts and rendered test results at hud.pytorch.org/pr/91257
Note: Links to docs will display an error until the docs builds have been completed.
✅ No Failures as of commit f005be3.
This comment was automatically generated by Dr. CI and updates every 15 minutes.
@H-Huang has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.
kwen2501 left a comment:
Thanks for the clean-up! LGTM.
Please see my inline comment.
c10::intrusive_ptr<Work> broadcast(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::TensorList tensors,
    const BroadcastOptions& opts) {
  // TODO: handle the case of using a PythonProcessGroup which is used in
  // Reducer.cpp. This can be removed once
  // https://github.com/pytorch/pytorch/issues/90659 is resolved
  if (!process_group->hasBackends()) {
    auto tensor_vec = tensors.vec();
    return process_group->broadcast(tensor_vec, opts);
  }

  static auto op =
      c10::Dispatcher::singleton()
          .findSchemaOrThrow("c10d::broadcast_", "")
          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
              at::TensorList,
              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
              int64_t,
              int64_t,
              int64_t)>();
  // It's awkward to unbox the opts here and box them again in the custom C++
  // op. But it's also complicated to make opts as a CustomClassHolder. Leave it
  // as it is now.
  return std::get<1>(op.call(
      tensors,
      process_group,
      opts.rootRank,
      opts.rootTensor,
      opts.timeout.count()));
}

c10::intrusive_ptr<Work> allreduce(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::TensorList tensors,
    const AllreduceOptions& opts) {
  // TODO: handle the case of using a PythonProcessGroup which is used in
  // Reducer.cpp. This can be removed once
  // https://github.com/pytorch/pytorch/issues/90659 is resolved
  if (!process_group->hasBackends()) {
    auto tensor_vec = tensors.vec();
    return process_group->allreduce(tensor_vec, opts);
  }

  static auto op =
      c10::Dispatcher::singleton()
          .findSchemaOrThrow("c10d::allreduce_", "")
          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
              at::TensorList,
              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
              const c10::intrusive_ptr<::c10d::ReduceOp>&,
              int64_t)>();

  return std::get<1>(op.call(
      tensors,
      process_group,
      c10::make_intrusive<ReduceOp>(opts.reduceOp),
      opts.timeout.count()));
}

c10::intrusive_ptr<Work> allreduce_coalesced(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::TensorList tensors,
    const AllreduceCoalescedOptions& opts) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::allreduce_coalesced_", "")
                       .typed<c10::intrusive_ptr<::c10d::Work>(
                           at::TensorList,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           const c10::intrusive_ptr<::c10d::ReduceOp>&,
                           int64_t)>();

  return op.call(
      tensors,
      process_group,
      c10::make_intrusive<ReduceOp>(opts.reduceOp),
      opts.timeout.count());
}

c10::intrusive_ptr<Work> allgather(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    const std::vector<std::vector<at::Tensor>>& output_tensors,
    at::TensorList input_tensors,
    const AllgatherOptions& opts) {
  // TODO: handle the case of using a PythonProcessGroup which is used in
  // Reducer.cpp. This can be removed once
  // https://github.com/pytorch/pytorch/issues/90659 is resolved
  if (!process_group->hasBackends()) {
    auto input_tensors_vec = input_tensors.vec();
    return process_group->allgather(
        const_cast<std::vector<std::vector<at::Tensor>>&>(output_tensors),
        input_tensors_vec,
        opts);
  }

  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::allgather_", "")
                       .typed<std::tuple<
                           std::vector<std::vector<at::Tensor>>,
                           c10::intrusive_ptr<Work>>(
                           const std::vector<std::vector<at::Tensor>>&,
                           at::TensorList,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           int64_t)>();

  return std::get<1>(op.call(
      output_tensors, input_tensors, process_group, opts.timeout.count()));
}

c10::intrusive_ptr<Work> _allgather_base(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::Tensor& output_tensor,
    at::Tensor& input_tensor,
    const AllgatherOptions& opts) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::_allgather_base_", "")
                       .typed<std::tuple<at::Tensor, c10::intrusive_ptr<Work>>(
                           at::Tensor&,
                           at::Tensor&,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&)>();

  return std::get<1>(op.call(output_tensor, input_tensor, process_group));
}

c10::intrusive_ptr<Work> allgather_coalesced(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    const std::vector<std::vector<at::Tensor>>& output_lists,
    const at::TensorList& input_list,
    const AllgatherOptions& opts) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::allgather_coalesced_", "")
                       .typed<c10::intrusive_ptr<Work>(
                           const std::vector<std::vector<at::Tensor>>&,
                           const at::TensorList&,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&)>();

  return op.call(output_lists, input_list, process_group);
}

c10::intrusive_ptr<Work> reduce_scatter(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    const at::TensorList& output_tensors,
    const std::vector<std::vector<at::Tensor>>& input_tensors,
    const ReduceScatterOptions& opts) {
  static auto op =
      c10::Dispatcher::singleton()
          .findSchemaOrThrow("c10d::reduce_scatter_", "")
          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
              const at::TensorList&,
              const std::vector<std::vector<at::Tensor>>&,
              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
              const c10::intrusive_ptr<::c10d::ReduceOp>&,
              int64_t)>();
  return std::get<1>(op.call(
      output_tensors,
      input_tensors,
      process_group,
      c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp),
      opts.timeout.count()));
}

c10::intrusive_ptr<Work> _reduce_scatter_base(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::Tensor& output_tensor,
    at::Tensor& input_tensor,
    const ReduceScatterOptions& opts) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::_reduce_scatter_base_", "")
                       .typed<std::tuple<at::Tensor, c10::intrusive_ptr<Work>>(
                           at::Tensor&,
                           at::Tensor&,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           const c10::intrusive_ptr<::c10d::ReduceOp>&,
                           int64_t)>();
  return std::get<1>(op.call(
      output_tensor,
      input_tensor,
      process_group,
      c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp),
      opts.timeout.count()));
}

c10::intrusive_ptr<Work> reduce(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::TensorList tensors,
    const ReduceOptions& opts) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::reduce_", "")
                       .typed<c10::intrusive_ptr<::c10d::Work>(
                           at::TensorList,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           const c10::intrusive_ptr<::c10d::ReduceOp>&,
                           int64_t,
                           int64_t,
                           int64_t)>();
  return op.call(
      tensors,
      process_group,
      c10::make_intrusive<ReduceOp>(opts.reduceOp),
      opts.rootRank,
      opts.rootTensor,
      opts.timeout.count());
}

c10::intrusive_ptr<Work> gather(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    const std::vector<std::vector<at::Tensor>>& output_tensors,
    const at::TensorList& input_tensors,
    const GatherOptions& opts) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::gather_", "")
                       .typed<c10::intrusive_ptr<::c10d::Work>(
                           const std::vector<std::vector<at::Tensor>>&,
                           const at::TensorList&,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           int64_t,
                           int64_t)>();
  return op.call(
      output_tensors,
      input_tensors,
      process_group,
      opts.rootRank,
      opts.timeout.count());
}

c10::intrusive_ptr<Work> scatter(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    const at::TensorList& output_tensors,
    const std::vector<std::vector<at::Tensor>>& input_tensors,
    const ScatterOptions& opts) {
  static auto op =
      c10::Dispatcher::singleton()
          .findSchemaOrThrow("c10d::scatter_", "")
          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
              const at::TensorList&,
              const std::vector<std::vector<at::Tensor>>&,
              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
              int64_t,
              int64_t)>();
  return std::get<1>(op.call(
      output_tensors,
      input_tensors,
      process_group,
      opts.rootRank,
      opts.timeout.count()));
}

c10::intrusive_ptr<Work> alltoall(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    const at::TensorList& output_tensors,
    const at::TensorList& input_tensors,
    const AllToAllOptions& opts) {
  static auto op =
      c10::Dispatcher::singleton()
          .findSchemaOrThrow("c10d::alltoall_", "")
          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
              const at::TensorList&,
              const at::TensorList&,
              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
              int64_t)>();
  return std::get<1>(op.call(
      output_tensors, input_tensors, process_group, opts.timeout.count()));
}

c10::intrusive_ptr<Work> alltoall_base(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::Tensor& output,
    at::Tensor& input,
    std::vector<int64_t> output_split_sizes,
    std::vector<int64_t> input_split_sizes,
    const AllToAllOptions& opts) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::alltoall_base_", "")
                       .typed<c10::intrusive_ptr<::c10d::Work>(
                           at::Tensor&,
                           at::Tensor&,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           std::vector<int64_t>,
                           std::vector<int64_t>,
                           int64_t)>();
  return op.call(
      output,
      input,
      process_group,
      output_split_sizes,
      input_split_sizes,
      opts.timeout.count());
}

void monitored_barrier(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    const BarrierOptions& opts,
    bool wait_all_ranks) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::monitored_barrier_", "")
                       .typed<void(
                           at::Tensor,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           const std::vector<int64_t>&,
                           int64_t,
                           bool)>();
  // Default to using cpu implementation, monitored barrier is only for GLOO
  at::Tensor tensor = at::empty({0}, at::TensorOptions().device(at::kCPU));
  op.call(
      tensor,
      process_group,
      opts.device_ids,
      opts.timeout.count(),
      wait_all_ranks);
}

c10::intrusive_ptr<Work> barrier(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    const BarrierOptions& opts) {
  static at::Tensor tensor;
  // TODO: if nccl was specified then use it
  if (process_group->getBackendType() ==
      c10d::ProcessGroup::BackendType::NCCL) {
    // set cuda tensor
    tensor = at::empty(
        {1}, at::TensorOptions().device(at::DeviceType::CUDA).dtype(at::kByte));
  } else {
    // Default to using cpu implementation
    tensor = at::empty(
        {1}, at::TensorOptions().device(at::DeviceType::CPU).dtype(at::kByte));
  }

  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::barrier", "")
                       .typed<c10::intrusive_ptr<::c10d::Work>(
                           at::Tensor,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           const std::vector<int64_t>&,
                           int64_t)>();

  return op.call(tensor, process_group, opts.device_ids, opts.timeout.count());
}

c10::intrusive_ptr<Work> send(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::TensorList tensors,
    int64_t dstRank,
    int64_t tag) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::send", "")
                       .typed<c10::intrusive_ptr<::c10d::Work>(
                           at::TensorList,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           int64_t,
                           int64_t)>();
  return op.call(tensors, process_group, dstRank, tag);
}

c10::intrusive_ptr<Work> recv(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::TensorList tensors,
    int64_t srcRank,
    int64_t tag) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::recv_", "")
                       .typed<c10::intrusive_ptr<::c10d::Work>(
                           at::TensorList,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           int64_t,
                           int64_t)>();
  return op.call(tensors, process_group, srcRank, tag);
}

c10::intrusive_ptr<Work> recv_any_source(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::TensorList tensors,
    int64_t tag) {
  static auto op = c10::Dispatcher::singleton()
                       .findSchemaOrThrow("c10d::recv_any_source_", "")
                       .typed<c10::intrusive_ptr<::c10d::Work>(
                           at::TensorList,
                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
                           int64_t)>();
  return op.call(tensors, process_group, tag);
}
After removing this code block, should we also remove the corresponding API declarations in Ops.hpp?
Good point! Thanks
Fixes #90932
Fixes #90659
Remove redundant collective operation definitions by calling the ops directly from `ProcessGroup`
Context: #86225
Differential Revision: [D42854676](https://our.internmc.facebook.com/intern/diff/D42854676)
[ghstack-poisoned]
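For illustration only (editor's sketch, not a quote of the merged diff): "calling the ops directly from `ProcessGroup`" means the dispatch that the removed `broadcast` wrapper above performed can live inside the `ProcessGroup` method itself. The member-function placement and the use of `unsafe_reclaim_from_nonowning` are assumptions for this sketch.

```cpp
// Hypothetical sketch: ProcessGroup::broadcast dispatching the registered
// "c10d::broadcast_" op itself, reusing the schema from the removed wrapper.
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>

namespace c10d {

c10::intrusive_ptr<Work> ProcessGroup::broadcast(
    std::vector<at::Tensor>& tensors,
    const BroadcastOptions& opts) {
  static auto op =
      c10::Dispatcher::singleton()
          .findSchemaOrThrow("c10d::broadcast_", "")
          .typed<std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
              at::TensorList,
              const c10::intrusive_ptr<::c10d::ProcessGroup>&,
              int64_t,
              int64_t,
              int64_t)>();
  // Wrap `this` as a non-owning intrusive_ptr so the registered op receives
  // the same argument types as before; opts are still unboxed here.
  return std::get<1>(op.call(
      tensors,
      c10::intrusive_ptr<ProcessGroup>::unsafe_reclaim_from_nonowning(this),
      opts.rootRank,
      opts.rootTensor,
      opts.timeout.count()));
}

} // namespace c10d
```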
@H-Huang has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.
@pytorchbot merge
Merge started. Your change will be merged once all checks pass (ETA 0-4 Hours). Learn more about merging in the wiki. Questions? Feedback? Please reach out to the PyTorch DevX Team.
In #91257, we removed direct calls to methods in ops.cpp, so this update also removes ops.hpp. Pull Request resolved: #94532. Approved by: https://github.com/kwen2501
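For reference, the declarations that such a follow-up deletes are just the prototypes of the wrappers shown in the diff above. A reconstructed excerpt of what an Ops.hpp of that shape might contain (the header path, namespace, and exact set of declarations are assumptions, not a quote of the real file):

```cpp
// Reconstructed excerpt (illustrative only): free-function prototypes
// mirroring the definitions removed in the diff above.
#pragma once

#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>

namespace c10d {
namespace ops {

c10::intrusive_ptr<Work> broadcast(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::TensorList tensors,
    const BroadcastOptions& opts);

c10::intrusive_ptr<Work> allreduce(
    const c10::intrusive_ptr<ProcessGroup>& process_group,
    at::TensorList tensors,
    const AllreduceOptions& opts);

// ... one declaration per wrapper above (allgather, reduce_scatter, scatter,
// gather, alltoall, barrier, send/recv, etc.)

} // namespace ops
} // namespace c10d
```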
Stack from ghstack (oldest at bottom):
Fixes #90932
Fixes #90659
Remove redundant collective operation definitions by calling the ops directly from `ProcessGroup`
Context: #86225
Differential Revision: D42854676
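To make the effect of the change concrete, here is a hypothetical before/after call site (the `ops::` spelling and the surrounding helper are assumptions for illustration): callers such as Reducer.cpp that previously went through the Ops.cpp wrappers can invoke the collective on the `ProcessGroup` itself, and the dispatch to the registered `c10d::*` op happens inside that method.

```cpp
// Hypothetical before/after call site (illustrative only).
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>

void sync_params(
    const c10::intrusive_ptr<c10d::ProcessGroup>& pg,
    std::vector<at::Tensor>& params) {
  c10d::BroadcastOptions opts;
  opts.rootRank = 0;

  // Before (removed by this PR): free-function wrapper declared in Ops.hpp.
  //   auto work = c10d::ops::broadcast(pg, params, opts);

  // After: call the ProcessGroup method directly; it dispatches the
  // registered "c10d::broadcast_" op internally.
  auto work = pg->broadcast(params, opts);
  work->wait();
}
```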