
Commit c857acf

Update on "[Distributed] getNumKeys API to c10d TCPStore"
This PR adds a getNumKeys API to TCPStore, which returns the number of keys currently in the store. This API will be useful for applications related to debug logging in ProcessGroupNCCL going forward. This PR also adds C++ tests for the API; Python tests are added in #45223. We will build on this functionality in the future by implementing the API for FileStore, HashStore, RedisStore, and ZeusStore.

Differential Revision: [D22985085](https://our.internmc.facebook.com/intern/diff/D22985085/)

**NOTE FOR REVIEWERS**: This PR has internal Facebook-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D22985085/)!

[ghstack-poisoned]
2 parents 9da03d9 + 00e704e commit c857acf
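For context, here is a minimal sketch of how the new key-count query might be exercised from Python. It assumes the C++ getNumKeys call is surfaced on the `TCPStore` binding as `num_keys()`, as in later PyTorch releases; the actual Python tests live in #45223, and the host/port values below are illustrative.

```python
# Minimal sketch (assumes the num_keys() Python binding for getNumKeys).
from datetime import timedelta

from torch.distributed import TCPStore

# Single-process "master" store, just for illustration.
store = TCPStore("127.0.0.1", 29500, 1, True, timedelta(seconds=30))

store.set("first_key", "first_value")
store.set("second_key", "second_value")

# Returns the number of keys currently held by the store. The store may also
# track internal bookkeeping keys (e.g. the rendezvous init key), so the
# value is not necessarily exactly the number of set() calls made here.
print(store.num_keys())
```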

File tree

177 files changed, +11515 −4853 lines changed


aten/src/ATen/LegacyTHFunctionsCPU.cpp

Lines changed: 0 additions & 255 deletions
Large diffs are not rendered by default.

aten/src/ATen/LegacyTHFunctionsCPU.h

Lines changed: 0 additions & 3 deletions
@@ -39,9 +39,6 @@ Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
 Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max);
 Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max);
 Tensor _th_trace(const Tensor & self);
-Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
-Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
-Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha);
 std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
 std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
 std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors);

aten/src/ATen/core/boxing/KernelFunction.cpp

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*) {
 void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, Stack*) {
   TORCH_INTERNAL_ASSERT(0,
     op.operator_name(), " has kernels registered to both Math and a backend mapped to AutogradOther. "
+    "This makes the backend kernel unreachable (see Note [Ambiguity in AutogradOther kernel]). "
     "If it's intended to override Math kernel behavior, please open an issue to request a dedicated "
     "Autograd dispatch key for the backend.");
 }

aten/src/ATen/core/dispatch/OperatorEntry.cpp

Lines changed: 8 additions & 5 deletions
@@ -157,10 +157,9 @@ const KernelFunction& OperatorEntry::computeDispatchTableEntry(const c10::Dispat
 }

 bool OperatorEntry::hasKernelForDispatchKeySet(DispatchKeySet ks) const {
-  for (auto k : ks) {
-    if (kernels_.find(k) != kernels_.end()) {
-      return true;
-    }
+  TORCH_INTERNAL_ASSERT(kernels_.find(DispatchKey::Undefined) == kernels_.end());
+  for (auto& kv : kernels_) {
+    if (ks.has(kv.first)) return true;
   }
   return false;
 }
@@ -196,6 +195,9 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
   // In the past we directly call into backends(filled with catchAll) after BackendSelect.
   // Now that we first call Autograd backend keys after BackendSelect, we should fill those
   // with catchAll as well.
+  // The implementation of (2.1) & (2.3) relies on the invariant that for a given backend,
+  // `computeDispatchTableEntryWithDebug()` will be called for that backend's autograd key after the
+  // backend key. See Note [Refresh Runtime Autograd entries in dispatchTable_]
   // (3) Use fallthrough kernel that are registered as fallback.
   // (4) Use catchAll kernel if available
   // Alias Key Precedence:
@@ -272,7 +274,8 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp
   for (auto k : c10::getRuntimeDispatchKeySet(dispatch_key)) {
     updateDispatchTableEntry_(dispatcher, k);
   }
-  // Registering to backend key might affect computed entry at its Autograd backend key due to 2.2.
+  // Note [Refresh Runtime Autograd entries in dispatchTable_]
+  // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3).
   DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key);
   updateDispatchTableEntry_(dispatcher, autograd_key);
 }
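The hasKernelForDispatchKeySet change above reverses the direction of the membership test: instead of iterating the query key set and probing kernels_, it walks the (typically small) table of registered kernels and asks whether each registered key is contained in the query set, after asserting that Undefined never appears as a registered key. A rough Python sketch of the same logic, using plain sets to stand in for DispatchKeySet and kernels_ (illustrative names only, not the actual C++ API):

```python
# Conceptual sketch only: plain Python sets stand in for DispatchKeySet and
# for OperatorEntry::kernels_; this is not the actual dispatcher code.

def has_kernel_for_dispatch_key_set(registered_kernels, query_keys):
    # Mirrors the TORCH_INTERNAL_ASSERT: Undefined must never be registered.
    assert "Undefined" not in registered_kernels
    # New direction: iterate the registered kernels and test membership in
    # the query set, rather than iterating the query set and probing the map.
    return any(key in query_keys for key in registered_kernels)

# An op with CPU and AutogradCPU kernels, queried with two different key sets:
registered = {"CPU", "AutogradCPU"}
print(has_kernel_for_dispatch_key_set(registered, {"CUDA", "AutogradCUDA"}))  # False
print(has_kernel_for_dispatch_key_set(registered, {"CPU", "BackendSelect"}))  # True
```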

aten/src/ATen/core/interned_strings.h

Lines changed: 2 additions & 0 deletions
@@ -59,6 +59,8 @@ namespace c10 {
   _(prim, Store) \
   _(prim, AutogradZero) \
   _(prim, AutogradAnyNonZero) \
+  _(prim, AutogradAllNonZero) \
+  _(prim, AutogradAllZero) \
   _(prim, Starred) \
   _(prim, TupleConstruct) \
   _(prim, TupleUnpack) \

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 0 additions & 40 deletions
@@ -498,46 +498,6 @@ void gemv<at::BFloat16>(CUDABLAS_GEMV_ARGTYPES(at::BFloat16)) {
 }
 #endif

-namespace {
-template<typename scalar_t>
-cublasStatus_t cublasGer(const cublasHandle_t &handle, int64_t m, int64_t n, scalar_t *alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda) {
-  TORCH_CHECK(false, "cublas ger is defined only for float and double");
-  return {};
-}
-template<>
-cublasStatus_t cublasGer<float>(const cublasHandle_t &handle, int64_t m, int64_t n, float *alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda) {
-  return cublasSger(handle, m, n, alpha, x, incx, y, incy, a, lda);
-}
-template<>
-cublasStatus_t cublasGer<double>(const cublasHandle_t &handle, int64_t m, int64_t n, double *alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda) {
-  return cublasDger(handle, m, n, alpha, x, incx, y, incy, a, lda);
-}
-} // anonymous namespace
-
-template<typename scalar_t>
-void ger(int64_t m, int64_t n, scalar_t alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda)
-{
-  _cublasAdjustLdLevel2(m, n, &lda);
-  TORCH_CHECK((m <= INT_MAX) &&
-              (n <= INT_MAX) &&
-              (lda <= INT_MAX) &&
-              (incx <= INT_MAX) &&
-              (incy <= INT_MAX),
-              "cublasSger/cublasDger only supports m, n, lda, incx, incy with "
-              "the bound [val] <= %d", INT_MAX);
-  int i_m = (int)m;
-  int i_n = (int)n;
-  int i_lda = (int)lda;
-  int i_incx = (int)incx;
-  int i_incy = (int)incy;
-
-  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
-  TORCH_CUDABLAS_CHECK(cublasGer<scalar_t>(
-    handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda));
-}
-template void ger<float>(int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda);
-template void ger<double>(int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda);
-
 /* LEVEL 1 BLAS FUNCTIONS */

 template <>

aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ namespace at { namespace cuda {
   _(nvrtcGetProgramLog) \
   _(nvrtcGetLoweredName) \
   _(cuModuleLoadData) \
+  _(cuModuleLoadDataEx) \
   _(cuModuleGetFunction) \
   _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \
   _(cuGetErrorString) \

aten/src/ATen/native/LinearAlgebra.cpp

Lines changed: 37 additions & 26 deletions
@@ -143,50 +143,61 @@ static void check_1d(const Tensor& t, const char* arg, const char* fn) {
 }

 Tensor addr(const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) {
-  check_1d(vec1, "vec1", "addr");
-  check_1d(vec2, "vec2", "addr");
-  Tensor b_self;
-  std::tie(b_self) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr");
-  return at::_addr(b_self, vec1, vec2, beta, alpha);
+  TORCH_WARN(
+    "torch.addr is deprecated and may be removed in a future PyTorch release. "
+    "This function can be implemented using torch.outer as "
+    "alpha * torch.outer(vec1, vec2) + beta * input when beta is not zero, "
+    "alpha * torch.outer(vec1, vec2) when beta is zero.");
+
+  Tensor outer_result = at::outer(vec1, vec2) * alpha;
+  if (beta.to<double>() == 0.0) {
+    return outer_result;
+  }
+  return outer_result + (self * beta);
 }

 Tensor& addr_(Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) {
-  check_1d(vec1, "vec1", "addr");
-  check_1d(vec2, "vec2", "addr");
-  return at::_addr_(self, vec1, vec2, beta, alpha);
+  return at::addr_out(self, self, vec1, vec2, beta, alpha);
 }

 Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) {
-  check_1d(vec1, "vec1", "addr");
-  check_1d(vec2, "vec2", "addr");
-  Tensor b_self;
-  std::tie(b_self) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr_out");
-  return at::_addr_out(result, b_self, vec1, vec2, beta, alpha);
+  auto addr_result = at::addr(self, vec1, vec2, beta, alpha);
+  // Validates safe casting
+  const auto result_dtype = addr_result.scalar_type();
+  TORCH_CHECK(canCast(result_dtype, result.scalar_type()),
+              "result type ", result_dtype,
+              " can't be cast to the desired output type ", result.scalar_type());
+
+  at::native::resize_output(result, addr_result.sizes().vec());
+  result.copy_(addr_result);
+  return result;
 }

+// torch.ger, alias for torch.outer
 Tensor& ger_out(Tensor &result, const Tensor& self, const Tensor& vec2) {
-  check_1d(self, "self", "ger");
-  check_1d(vec2, "vec2", "ger");
-  if (result.dim() != 2 || result.size(0) != self.size(0) || result.size(1) != vec2.size(0)) {
-    result.resize_({ self.size(0), vec2.size(0) });
-  }
-  // resize_ does the "broadcasting", don't need to broadcast again.
-  return at::_addr_out(result, result, self, vec2, Scalar(0), Scalar(1));
+  TORCH_WARN("torch.ger is deprecated and will be removed in a future PyTorch release. "
+             "Use torch.outer instead.");
+  return at::outer_out(result, self, vec2);
 }

 Tensor ger(const Tensor& self, const Tensor& vec2) {
-  Tensor result = at::empty({0}, self.options());
-  at::ger_out(result, self, vec2);
-  return result;
+  return self.outer(vec2);
 }

-// torch.outer, alias for torch.ger
 Tensor& outer_out(Tensor &result, const Tensor& self, const Tensor& vec2) {
-  return at::ger_out(result, self, vec2);
+  check_1d(self, "self", "outer");
+  check_1d(vec2, "vec2", "outer");
+
+  // torch.outer is implemented as a composite op using reshape and mul
+  at::mul_out(result, self.reshape({self.size(0), 1}), vec2);
+  return result;
 }

 Tensor outer(const Tensor& self, const Tensor& vec2) {
-  return self.ger(vec2);
+  check_1d(self, "self", "outer");
+  check_1d(vec2, "vec2", "outer");
+
+  return self.reshape({self.size(0), 1}) * vec2;
 }

 static void addmm_impl_cpu_(
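The rewritten addr above is a composite of torch.outer, and ger becomes a deprecated alias for outer. A small Python sketch of the relationships the new implementation is meant to preserve (tensor values here are illustrative, and torch.ger additionally emits a deprecation warning):

```python
import torch

vec1 = torch.tensor([1.0, 2.0, 3.0])
vec2 = torch.tensor([10.0, 20.0])
inp = torch.ones(3, 2)
alpha, beta = 2.0, 0.5

# addr now computes alpha * outer(vec1, vec2) + beta * input
# (the beta * input term is skipped entirely when beta is zero).
expected = alpha * torch.outer(vec1, vec2) + beta * inp
actual = torch.addr(inp, vec1, vec2, beta=beta, alpha=alpha)
print(torch.allclose(actual, expected))  # True

# outer itself reduces to a broadcasted multiply of a column against a row.
print(torch.equal(torch.outer(vec1, vec2), vec1.reshape(-1, 1) * vec2))  # True

# ger forwards to outer (and warns that it is deprecated).
print(torch.equal(torch.ger(vec1, vec2), torch.outer(vec1, vec2)))  # True
```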

aten/src/ATen/native/cuda/LinearAlgebra.cu

Lines changed: 0 additions & 114 deletions
@@ -178,120 +178,6 @@ Tensor& addmm__cuda(Tensor& self, const Tensor& mat1, const Tensor& mat2,
   return self;
 }

-template<typename scalar_t>
-void addr_impl_ger_cuda(Tensor &out, const Tensor &self,
-                        const Tensor& vec1, const Tensor& vec2,
-                        scalar_t alpha, scalar_t beta) {
-  static_assert(std::is_same<scalar_t, float>::value ||
-                std::is_same<scalar_t, double>::value,
-                "addr_impl_ger_cuda: only float and double are supported");
-  if (&out != &self) {
-    at::native::resize_as_(out, self);
-    at::native::copy_(out, self);
-  }
-  if (beta == 0.0) {
-    at::native::zero_(out);
-  }
-  if (beta != 1.0) {
-    at::native::mul_(out, beta);
-  }
-  if (out.stride(0) == 1) {
-    at::cuda::blas::ger<scalar_t>(
-      vec1.size(0), vec2.size(0), alpha,
-      vec1.data_ptr<scalar_t>(), vec1.stride(0),
-      vec2.data_ptr<scalar_t>(), vec2.stride(0),
-      out.data_ptr<scalar_t>(), out.stride(1)
-    );
-  } else if (out.stride(1) == 1) {
-    at::cuda::blas::ger<scalar_t>(
-      vec2.size(0), vec1.size(0), alpha,
-      vec2.data_ptr<scalar_t>(), vec2.stride(0),
-      vec1.data_ptr<scalar_t>(), vec1.stride(0),
-      out.data_ptr<scalar_t>(), out.stride(0)
-    );
-  } else {
-    Tensor cr = out.clone();
-    at::cuda::blas::ger<scalar_t>(
-      vec2.size(0), vec1.size(0), alpha,
-      vec2.data_ptr<scalar_t>(), vec2.stride(0),
-      vec1.data_ptr<scalar_t>(), vec1.stride(0),
-      out.data_ptr<scalar_t>(), out.stride(0)
-    );
-    out.set_(cr);
-  }
-}
-
-template<typename scalar_t>
-void addr_impl_cuda(Tensor &out, const Tensor &self,
-                    const Tensor& vec1, const Tensor& vec2,
-                    scalar_t alpha, scalar_t beta) {
-  // currently no Hger/SgerEx in Cublas.
-  Tensor vec2T = vec2.reshape({1, vec2.size(0)});
-  Tensor vec1M = vec1.reshape({vec1.size(0), 1});
-  addmm_out_cuda(out, self, vec1M, vec2T, beta, alpha);
-}
-template<>
-void addr_impl_cuda<float>(Tensor &out, const Tensor &self,
-                           const Tensor& vec1, const Tensor& vec2,
-                           float alpha, float beta) {
-  addr_impl_ger_cuda<float>(out, self, vec1, vec2, alpha, beta);
-}
-template<>
-void addr_impl_cuda<double>(Tensor &out, const Tensor &self,
-                            const Tensor& vec1, const Tensor& vec2,
-                            double alpha, double beta) {
-  addr_impl_ger_cuda<double>(out, self, vec1, vec2, alpha, beta);
-}
-
-Tensor& addr_out_cuda(Tensor &out, const Tensor& self,
-                      const Tensor& vec1, const Tensor& vec2,
-                      Scalar beta, Scalar alpha) {
-  TORCH_CHECK(vec1.dim() == 1 && vec2.dim() == 1,
-              "vec1 and vec2 should be 1-dimensional vectors. Got dimensions ",
-              vec1.dim(), " and ", vec2.dim());
-
-  Tensor self_;
-  if (&out != &self) {
-    std::tie(self_) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr");
-  } else {
-    self_ = self;
-  }
-
-  TORCH_CHECK(out.device() == self_.device() &&
-              out.device() == vec1.device() &&
-              out.device() == vec2.device(),
-              "Expected all tensors to be on the same device. Found: ",
-              out.device(), ", ", self_.device(), ", ",
-              vec1.device(), " and ", vec2.device());
-  TORCH_CHECK(self_.dim() == 2,
-              "2D tensor expected, got ", self_.dim(), "D tensor for input");
-  TORCH_CHECK(self_.size(0) == vec1.size(0) && self_.size(1) == vec2.size(0),
-              "size mismatch",
-              ", input: ", self_.sizes(),
-              ", v1: ", vec1.sizes(),
-              ", v2: ", vec2.sizes());
-  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, self_.scalar_type(), "addr_out_cuda", [&] {
-    addr_impl_cuda<scalar_t>(out, self_, vec1, vec2,
-                             alpha.to<scalar_t>(), beta.to<scalar_t>());
-  });
-  return out;
-}
-
-Tensor& addr__cuda(Tensor& self,
-                   const Tensor& vec1, const Tensor& vec2,
-                   Scalar beta, Scalar alpha) {
-  addr_out_cuda(self, self, vec1, vec2, beta, alpha);
-  return self;
-}
-
-Tensor addr_cuda(const Tensor& self,
-                 const Tensor& vec1, const Tensor& vec2,
-                 Scalar beta, Scalar alpha) {
-  Tensor out = at::empty({0}, self.options());
-  addr_out_cuda(out, self, vec1, vec2, beta, alpha);
-  return out;
-}
-
 Tensor& addbmm_out_cuda(Tensor& out, const Tensor& self,
                         const Tensor& batch1, const Tensor& batch2,
                         Scalar beta, Scalar alpha) {

0 commit comments