Commit f164d70

Simplify copy kernel
Using the new type promotion and dynamic casting added to `TensorIterator`, the copy kernels can be greatly simplified.

**Script:**

```python
import torch
import timeit
import pandas
import itertools
from tqdm import tqdm
import math

print(torch.__version__)
print()

_10M = 10 * 1024 ** 2
d = {}
for from_, to in tqdm(itertools.product(torch.testing.get_all_dtypes(), repeat=2)):
    if from_ not in d:
        d[from_] = {}
    a = torch.zeros(_10M, dtype=from_)
    min_ = math.inf
    for i in range(100):
        start = timeit.default_timer()
        a.to(to)
        end = timeit.default_timer()
        elapsed = end - start
        if elapsed < min_:
            min_ = elapsed
    # record the best of 100 runs, in microseconds
    d[from_][to] = int(min_ * 1000 * 1000)
pandas.DataFrame(d)
```

**Before:**

![image](https://user-images.githubusercontent.com/1032377/67171274-2e93d000-f36b-11e9-8fa0-91edd7dbc8ec.png)

**After:**

![image](https://user-images.githubusercontent.com/1032377/67171200-d361dd80-f36a-11e9-9b22-66292e395a09.png)

ghstack-source-id: 1269ecc
Pull Request resolved: #28428
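The timing script above only measures speed. As a value-level complement, a minimal sanity-check sketch (not part of the original commit; the dtype list and device choice are arbitrary) could look like:

```python
# Illustrative sanity check: dtype conversions should still produce the
# expected values after the kernel change. Use a CUDA device, when
# available, to exercise the CUDA copy kernel touched by this commit.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
src = torch.arange(10, dtype=torch.int64, device=device)
for to in (torch.float32, torch.float16, torch.int32, torch.uint8):
    out = src.to(to)
    assert out.dtype == to
    # the values 0..9 are exactly representable in every dtype tested,
    # so they must survive the round trip back to int64
    assert out.to(torch.int64).eq(src).all()
```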
1 parent 11f4039 commit f164d70

aten/src/ATen/native/cuda/Copy.cu

Lines changed: 5 additions & 13 deletions
```diff
@@ -8,20 +8,12 @@
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <THC/THC.h>
-#include <c10/util/TypeCast.h>
 
 namespace at {
 namespace native {
 
 using namespace at::cuda;
 
-template <typename dst_t, typename src_t>
-void copy_kernel_impl(TensorIterator& iter) {
-  gpu_kernel(iter, []GPU_LAMBDA(src_t x) -> dst_t {
-    return c10::static_cast_with_inter_type<dst_t>(x);
-  });
-}
-
 // device-to-device copy, does type conversion
 static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
   int64_t numel = iter.numel();
@@ -66,11 +58,11 @@ static void copy_device_to_device(TensorIterator& iter, bool non_blocking) {
         cudaMemcpyDeviceToDevice,
         copy_stream));
   } else {
-    AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(0), "copy_", [&] {
-      using dst_t = scalar_t;
-      AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(1), "copy_", [&] {
-        copy_kernel_impl<dst_t, scalar_t>(iter);
-      });
+    // this is intentionally done after build because copy has a "promotion"
+    // rule that always "promotes" to the target dtype.
+    iter.promote_common_dtype();
+    AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, iter.dtype(0), "copy_", [&] {
+      gpu_kernel(iter, []GPU_LAMBDA(scalar_t x) { return x; });
     });
   }
 
```
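The comment added in the diff refers to copy's own "promotion" rule: unlike binary ops, which compute a common dtype from both inputs, a copy always casts the source values to the destination tensor's dtype, so only `iter.dtype(0)` needs to be dispatched. A small illustration in Python (not part of the commit):

```python
# Illustration of the copy "promotion" rule: copy_ always casts the
# source values to the destination tensor's dtype.
import torch

dst = torch.empty(4, dtype=torch.float16)
src = torch.arange(4, dtype=torch.int64)
dst.copy_(src)      # int64 values are cast into the float16 destination
print(dst)          # tensor([0., 1., 2., 3.], dtype=torch.float16)
print(dst.dtype)    # torch.float16 -- the destination dtype wins
```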
