forked from Tensor-Array/Tensor-Array
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtensor_cast.cu
More file actions
145 lines (131 loc) · 4.3 KB
/
tensor_cast.cu
File metadata and controls
145 lines (131 loc) · 4.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/*
Copyright 2024 TensorArray-Creators
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda_fp8.h>
#include <cuda_fp16.h>
#include <cuda_bf16.h>
#include <cstdio>
#ifndef TENSOR_CONTENT
#define TENSOR_CONTENT
#include "tensor.hh"
#undef TENSOR_CONTENT
#endif // !TENSOR_CONTENT
#define LOOP(seq) END(A seq)
#define BODY(x) ADD_CODE(x)
#define A(x) BODY(x) B
#define B(x) BODY(x) A
#define A_END
#define B_END
#define END(...) END_(__VA_ARGS__)
#define END_(...) __VA_ARGS__##_END
#define USING_DATA_TYPE_NVIDIA_FLOAT_8() (__nv_fp8_e4m3)(__nv_fp8_e5m2)
#define USING_DATA_TYPE_NVIDIA_FLOAT() (__half)(__nv_bfloat16)
#define USING_DATA_TYPE_FLOAT() (float)(double)
#define USING_DATA_TYPE_SINT() (int16_t)(int32_t)(int64_t)
#define USING_DATA_TYPE_UINT() (uint16_t)(uint32_t)(uint64_t)
#if CUDART_VERSION >= 12020
#define USING_DATA_TYPE_CAST_FROM() \
USING_DATA_TYPE_SINT() \
USING_DATA_TYPE_UINT() \
USING_DATA_TYPE_FLOAT() \
USING_DATA_TYPE_NVIDIA_FLOAT()
#else
#define USING_DATA_TYPE_CAST_FROM() \
USING_DATA_TYPE_SINT() \
USING_DATA_TYPE_UINT() \
USING_DATA_TYPE_FLOAT()
#endif
#if CUDART_VERSION >= 12020
#define USING_DATA_TYPE_CAST_TO() \
(bool) \
(int8_t) \
(uint8_t) \
USING_DATA_TYPE_NVIDIA_FLOAT_8() \
USING_DATA_TYPE_CAST_FROM()
#else
#define USING_DATA_TYPE_CAST_TO() \
(bool) \
(int8_t) \
(uint8_t) \
USING_DATA_TYPE_CAST_FROM()
#endif
namespace tensor_array
{
namespace value
{
template <typename T_O, typename T_I>
__global__ void type_casting(T_O* output, const T_I* input, unsigned int c_size)
{
unsigned int thread_x = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_x < c_size)
output[thread_x] = static_cast<T_O>(input[thread_x]);
}
Tensor derive_reshape_cast(const Tensor& dat, const Tensor& new_shape, bool, const DataBuffer&);
template <typename T>
Tensor Tensor::cast(bool is_derive) const
{
std::vector<std::pair<Tensor, Derivation>> temp;
if (is_derive)
temp.push_back(std::make_pair(*this, Derivation(*this, derive_reshape_cast)));
cudaError cuda_status;
T* out_ptr;
devices::Device this_cuda{ devices::CUDA };
cuda_status = cudaGetDevice(&this_cuda.index);
cudaDeviceProp cu_dev_prop;
cuda_status = cudaGetDeviceProperties(&cu_dev_prop, this_cuda.index);
TensorBase base_of_this = this->get_buffer().change_device(this_cuda);
std::size_t total_size = base_of_this.data_size() / get_sizeof_type(base_of_this.type());
cuda_status = cudaMalloc(&out_ptr, total_size * sizeof(T));
dim3 block_dim(cu_dev_prop.maxThreadsDim[0]);
dim3 grid_dim(total_size / block_dim.x + (total_size % block_dim.x ? 1U : 0U));
#define ADD_CODE(TYPE) \
if(this->get_buffer().type() == typeid(TYPE)) \
type_casting<<<grid_dim, block_dim>>>(out_ptr, static_cast<const TYPE*>(base_of_this.data()), total_size);
LOOP(USING_DATA_TYPE_CAST_FROM());
#undef ADD_CODE
cuda_status = cudaDeviceSynchronize();
cuda_status = cudaGetLastError();
if (cuda_status != cudaSuccess)
{
std::printf("CUDA error: %s\n", cudaGetErrorString(cuda_status));
}
std::type_index test = typeid(T);
if (dynamic_type_size.find(test) == dynamic_type_size.end())
dynamic_type_size.insert(std::make_pair(test, sizeof(T)));
TensorBase other_buf(typeid(T), base_of_this.shape(), out_ptr, this_cuda);
cuda_status = cudaFree(out_ptr);
return Tensor(std::move(other_buf), std::move(temp));
}
Tensor Tensor::tensor_cast(const std::type_info& dtype, bool is_derive) const
{
if (this->get_buffer().type() == dtype)
return *this;
#define ADD_CODE(TYPE) \
if(dtype == typeid(TYPE)) \
return this->cast<TYPE>(is_derive);
LOOP(USING_DATA_TYPE_CAST_TO());
#undef ADD_CODE
throw std::exception();
}
}
}
#undef LOOP
#undef BODY
#undef A
#undef B
#undef A_END
#undef B_END
#undef END
#undef END_
#undef USING_DATA_TYPE