I want to load 4 floats per thread. I know they are not 16-byte aligned. What is the best way to do this?
Specific conditions:
- I cannot align the array without replicating the data because other accesses are properly aligned with the current placement
- The data is likely L1-hot. In fact, replicating it is not an option precisely because I want to keep the L1 hit rate high
- Access between threads is not coalesced (it's a lookup table)
- If I unroll some loops, I know at compile time how each vector is misaligned, meaning off by 1-3 floats from the 16-byte boundary
- I can add padding. Therefore I can safely access the two properly aligned float4 vectors that together contain the misaligned vector (see the sketch after this list)
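To illustrate the last point, here is the layout for a misalignment of one float (a, b, c, d stand for padding or neighboring data):
/*
 * 16-byte boundary:   |                |                |
 * floats in memory:   |  a  x  y  z    |  w  b  c  d    |
 *                          ^ the misaligned vector {x, y, z, w}
 *
 * The two aligned float4s {a, x, y, z} and {w, b, c, d} cover the
 * misaligned vector, so two aligned 16-byte loads always suffice.
 */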
If I write naive code as in the following kernel, the compiler generates four separate 32-bit memory accesses per vector.
// A float4 with only 4-byte alignment; the conversion loads element-wise
struct MisalignedFloat4
{
    float x, y, z, w;
    __device__ operator float4() const
    { return {x, y, z, w}; }
};
__global__ void add_version0(float4* out, int n,
        const float* left, const float* middle, const float* right)
{
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    auto leftvec = reinterpret_cast<const MisalignedFloat4*>(left);
    auto middlevec = reinterpret_cast<const MisalignedFloat4*>(middle);
    auto rightvec = reinterpret_cast<const MisalignedFloat4*>(right);
    if (tid < n) {
        float4 l = leftvec[tid];
        float4 m = middlevec[tid];
        float4 r = rightvec[tid];
        out[tid] = float4 {
            l.x + m.x + r.x, l.y + m.y + r.y,
            l.z + m.z + r.z, l.w + m.w + r.w };
    }
}
If I add the knowledge about the specific misalignment, I can get away with one float2 load plus two scalar loads (misalignment 1 or 3), or two float2 loads (misalignment 2).
#include <cassert>
#include <cstddef>
// using std::size_t
#include <cstdint>
// using std::uintptr_t

template<class T>
__device__ std::size_t misalignment(const void* ptr)
{ return reinterpret_cast<std::uintptr_t>(ptr) % sizeof(T); }
template<int misalign>
__device__ float4 loaduf4_version1(const float* addr)
{
    float2 t1, t2;
    switch(misalign) {
    case 1:
    case 3:
        // addr + 1 is 8-byte aligned in both cases
        t1 = *reinterpret_cast<const float2*>(addr + 1);
        return { addr[0], t1.x, t1.y, addr[3] };
    case 2:
        // addr itself is 8-byte aligned
        t1 = *reinterpret_cast<const float2*>(addr);
        t2 = *reinterpret_cast<const float2*>(addr + 2);
        return { t1.x, t1.y, t2.x, t2.y };
    default:
        return *reinterpret_cast<const float4*>(addr);
    }
}
__global__ void add_version1(float4* out, int n,
        const float* left, const float* middle, const float* right)
{
    assert(misalignment<float4>(left) == 1 * sizeof(float));
    assert(misalignment<float4>(middle) == 2 * sizeof(float));
    assert(misalignment<float4>(right) == 3 * sizeof(float));
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        float4 l = loaduf4_version1<1>(left + 4 * tid);
        float4 m = loaduf4_version1<2>(middle + 4 * tid);
        float4 r = loaduf4_version1<3>(right + 4 * tid);
        out[tid] = float4 {
            l.x + m.x + r.x, l.y + m.y + r.y,
            l.z + m.z + r.z, l.w + m.w + r.w };
    }
}
And if I permit access to the surrounding (padding) memory, I can always get down to two loads per vector.
template<int misalign>
__device__ float4 loaduf4_version2(const float* addr)
{
    float4 f1;
    float2 h1, h2;
    switch(misalign) {
    case 1:
        // aligned float4 covering addr[-1..2], plus one scalar load
        f1 = *reinterpret_cast<const float4*>(addr - misalign);
        return { f1.y, f1.z, f1.w, addr[3] };
    case 2:
        h1 = *reinterpret_cast<const float2*>(addr);
        h2 = *reinterpret_cast<const float2*>(addr + 2);
        return { h1.x, h1.y, h2.x, h2.y };
    case 3:
        // aligned float4 covering addr[1..4], plus one scalar load
        f1 = *reinterpret_cast<const float4*>(addr + 4 - misalign);
        return { addr[0], f1.x, f1.y, f1.z };
    default:
        return *reinterpret_cast<const float4*>(addr);
    }
}
__global__ void add_version2(float4* out, int n,
        const float* left, const float* middle, const float* right)
{
    assert(misalignment<float4>(left) == 1 * sizeof(float));
    assert(misalignment<float4>(middle) == 2 * sizeof(float));
    assert(misalignment<float4>(right) == 3 * sizeof(float));
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        float4 l = loaduf4_version2<1>(left + 4 * tid);
        float4 m = loaduf4_version2<2>(middle + 4 * tid);
        float4 r = loaduf4_version2<3>(right + 4 * tid);
        out[tid] = float4 {
            l.x + m.x + r.x, l.y + m.y + r.y,
            l.z + m.z + r.z, l.w + m.w + r.w };
    }
}
Is there a more efficient way to achieve the same?
If not, is there a version that is less verbose or that works without compile-time knowledge of the misalignment?
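For reference, the only version without compile-time knowledge that I can come up with is the obvious runtime dispatch (a sketch reusing loaduf4_version2 from above; loaduf4_runtime is my name for it), which just moves the switch to run time:
__device__ float4 loaduf4_runtime(const float* addr)
{
    // Dispatch on the misalignment in floats (0-3). The switch no
    // longer folds away at compile time, and it diverges whenever
    // threads in a warp see different misalignments.
    switch(misalignment<float4>(addr) / sizeof(float)) {
    case 1: return loaduf4_version2<1>(addr);
    case 2: return loaduf4_version2<2>(addr);
    case 3: return loaduf4_version2<3>(addr);
    default: return loaduf4_version2<0>(addr);
    }
}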
Here is the rest of the minimal reproducible example:
#include <cstdio>
// using std::printf
#include <cuda_runtime.h>

int main()
{
    const int n = 1000, padding = 1;
    // n float4s plus one float4 (4 floats) of padding in front and back
    const int allocsize = 4 * (n + 2 * padding);
    /*
     * Allocate buffers.
     * For brevity I'm not bothering with any deallocations
     */
    float* hostbuf, *hostreference;
    for(float** ptr: { &hostbuf, &hostreference })
        if(cudaHostAlloc(ptr, n * sizeof(float4), cudaHostAllocDefault))
            return 1;
    for(int i = 0; i < 4 * n; ++i)
        hostbuf[i] = i + 1;
    float* left, *middle, *right, *out;
    for(float** ptr: { &left, &middle, &right, &out })
        if(cudaMalloc(ptr, allocsize * sizeof(float)))
            return 2;
    /*
     * Cause misalignments of 1, 2, and 3 floats relative to the
     * 16-byte boundaries (cudaMalloc returns aligned pointers).
     * Copy to device
     */
    left += 5;
    middle += 6;
    right += 7;
    for(float* ptr: { left, middle, right })
        if(cudaMemcpy(ptr, hostbuf, n * sizeof(float4), cudaMemcpyDefault))
            return 3;
    /*
     * Run the naive version. Store in hostreference for validation
     */
    const dim3 threads = { 128, 1, 1 };
    const dim3 blocks = { (n + threads.x - 1) / threads.x, 1, 1 };
    float4* f4out = reinterpret_cast<float4*>(out);
    add_version0<<<blocks, threads>>>(f4out, n, left, middle, right);
    if(cudaMemcpy(hostreference, out, n * sizeof(float4), cudaMemcpyDefault))
        return 4;
    /*
     * Run optimized versions. Store in hostbuf for comparison
     */
    add_version1<<<blocks, threads>>>(f4out, n, left, middle, right);
    if(cudaMemcpy(hostbuf, out, n * sizeof(float4), cudaMemcpyDefault))
        return 5;
    for(int i = 0; i < 4 * n; ++i)
        if(hostbuf[i] != hostreference[i])
            std::printf("Mismatch version 1 idx %d %f != %f\n",
                    i, hostbuf[i], hostreference[i]);
    add_version2<<<blocks, threads>>>(f4out, n, left, middle, right);
    if(cudaMemcpy(hostbuf, out, n * sizeof(float4), cudaMemcpyDefault))
        return 6;
    for(int i = 0; i < 4 * n; ++i)
        if(hostbuf[i] != hostreference[i])
            std::printf("Mismatch version 2 idx %d %f != %f\n",
                    i, hostbuf[i], hostreference[i]);
}
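For completeness, I compile and run this with something like (the file name is arbitrary):
nvcc -O2 -std=c++17 -o repro repro.cu
./repro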