3

I want to load 4 floats per thread. I know they are not 16 byte aligned. What is the best way to do this?

Specific conditions:

  • I cannot align the array without replicating the data because other accesses are properly aligned with the current placement
  • The data is likely L1 hot. In fact, I cannot replicate because I want to keep the L1 hit rate high
  • Access between threads is not coalesced (it's a lookup table)
  • If I unroll some loops, I know how the vectors are misaligned, meaning off by 1-3 floats from the alignment point
  • I can add padding. Therefore I can safely access the two properly aligned float4 vectors that contain the misaligned vector

If I write naive code as in the following kernel, it will generate four memory accesses per vector.

// Viewing a raw float array through this struct reads 4 consecutive floats
// with plain scalar loads: because only 4-byte alignment is provable, the
// compiler cannot fuse the four members into a single 16-byte vector load.
struct MisalignedFloat4
{
  float x, y, z, w;
  // Widen to the builtin float4 so callers can keep using vector code.
  __device__ operator float4() const
  { return {x, y, z, w}; };
};

/*
 * Baseline kernel: element-wise sum of three arrays of 4-float groups.
 * Each group is read through MisalignedFloat4, which lowers to four
 * scalar loads per vector since no 16-byte alignment can be assumed.
 * One thread handles one output float4; out must hold n float4 values.
 */
__global__ void add_version0(float4* out, int n,
        const float* left, const float* middle, const float* right)
{
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;
  const float4 a = reinterpret_cast<const MisalignedFloat4*>(left)[idx];
  const float4 b = reinterpret_cast<const MisalignedFloat4*>(middle)[idx];
  const float4 c = reinterpret_cast<const MisalignedFloat4*>(right)[idx];
  out[idx] = make_float4(a.x + b.x + c.x, a.y + b.y + c.y,
                         a.z + b.z + c.z, a.w + b.w + c.w);
}

If I add the knowledge about the specific misalignment, I can use one or two float2 loads.

#include <cassert>
#include <cstdint>
// using std::uintptr_t

/*
 * Byte offset of ptr from the preceding sizeof(T)-aligned address;
 * returns 0 when ptr is properly aligned for T.
 * Marked __host__ __device__ (was __device__-only) so the same helper
 * can also validate the offsets constructed on the host in main().
 */
template<class T>
__host__ __device__ std::size_t misalignment(const void* ptr)
{ return reinterpret_cast<std::uintptr_t>(ptr) % sizeof(T); }

/*
 * Load 4 consecutive floats starting at addr, where addr is known at
 * compile time to sit `misalign` floats (4*misalign bytes) past a
 * 16-byte boundary.  Uses the widest naturally aligned loads possible
 * without touching any memory outside [addr, addr+4).
 */
template<int misalign>
__device__ float4 loaduf4_version1(const float* addr)
{
  float2 t1, t2;
  switch(misalign) {
  case 1:
  case 3:
    // addr+1 is at least 8-byte aligned in both cases, so the middle
    // pair is one float2 load; first and last element are scalar loads.
    t1 = *reinterpret_cast<const float2*>(addr + 1);
    return { addr[0], t1.x, t1.y, addr[3] };
  case 2:
    // addr itself is 8-byte aligned: two float2 loads cover the vector.
    t1 = *reinterpret_cast<const float2*>(addr);
    t2 = *reinterpret_cast<const float2*>(addr + 2);
    return { t1.x, t1.y, t2.x, t2.y };
  default:
    // misalign == 0: a single aligned float4 load.
    return *reinterpret_cast<const float4*>(addr);
  }
}

/*
 * Same computation as add_version0, but using loaduf4_version1 with
 * compile-time misalignment knowledge (1, 2 and 3 floats respectively)
 * so each vector is fetched with at most one float2 plus scalar loads.
 */
__global__ void add_version1(float4* out, int n,
        const float* left, const float* middle, const float* right)
{
  // The misalignments are fixed by construction in main(); verify them.
  assert(misalignment<float4>(left) == 1 * sizeof(float));
  assert(misalignment<float4>(middle) == 2 * sizeof(float));
  assert(misalignment<float4>(right) == 3 * sizeof(float));
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;
  const int base = 4 * idx;
  const float4 a = loaduf4_version1<1>(left + base);
  const float4 b = loaduf4_version1<2>(middle + base);
  const float4 c = loaduf4_version1<3>(right + base);
  out[idx] = make_float4(a.x + b.x + c.x, a.y + b.y + c.y,
                         a.z + b.z + c.z, a.w + b.w + c.w);
}

And if I permit access to the surrounding memory, I can always use two loads per vector.

/*
 * Load 4 consecutive floats starting at addr, where addr is known at
 * compile time to sit `misalign` floats past a 16-byte boundary.
 * Needs at most two loads, but for misalign 1 and 3 it reads one float
 * OUTSIDE [addr, addr+4) — the caller must guarantee the surrounding
 * (padding) memory is accessible.
 */
template<int misalign>
__device__ float4 loaduf4_version2(const float* addr)
{
  float4 f1;
  float2 h1, h2;
  switch(misalign) {
  case 1:
    // addr-1 is 16-byte aligned: one float4 covering addr[-1..2],
    // plus a scalar load for the last element.
    f1 = *reinterpret_cast<const float4*>(addr - misalign);
    return { f1.y, f1.z, f1.w, addr[3] };
  case 2:
    // addr is 8-byte aligned: two float2 loads, no out-of-range access.
    h1 = *reinterpret_cast<const float2*>(addr);
    h2 = *reinterpret_cast<const float2*>(addr + 2);
    return { h1.x, h1.y, h2.x, h2.y };
  case 3:
    // addr+1 is 16-byte aligned: one float4 covering addr[1..4],
    // plus a scalar load for the first element.
    f1 = *reinterpret_cast<const float4*>(addr + 4 - misalign);
    return { addr[0], f1.x, f1.y, f1.z };
  default:
    // misalign == 0: single aligned float4 load.
    return *reinterpret_cast<const float4*>(addr);
  }
}

/*
 * Same computation as add_version0, but using loaduf4_version2, which
 * exploits padding around the buffers to fetch each misaligned vector
 * with at most two loads.
 */
__global__ void add_version2(float4* out, int n,
        const float* left, const float* middle, const float* right)
{
  // The misalignments are fixed by construction in main(); verify them.
  assert(misalignment<float4>(left) == 1 * sizeof(float));
  assert(misalignment<float4>(middle) == 2 * sizeof(float));
  assert(misalignment<float4>(right) == 3 * sizeof(float));
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= n)
    return;
  const int base = 4 * idx;
  const float4 a = loaduf4_version2<1>(left + base);
  const float4 b = loaduf4_version2<2>(middle + base);
  const float4 c = loaduf4_version2<3>(right + base);
  out[idx] = make_float4(a.x + b.x + c.x, a.y + b.y + c.y,
                         a.z + b.z + c.z, a.w + b.w + c.w);
}

Is there a more efficient way to achieve the same?

If not, is there a version that is less verbose or works without compile time knowledge of the misalignment?

Here is the rest of the minimally reproducible example:

#include <cstdio>
// using std::printf
#include <cuda_runtime.h>

/*
 * Minimal reproducible example: build three deliberately misaligned
 * device arrays, run the naive kernel as reference, then the two
 * optimized kernels, and compare results element-by-element (exact
 * comparison is valid: all inputs are small integers and the summation
 * order is identical in every kernel).
 * Fix over the original: kernel launch errors are now caught right at
 * the launch via cudaGetLastError() instead of surfacing later as a
 * confusing cudaMemcpy failure.
 */
int main()
{
  const int n = 1000, padding = 1;
  // n * float4 + 4 floats in front and back, so the "two aligned loads"
  // trick in add_version2 never reads outside the allocation
  const int allocsize = 4 * (n + 2 * padding);
  /*
   * Allocate buffers.
   * For brevity I'm not bothering with any deallocations
   */
  float* hostbuf, *hostreference;
  for(float** ptr: { &hostbuf, &hostreference })
    if(cudaHostAlloc(ptr, n * sizeof(float4), cudaHostAllocDefault))
      return 1;
  for(int i = 0; i < 4 * n; ++i)
    hostbuf[i] = i + 1;
  float* left, *middle, *right, *out;
  for(float** ptr: { &left, &middle, &right, &out })
    if(cudaMalloc(ptr, allocsize * sizeof(float)))
      return 2;
  /*
   * cause misalignment: 1, 2 and 3 floats past a 16-byte boundary,
   * matching the template arguments used in the kernels. Copy to device
   */
  left += 5;
  middle += 6;
  right += 7;
  for(float* ptr: { left, middle, right })
    if(cudaMemcpy(ptr, hostbuf, n * sizeof(float4), cudaMemcpyDefault))
      return 3;
  /*
   * Run the naive version. Store in hostreference for validation
   */
  const dim3 threads = { 128, 1, 1 };
  const dim3 blocks = { (n + threads.x - 1) / threads.x, 1, 1 };
  float4* f4out = reinterpret_cast<float4*>(out);
  add_version0<<<blocks, threads>>>(f4out, n, left, middle, right);
  if(cudaGetLastError()) // bad launch configuration etc.
    return 4;
  // blocking memcpy on the default stream also synchronizes the kernel
  if(cudaMemcpy(hostreference, out, n * sizeof(float4), cudaMemcpyDefault))
    return 4;
  /*
   * Run optimized versions. Store in hostbuf for comparison
   */
  add_version1<<<blocks, threads>>>(f4out, n, left, middle, right);
  if(cudaGetLastError())
    return 5;
  if(cudaMemcpy(hostbuf, out, n * sizeof(float4), cudaMemcpyDefault))
    return 5;
  for(int i = 0; i < 4 * n; ++i)
    if(hostbuf[i] != hostreference[i])
      std::printf("Mismatch version 1 idx %d %f != %f\n",
                  i, hostbuf[i], hostreference[i]);
  add_version2<<<blocks, threads>>>(f4out, n, left, middle, right);
  if(cudaGetLastError())
    return 6;
  if(cudaMemcpy(hostbuf, out, n * sizeof(float4), cudaMemcpyDefault))
    return 6;
  for(int i = 0; i < 4 * n; ++i)
    if(hostbuf[i] != hostreference[i])
      std::printf("Mismatch version 2 idx %d %f != %f\n",
                  i, hostbuf[i], hostreference[i]);
  return 0;
}
3
  • Why can't you add just a little padding? Commented Aug 24 at 20:01
  • @Johan I can add padding. It's the last bullet point. Or did you mean why I added more than absolutely necessary in the MRE? That's just for simplicity / laziness. Commented Aug 25 at 6:18
  • 2
    this may possibly be of interest Commented Aug 27 at 19:24

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.