Is there an officially sanctioned way to reuse shared data between global functions?
Consider the following code (https://cuda.godbolt.org/z/KMj9EKKbf):
#include <cuda.h>
#include <stdio.h>
__global__ void kernel_A(int** s) {
    __shared__ int sdata;
    sdata = clock();
    *s = &sdata;
    printf("sdata: %p = %i\n", &sdata, sdata);
}

// Somewhat surprisingly, inserting a device function with __shared__ data
// does not move kernel_B's shared data.
// Maybe this is just luck?
__device__ int& make_shared() {
    __shared__ int bdata;
    bdata = 1;
    auto& result = bdata;
    return result;
}

__global__ void kernel_B(int** ptr_sdata) {
    auto& bdata = make_shared();
    printf("bdata: %p = %i\n", &bdata, bdata);
    __shared__ int sdata;
    printf("A.sdata: %p = %i, B.sdata: %p = %i\n", *ptr_sdata, *ptr_sdata[0], &sdata, sdata);
}

int main() {
    int** ptr_sdata;
    cudaMalloc(&ptr_sdata, sizeof(int*));
    kernel_A<<<1, 1>>>(ptr_sdata);
    kernel_B<<<1, 1>>>(ptr_sdata);
    cudaDeviceSynchronize();
}
This prints:
sdata: 0x7d52c5000000 = -594894858
bdata: 0x7d52c5000004 = 1
A.sdata: 0x7d52c5000000 = -594894858, B.sdata: 0x7d52c5000000 = -594894858
I understand that kernel_B might not get assigned to the same SM as kernel_A, but let's assume that I spawn as many kernels as there are SMs, so that every SM runs a single block.
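For concreteness, this is roughly the launch setup I have in mind; the kernel name and block size are just placeholders, and I realize the scheduler doesn't strictly guarantee a one-block-per-SM mapping even with this grid size:
#include <cuda_runtime.h>

__global__ void per_sm_kernel() {
    // placeholder body; each block would hold the per-SM shared state
}

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // launch exactly as many blocks as there are SMs
    per_sm_kernel<<<prop.multiProcessorCount, 32>>>();
    cudaDeviceSynchronize();
}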
Is there a way to reuse the shared memory contents from kernel_A in kernel_B reliably?
The launch overhead between __global__ functions is about 25ish microseconds (if I remember correctly), so I guess that leaves plenty of time to shuttle the shared buffers to global memory in A and back again in B; but still, I'm curious.
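For completeness, this is the global-memory staging fallback I mean, as a rough sketch (the kernel names and the single-int staging buffer are just for illustration):
#include <cstdio>
#include <cuda_runtime.h>

// kernel_A_staged spills its per-block shared value to a global staging array...
__global__ void kernel_A_staged(int* staging) {
    __shared__ int sdata;
    if (threadIdx.x == 0) {
        sdata = clock();
        staging[blockIdx.x] = sdata;   // spill the shared value to global memory
    }
}

// ...and kernel_B_staged reloads it into its own shared memory.
__global__ void kernel_B_staged(const int* staging) {
    __shared__ int sdata;
    if (threadIdx.x == 0) sdata = staging[blockIdx.x];
    __syncthreads();                    // make the reloaded value visible to the whole block
    if (threadIdx.x == 0) printf("restored sdata: %i\n", sdata);
}

int main() {
    int* staging;
    cudaMalloc(&staging, sizeof(int));  // one int per block; a single block in this sketch
    kernel_A_staged<<<1, 32>>>(staging);
    kernel_B_staged<<<1, 32>>>(staging);
    cudaDeviceSynchronize();
    cudaFree(staging);
}
That round trip through global memory is exactly the cost I was hoping to avoid by reusing shared memory directly.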