Question

CUDA dynamic parallelism -- Is there a way to infinitely nest kernel launches?

Now, I'm using CUDA dynamic parallelism to create the kernel in a kernel function.
According to the CUDA documentation, kernels can only be launched to a fixed recursion depth because of resource constraints.
But in my project, I want to launch a child kernel without the parent kernel having to wait for the child to exit. In other words, they are completely independent.
So is there a way to launch kernels from within kernel functions without being limited by the recursion depth?

I used cudaDeviceSetLimit() to set cudaLimitDevRuntimeSyncDepth, but there are still restrictions.

Example:

// Device-side worker: executes the given task, then uses CUDA dynamic
// parallelism to launch itself for the next task popped from the queue.
// NOTE(review): `stream` is not declared in this snippet — presumably a
// device-created stream or a named stream defined elsewhere; confirm in
// the full source. `MyQueue`/`Task` are project types not shown here.
// Device-side launches require compiling with -rdc=true and linking
// against the device runtime (cudadevrt).
__global__ void do_something(MyQueue* queue, Task* task) {
  // do something ...
  task->execute();

  // If queue is not empty, pop from it and launch a kernel to execute it
  Task* t = queue->pop();
  if (t) {
    do_something<<<t->gridSize, t->blockSize, t->mem, stream>>>(queue, t);
  }
}
  • GPU: NVIDIA A100
  • CUDA version: 12.4
  • compile flag: -rdc=true -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_61,code=compute_61
 3  54  3
1 Jan 1970

Solution

 3

According to my testing with CDP2, it certainly seems possible to launch a kernel that will call sub-kernels in a "long" sequence:

# cat t225.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>

// Counter element type used by both the kernel and main.
using mt = int;
// Default number of self-relaunch generations when no CLI argument is given.
const mt my_stopc = 32768;
// Self-relaunching CDP2 kernel: increments *d once per generation and
// tail-launches the next generation until *d reaches my_stop.
// Launch config: <<<1,1>>> — each generation is a single thread.
// Requires -rdc=true and the device runtime (cudadevrt); cudaStreamTailLaunch
// is a CDP2 feature (CUDA 12.x; compiled here with -arch=sm_89).
__global__ void k(mt *d, const mt my_stop){

  mt c = *d;
  if (c < my_stop){
    c++;
    *d = c;
    // Tail launch: the child begins only after the parent exits, so there is
    // no parent-waits-for-child nesting and no sync-depth limit to hit.
    k<<<1,1,0, cudaStreamTailLaunch>>>(d, my_stop);
  }
  // Surface device-side launch errors (e.g. pending-launch-count exhaustion).
  cudaError_t err = cudaGetLastError();
  // %llu matches the unsigned long long cast (was %lu — a format/argument
  // mismatch, undefined behavior where long is 32-bit).
  if (err != cudaSuccess) printf("device: %llu, %s\n", (unsigned long long)c, cudaGetErrorString(err));
}

// Host driver: parses an optional iteration count from argv[1], runs the
// self-relaunching kernel chain, and reports how many generations executed.
int main(int argc, char *argv[]){

  mt my_stop = my_stopc;
  if (argc > 1) my_stop = atol(argv[1]);  // truncates to mt's width if narrower than long
  mt *d;
  // Check the allocation — an unnoticed failure here would make every later
  // call fail mysteriously with a sticky error.
  cudaError_t err = cudaMallocManaged(&d, sizeof(d[0]));
  if (err != cudaSuccess) { std::cout << "alloc: " << cudaGetErrorString(err) << std::endl; return 1; }
  *d = 0;
  k<<<1,1>>>(d, my_stop);
  err = cudaGetLastError();          // launch-configuration errors
  if (err != cudaSuccess) std::cout << "host 1: " << cudaGetErrorString(err) << std::endl;
  err = cudaDeviceSynchronize();     // execution errors from the whole launch chain
  if (err != cudaSuccess) std::cout << "host 2: " << cudaGetErrorString(err) << std::endl;
  std::cout << "iter: " << *d << std::endl;   // valid after the sync above
  cudaFree(d);                       // release the managed allocation (was leaked)
}

# nvcc -o t225 t225.cu -arch=sm_89 -rdc=true -lcudadevrt -lineinfo
# ./t225
iter: 32768
# ./t225 70000
iter: 70000
#

CUDA 12.2, L4 GPU

If I increase the count to 1,000,000, then the kernel launch chain completes in about 12 seconds:

# time ./t225 1000000
iter: 1000000

real    0m12.306s
user    0m10.155s
sys     0m2.080s
#
2024-07-14
Robert Crovella