admin管理员组

文章数量:1406926

I want to implement CDP for a basic forward function (I will call the forward function too many times at the same time (also from a CUDA function) and because of that I want to use CDP)

Here's the code that I'm trying to run;

__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int outputidx = idx / outputsize;
    int inputidx = idx % outputsize;

    if (outputidx >= outputsize || inputidx >= inputsize) {
        return;
    }

    atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}

__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= size) {
        return;
    }

    result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}

__global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
    NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
    cudaDeviceSynchronize();
    NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}

I also tried to run the function from a device function like this but still gave me the same error;

__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int outputidx = idx / outputsize;
    int inputidx = idx % outputsize;

    if (outputidx >= outputsize || inputidx >= inputsize) {
        return;
    }

    atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}

__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= size) {
        return;
    }

    result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}

__device__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    
    NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
    NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}

__global__ void NNFeedForwardNormalWrapper(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    NNFeedForwardNormal(values, weigths, result, biases, inputsize, outputsize);
}

And also tried cudaLaunchKernel function and using __global__ instead of __device__ but they didn't work either. I'm using -rdc=true flag too and also my arch is sm_75 which should support CDP.

I want to implement CDP for a basic forward function (I will call the forward function too many times at the same time (also from a CUDA function) and because of that I want to use CDP)

Here's the code that I'm trying to run;

__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int outputidx = idx / outputsize;
    int inputidx = idx % outputsize;

    if (outputidx >= outputsize || inputidx >= inputsize) {
        return;
    }

    atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}

__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= size) {
        return;
    }

    result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}

__global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
    NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
    cudaDeviceSynchronize();
    NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}

I also tried to run the function from a device function like this but still gave me the same error;

__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int outputidx = idx / outputsize;
    int inputidx = idx % outputsize;

    if (outputidx >= outputsize || inputidx >= inputsize) {
        return;
    }

    atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}

__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= size) {
        return;
    }

    result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}

__device__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    
    NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
    NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}

__global__ void NNFeedForwardNormalWrapper(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    NNFeedForwardNormal(values, weigths, result, biases, inputsize, outputsize);
}

And also tried cudaLaunchKernel function and using __global__ instead of __device__ but they didn't work either. I'm using -rdc=true flag too and also my arch is sm_75 which should support CDP.

Share Improve this question edited Mar 6 at 14:14 John Kugelman 363k69 gold badges553 silver badges597 bronze badges asked Mar 6 at 9:02 ug0x01ug0x01 477 bronze badges 6
  • 1 "And also tried cudaLaunchKernel function and using global instead of device but they didn't work either." A device function isn't a kernel function and therefore can not be launched. So the error when using __global__ would be more interesting. – paleonix Commented Mar 6 at 12:02
  • 1 This does not look like a good use-case for CDP. CDP is useful when the amount of parallelism is data-dependent. Here you already know the amount of parallelism on the host and can just launch kernels from the host. – paleonix Commented Mar 6 at 13:46
  • Thank you for your reply @paleonix! I’m trying to build a NEAT algorithm that requires numerous "forward and get result" steps. Since the forward function needs to be called repeatedly, I believe I have to use CDP. The code works fine when I switch from device to global, but now I’m facing an issue with thread synchronization. The process relies on the first function completing before moving on, as it’s essential for obtaining accurate results with the activation function. Is there a way to ensure that a parallelly called function finishes before proceeding with the rest of the code? – ug0x01 Commented Mar 6 at 14:38
  • 1 Sounds like you should read up on CUDA streams. I still don't see how CDP is warranted here. Also keep in mind that legacy CDP1 where you can call cudaDeviceSynchronize() in device code will not run on recent GPUs. It was replaced with CDP2 which does not allow using results from child kernels in the parent kernel. – paleonix Commented Mar 6 at 14:41
  • "Since the forward function needs to be called repeatedly" Is the number of invocations/repetitions data-dependent? While that could be a reason to use CDP, alternatively one could use the fancy new conditional/while nodes for CUDA Graphs. – paleonix Commented Mar 6 at 15:02
 |  Show 1 more comment

1 Answer 1

Reset to default 1

The error is exactly what is says on the tin.
If you want to configure a call to a function with <<<x,y,z>>> parameters, it needs to be a __global__ function.

It is perfectly valid to call a __global__ function from another global function.

if you want your code to compile, recode it like so:

#define restrict __restrict__
__global__ void NNFeedForwardNormalMultiple(double* restrict values, double* restrict weights, double* result, int inputsize, int outputsize) {
    ...
}

__global__ void NNFeedForwardNormalActivate(double* restrict biases, double* result, int size) {
   ...
}

__global__ void NNFeedForwardNormal(double* restrict values, double* restrict weights, double* result, double* restrict biases, int inputsize, int outputsize) {
    ...
}

__global__ void NNFeedForwardNormalWrapper(double* restrict values, double* restrict weights, double* result, double* restrict biases, int inputsize, int outputsize) {
    const auto blocks = some calculation;
    const auto threads_per_block = some calculation;
    NNFeedForwardNormal<<<blocks, threads_per_block>>>(values, weights, result, biases, inputsize, outputsize);
}

If instead you want to limit the number of threads in a device function, you can do it like below. Note that you can only limit the number of blocks/threads, you cannot expand it.
As long as you make sure to keep the bounds at warp edges, this will incur no slowdown.

__device__ void first_ten_blocks() { ... }

__device__ void other_blocks() { ... }

__global__ void start() {
    if (blockIdx.x < 10) { first_ten_blocks(); }   
    else { other_blocks(); } 
}

__device__ void first_warp() { ... }
__device__ void other_warps() { ... }
__global__ void warp_split() {
    if (threadIdx.x < 32) { first_warp(); }
    else { other_warps(); }
}

int main() {
    start<<<100,32>>>(); //10 block first_ten_block, 90 blocks other blocks
    start<<<9,32>>>(); //9 blocks first_ten_blocks, no other blocks

    warp_split<<<1, 64>>>(); 1 warp in first_warp, 1 in other_warps
    warp_split<<<10, 32>>>(); 10x warp in first_warp, no other warps
    warp_split<<<48, 512>>>(); 48x first_warp, 48x other warps with 512-32 = 480 threads each.
}

本文标签: cError a device function call cannot be configuredStack Overflow