c++ - Error: a __device__ function call cannot be configured - Stack Overflow-软件玩家

admin管理员组
文章数量:1406926

I want to implement CDP for a basic forward function (I will call the forward function too many times at the same time (also from a CUDA function) and because of that I want to use CDP)

Here's the code that I'm trying to run;

__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int outputidx = idx / outputsize;
    int inputidx = idx % outputsize;

    if (outputidx >= outputsize || inputidx >= inputsize) {
        return;
    }

    atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}

__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= size) {
        return;
    }

    result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}

__global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
    NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
    cudaDeviceSynchronize();
    NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}

I also tried to run the function from a device function like this but still gave me the same error;

__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int outputidx = idx / outputsize;
    int inputidx = idx % outputsize;

    if (outputidx >= outputsize || inputidx >= inputsize) {
        return;
    }

    atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}

__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= size) {
        return;
    }

    result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}

__device__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    
    NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
    NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}

__global__ void NNFeedForwardNormalWrapper(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    NNFeedForwardNormal(values, weigths, result, biases, inputsize, outputsize);
}

And also tried cudaLaunchKernel function and using __global__ instead of __device__ but they didn't work either. I'm using -rdc=true flag too and also my arch is sm_75 which should support CDP.

I want to implement CDP for a basic forward function (I will call the forward function too many times at the same time (also from a CUDA function) and because of that I want to use CDP)

Here's the code that I'm trying to run;

__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int outputidx = idx / outputsize;
    int inputidx = idx % outputsize;

    if (outputidx >= outputsize || inputidx >= inputsize) {
        return;
    }

    atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}

__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= size) {
        return;
    }

    result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}

__global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
    NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
    cudaDeviceSynchronize();
    NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}

I also tried to run the function from a device function like this but still gave me the same error;

__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int outputidx = idx / outputsize;
    int inputidx = idx % outputsize;

    if (outputidx >= outputsize || inputidx >= inputsize) {
        return;
    }

    atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}

__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx >= size) {
        return;
    }

    result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}

__device__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    
    NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
    NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}

__global__ void NNFeedForwardNormalWrapper(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
    NNFeedForwardNormal(values, weigths, result, biases, inputsize, outputsize);
}

Share Improve this question edited Mar 6 at 14:14 John Kugelman 363k69 gold badges553 silver badges597 bronze badges asked Mar 6 at 9:02 ug0x01 477 bronze badges

1 "And also tried cudaLaunchKernel function and using global instead of device but they didn't work either." A device function isn't a kernel function and therefore can not be launched. So the error when using __global__ would be more interesting. – paleonix Commented Mar 6 at 12:02
1 This does not look like a good use-case for CDP. CDP is useful when the amount of parallelism is data-dependent. Here you already know the amount of parallelism on the host and can just launch kernels from the host. – paleonix Commented Mar 6 at 13:46
Thank you for your reply @paleonix! I’m trying to build a NEAT algorithm that requires numerous "forward and get result" steps. Since the forward function needs to be called repeatedly, I believe I have to use CDP. The code works fine when I switch from device to global, but now I’m facing an issue with thread synchronization. The process relies on the first function completing before moving on, as it’s essential for obtaining accurate results with the activation function. Is there a way to ensure that a parallelly called function finishes before proceeding with the rest of the code? – ug0x01 Commented Mar 6 at 14:38
1 Sounds like you should read up on CUDA streams. I still don't see how CDP is warranted here. Also keep in mind that legacy CDP1 where you can call cudaDeviceSynchronize() in device code will not run on recent GPUs. It was replaced with CDP2 which does not allow using results from child kernels in the parent kernel. – paleonix Commented Mar 6 at 14:41
"Since the forward function needs to be called repeatedly" Is the number of invocations/repetitions data-dependent? While that could be a reason to use CDP, alternatively one could use the fancy new conditional/while nodes for CUDA Graphs. – paleonix Commented Mar 6 at 15:02

| Show 1 more comment

1 Answer 1

Sorted by: Reset to default 1

The error is exactly what is says on the tin.
If you want to configure a call to a function with <<<x,y,z>>> parameters, it needs to be a __global__ function.

It is perfectly valid to call a __global__ function from another global function.

if you want your code to compile, recode it like so:

#define restrict __restrict__
__global__ void NNFeedForwardNormalMultiple(double* restrict values, double* restrict weights, double* result, int inputsize, int outputsize) {
    ...
}

__global__ void NNFeedForwardNormalActivate(double* restrict biases, double* result, int size) {
   ...
}

__global__ void NNFeedForwardNormal(double* restrict values, double* restrict weights, double* result, double* restrict biases, int inputsize, int outputsize) {
    ...
}

__global__ void NNFeedForwardNormalWrapper(double* restrict values, double* restrict weights, double* result, double* restrict biases, int inputsize, int outputsize) {
    const auto blocks = some calculation;
    const auto threads_per_block = some calculation;
    NNFeedForwardNormal<<<blocks, threads_per_block>>>(values, weights, result, biases, inputsize, outputsize);
}

If instead you want to limit the number of threads in a device function, you can do it like below. Note that you can only limit the number of blocks/threads, you cannot expand it.
As long as you make sure to keep the bounds at warp edges, this will incur no slowdown.

__device__ void first_ten_blocks() { ... }

__device__ void other_blocks() { ... }

__global__ void start() {
    if (blockIdx.x < 10) { first_ten_blocks(); }   
    else { other_blocks(); } 
}

__device__ void first_warp() { ... }
__device__ void other_warps() { ... }
__global__ void warp_split() {
    if (threadIdx.x < 32) { first_warp(); }
    else { other_warps(); }
}

int main() {
    start<<<100,32>>>(); //10 block first_ten_block, 90 blocks other blocks
    start<<<9,32>>>(); //9 blocks first_ten_blocks, no other blocks

    warp_split<<<1, 64>>>(); 1 warp in first_warp, 1 in other_warps
    warp_split<<<10, 32>>>(); 10x warp in first_warp, no other warps
    warp_split<<<48, 512>>>(); 48x first_warp, 48x other warps with 512-32 = 480 threads each.
}

本文标签： cError a device function call cannot be configuredStack Overflow

版权声明：本文标题：c++ - Error: a __device__ function call cannot be configured - Stack Overflow 内容由网友自发贡献，该文观点仅代表作者本人，转载请联系作者并注明出处：http://www.betaflare.com/web/1744987732a2636192.html，本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容，一经查实，本站将立刻删除。

编程频道|软件玩家 - 软件改变生活！

c++ - Error: a device function call cannot be configured - Stack Overflow

1 Answer 1

更多相关文章