I have a struct in CUDA that contains other structs (substructs, and those substructs in turn contain substructs of their own) as well as pointers to dynamically allocated memory. I want to copy the entire struct, including its substructures, from device to host using cudaMemcpy. How can I do this correctly without ending up with shallow copies or memory issues?
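To make the question concrete, the single-level pattern I am trying to generalize is roughly the following (a simplified sketch with made-up names Inner/Outer/d_outer, assuming the nested array was allocated with a host-side cudaMalloc):

struct Inner { int x; };
struct Outer { Inner* items; int n; };

// d_outer is a device pointer that was already allocated and filled on the device;
// its items member holds a device address.
Outer h_outer;
cudaMemcpy(&h_outer, d_outer, sizeof(Outer), cudaMemcpyDeviceToHost);   // shallow copy: items still points to device memory
Inner* h_items = (Inner*)malloc(sizeof(Inner) * h_outer.n);
cudaMemcpy(h_items, h_outer.items, sizeof(Inner) * h_outer.n, cudaMemcpyDeviceToHost); // copy the pointed-to array
h_outer.items = h_items;                                                // repoint the host copy at host memory

My real structs, shown below, nest this one level deeper and also allocate from inside a kernel, which is where I am getting lost.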

I tried copying the whole main struct, but when I try to print substruct values I get nothing back. For example:

gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size,cudaMemcpyDeviceToHost));
h_population->Networks = h_networks;

std::cout << "Num Connections: " << h_population->Networks[0].Connections[13].weight << std::endl;

There isn't any output being printed.

I tried all the address-of combinations for the cudaMemcpy arguments ((&, ), (&, &), ( , &), ( , )), but none of them worked. The suggested solution link says "Doing so means you only have to copy the array to the device, not the structure" (which may not be efficient for complex structures), but of course the commenters didn't read that.
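For context, my reading of that flattened-array suggestion is roughly the following (my own sketch, not what my current code does; connections_per_net is a name I made up):

// One contiguous buffer for every network's connections instead of a pointer inside each Network.
int connections_per_net = input_num * output_num;
Connection* d_all_connections;
cudaMalloc(&d_all_connections, sizeof(Connection) * population_size * connections_per_net);
// Network i's connections would start at d_all_connections + i * connections_per_net,
// so only this single array has to be copied back, not the nested structure.

I would prefer to keep the nested layout if it can be made to work, which is what this question is about.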

Structs (plus the includes the snippets below need)

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>
#include <curand_kernel.h>

struct Connection {
    int innovationid;
    int from; 
    int to; 
    float weight;     
    bool enabled;   
};

struct Neuron {
    int type;
    float input_sum;  
    float bias;
    float output; 
};

struct Network {
    Connection* Connections;
    Neuron* Neurons;
    int num_connections;
    int num_neurons;
    float fitness;
};

struct Population {
    Network* Networks;
    int num_networks;
    int generation_id;
};

Kernel function

__global__ void CreateBasePopulation(Population* pop, int pop_num, int input_num, int output_num) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= pop_num) return;

    Network* net = &pop->Networks[idx];
    net->num_neurons = input_num + output_num;
    net->num_connections = input_num * output_num;
    net->fitness = 0.0f;

    curandState state;
    curand_init(clock64(), idx, 0, &state);

    // per-network buffers allocated from the device heap with device-side cudaMalloc
    cudaMalloc(&(net->Neurons), sizeof(Neuron) * net->num_neurons);
    cudaMalloc(&(net->Connections), sizeof(Connection) * net->num_connections);

    for (int i = 0; i < output_num; ++i) {
        net->Neurons[i].type = 2;
        net->Neurons[i].bias = ((2.0f * sqrtf((float)input_num) * curand_uniform(&state)) - sqrtf((float)input_num)) / output_num;
        net->Neurons[i].output = 0.0f;
        net->Neurons[i].input_sum = 0.0f;
    }

    for (int i = 0; i < input_num; ++i) {
        net->Neurons[i].type = 0;
        net->Neurons[i].bias = 0.0f;
        net->Neurons[i].output = 0.0f;
        net->Neurons[i].input_sum = 0.0f;

        for (int j = 0; j < output_num; ++j) {
            int offset = j + (output_num * i);
            net->Connections[offset].from = i;
            net->Connections[offset].to = j;
            net->Connections[offset].innovationid = offset;
            net->Connections[offset].enabled = true;
            net->Connections[offset].weight = (2.0f * curand_uniform(&state)) - 1.0f;
        }
    }
}
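(The device-side cudaMalloc calls in the kernel above are not checked. If it matters, a guard along these lines could be added; this is only a sketch, not something my current code does.)

// Sketch: skip the thread instead of writing through a failed device-side allocation.
if (cudaMalloc(&(net->Neurons), sizeof(Neuron) * net->num_neurons) != cudaSuccess) {
    printf("Neuron allocation failed for network %d\n", idx);
    return;
}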

Memory allocation and memcpy steps on the host

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) 
        exit(code);
   }
}

int main() {
    int population_size = 1024;
    int input_num = 390;
    int output_num = 3;

    size_t heap_size_needed = population_size * ((input_num + output_num) * sizeof(Neuron) +
                                          input_num * output_num * sizeof(Connection));
    size_t heap_size = heap_size_needed + heap_size_needed /5;
    gpuErrchk(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heap_size));

    Population* h_population = (Population*)malloc(sizeof(Population)); 
    if (!h_population) {
        std::cerr << "Failed to allocate host memory for population!" << std::endl;
        return -1;
    }

    h_population->num_networks = population_size;
    h_population->generation_id = 1;

    Population* d_population;
    gpuErrchk(cudaMalloc(&d_population, sizeof(Population)));

    Network* d_networks;
    gpuErrchk(cudaMalloc(&d_networks, sizeof(Network) * population_size));

    // store the device-side networks pointer inside the host copy, then upload the whole Population struct
    gpuErrchk(cudaMemcpy(&(h_population->Networks), &d_networks, sizeof(Network*), cudaMemcpyHostToHost));
    gpuErrchk(cudaMemcpy(d_population, h_population, sizeof(Population), cudaMemcpyHostToDevice));

    int threadsPerBlock = 512;
    int blocks = (population_size + threadsPerBlock - 1) / threadsPerBlock;

    CreateBasePopulation<<<blocks, threadsPerBlock>>>(d_population, population_size, input_num, output_num);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // copy the Population shell and the Network array back, then repoint Networks at the host copy
    gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
    Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
    gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size, cudaMemcpyDeviceToHost));
    h_population->Networks = h_networks;

    for (int i = 0; i < population_size; i++) {
        //Connection* d_connections;
        //gpuErrchk(cudaMemcpy(&d_connections, &(d_networks[i].Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));

        int num_connections = h_networks[i].num_connections;
        // read the device address stored in d_networks[i].Connections into a host pointer variable
        Connection* d_connections = nullptr;
        gpuErrchk(cudaMemcpy(&d_connections, &(d_networks[i].Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));

        if (d_connections != nullptr) {
            Connection* h_connections = (Connection*)malloc(sizeof(Connection) * num_connections);

            gpuErrchk(cudaMemcpy(h_connections, d_connections, sizeof(Connection) * num_connections, cudaMemcpyDeviceToHost));

            h_networks[i].Connections = h_connections;
        } else {
            std::cerr << "Error: d_connections is a null pointer" << std::endl;
        }
    }

    std::cout << "Population created successfully!\n";
    std::cout << "Num Connections (GPU to CPU memcpy): " << h_population->Networks[12].Connections[13].weight << std::endl;

    gpuErrchk(cudaFree(d_networks));
    gpuErrchk(cudaFree(d_population));
    free(h_population);
    
    return 0;
}

Error output

GPUassert: invalid argument e.cu 68

Line 68

gpuErrchk(cudaMemcpy(h_connections, d_connections, sizeof(Connection) * num_connections, cudaMemcpyDeviceToHost));
