I have a struct in CUDA that contains other structs (substructs, and those substructs in turn contain substructs of their own) as well as pointers to dynamically allocated memory. I want to copy the entire struct, including its substructures, from device to host using cudaMemcpy. How can I do this correctly without ending up with shallow copies or memory issues?
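To make the question concrete, the single-level pattern I am trying to generalize is roughly the following (a simplified sketch with made-up names Inner/Outer/d_outer, assuming the nested array was allocated with a host-side cudaMalloc):

struct Inner { int x; };
struct Outer { Inner* items; int n; };

// d_outer is a device pointer that was already allocated and filled on the device;
// its items member holds a device address.
Outer h_outer;
cudaMemcpy(&h_outer, d_outer, sizeof(Outer), cudaMemcpyDeviceToHost);   // shallow copy: items still points to device memory
Inner* h_items = (Inner*)malloc(sizeof(Inner) * h_outer.n);
cudaMemcpy(h_items, h_outer.items, sizeof(Inner) * h_outer.n, cudaMemcpyDeviceToHost); // copy the pointed-to array
h_outer.items = h_items;                                                // repoint the host copy at host memory

My real structs, shown below, nest this one level deeper and also allocate from inside a kernel, which is where I am getting lost.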

I tried copying the whole main struct, but when I try to print substruct values I get nothing back. For example:

gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size,cudaMemcpyDeviceToHost));
h_population->Networks = h_networks;

std::cout << "Num Connections: " << h_population->Networks[0].Connections[13].weight << std::endl;

There isn't any output being printed.

I tried all the address-of combinations for the cudaMemcpy arguments ((&, ), (&, &), ( , &), ( , )), but none of them worked. The suggested solution link says "Doing so means you only have to copy the array to the device, not the structure" (which may not be efficient for complex structures), but of course the commenters didn't read that.
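For context, my reading of that flattened-array suggestion is roughly the following (my own sketch, not what my current code does; connections_per_net is a name I made up):

// One contiguous buffer for every network's connections instead of a pointer inside each Network.
int connections_per_net = input_num * output_num;
Connection* d_all_connections;
cudaMalloc(&d_all_connections, sizeof(Connection) * population_size * connections_per_net);
// Network i's connections would start at d_all_connections + i * connections_per_net,
// so only this single array has to be copied back, not the nested structure.

I would prefer to keep the nested layout if it can be made to work, which is what this question is about.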

Structs (plus the includes the snippets below need)

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>
#include <curand_kernel.h>

struct Connection {
    int innovationid;
    int from; 
    int to; 
    float weight;     
    bool enabled;   
};

struct Neuron {
    int type;
    float input_sum;  
    float bias;
    float output; 
};

struct Network {
    Connection* Connections;
    Neuron* Neurons;
    int num_connections;
    int num_neurons;
    float fitness;
};

struct Population {
    Network* Networks;
    int num_networks;
    int generation_id;
};

Kernel function

__global__ void CreateBasePopulation(Population* pop, int pop_num, int input_num, int output_num) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= pop_num) return;

    Network* net = &pop->Networks[idx];
    net->num_neurons = input_num + output_num;
    net->num_connections = input_num * output_num;
    net->fitness = 0.0f;

    curandState state;
    curand_init(clock64(), idx, 0, &state);

    // per-network buffers allocated from the device heap with device-side cudaMalloc
    cudaMalloc(&(net->Neurons), sizeof(Neuron) * net->num_neurons);
    cudaMalloc(&(net->Connections), sizeof(Connection) * net->num_connections);

    for (int i = 0; i < output_num; ++i) {
        net->Neurons[i].type = 2;
        net->Neurons[i].bias = ((2.0f * sqrtf((float)input_num) * curand_uniform(&state)) - sqrtf((float)input_num)) / output_num;
        net->Neurons[i].output = 0.0f;
        net->Neurons[i].input_sum = 0.0f;
    }

    for (int i = 0; i < input_num; ++i) {
        net->Neurons[i].type = 0;
        net->Neurons[i].bias = 0.0f;
        net->Neurons[i].output = 0.0f;
        net->Neurons[i].input_sum = 0.0f;

        for (int j = 0; j < output_num; ++j) {
            int offset = j + (output_num * i);
            net->Connections[offset].from = i;
            net->Connections[offset].to = j;
            net->Connections[offset].innovationid = offset;
            net->Connections[offset].enabled = true;
            net->Connections[offset].weight = (2.0f * curand_uniform(&state)) - 1.0f;
        }
    }
}
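(The device-side cudaMalloc calls in the kernel above are not checked. If it matters, a guard along these lines could be added; this is only a sketch, not something my current code does.)

// Sketch: skip the thread instead of writing through a failed device-side allocation.
if (cudaMalloc(&(net->Neurons), sizeof(Neuron) * net->num_neurons) != cudaSuccess) {
    printf("Neuron allocation failed for network %d\n", idx);
    return;
}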

Memory allocation and memcpy steps on the host

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) 
        exit(code);
   }
}

int main() {
    int population_size = 1024;
    int input_num = 390;
    int output_num = 3;

    size_t heap_size_needed = population_size * ((input_num + output_num) * sizeof(Neuron) +
                                          input_num * output_num * sizeof(Connection));
    size_t heap_size = heap_size_needed + heap_size_needed /5;
    gpuErrchk(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heap_size));

    Population* h_population = (Population*)malloc(sizeof(Population)); 
    if (!h_population) {
        std::cerr << "Failed to allocate host memory for population!" << std::endl;
        return -1;
    }

    h_population->num_networks = population_size;
    h_population->generation_id = 1;

    Population* d_population;
    gpuErrchk(cudaMalloc(&d_population, sizeof(Population)));

    Network* d_networks;
    gpuErrchk(cudaMalloc(&d_networks, sizeof(Network) * population_size));

    // store the device-side networks pointer inside the host copy, then upload the whole Population struct
    gpuErrchk(cudaMemcpy(&(h_population->Networks), &d_networks, sizeof(Network*), cudaMemcpyHostToHost));
    gpuErrchk(cudaMemcpy(d_population, h_population, sizeof(Population), cudaMemcpyHostToDevice));

    int threadsPerBlock = 512;
    int blocks = (population_size + threadsPerBlock - 1) / threadsPerBlock;

    CreateBasePopulation<<<blocks, threadsPerBlock>>>(d_population, population_size, input_num, output_num);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // copy the Population shell and the Network array back, then repoint Networks at the host copy
    gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
    Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
    gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size, cudaMemcpyDeviceToHost));
    h_population->Networks = h_networks;

    for (int i = 0; i < population_size; i++) {
        //Connection* d_connections;
        //gpuErrchk(cudaMemcpy(&d_connections, &(d_networks[i].Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));

        int num_connections = h_networks[i].num_connections;
        // read the device address stored in d_networks[i].Connections into a host pointer variable
        Connection* d_connections = nullptr;
        gpuErrchk(cudaMemcpy(&d_connections, &(d_networks[i].Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));

        if (d_connections != nullptr) {
            Connection* h_connections = (Connection*)malloc(sizeof(Connection) * num_connections);

            gpuErrchk(cudaMemcpy(h_connections, d_connections, sizeof(Connection) * num_connections, cudaMemcpyDeviceToHost));

            h_networks[i].Connections = h_connections;
        } else {
            std::cerr << "Error: d_connections is a null pointer" << std::endl;
        }
    }

    std::cout << "Population created successfully!\n";
    std::cout << "Num Connections (GPU to CPU memcpy): " << h_population->Networks[12].Connections[13].weight << std::endl;

    gpuErrchk(cudaFree(d_networks));
    gpuErrchk(cudaFree(d_population));
    free(h_population);
    
    return 0;
}

Error output

GPUassert: invalid argument e.cu 68

Line 68

gpuErrchk(cudaMemcpy(h_connections, d_connections, sizeof(Connection) * num_connections, cudaMemcpyDeviceToHost));
