Converting MATLAB FFT-based Delay Function to CUDA MEX - Stack Overflow-软件玩家

admin管理员组
文章数量:1401657

I am trying to convert a matlab function to its equivalent CUDA MEX function. Below is the matlab function:

function [ShiftData_time] = simpledelayupdated_mat(RData, delay, fs)
    nfft = size(RData,1);
    binStart = floor(nfft/2);
    fftBin = (2*pi*ifftshift(((0:nfft-1)-binStart).'))/nfft; 
    fftBin = fftBin';
    
    RFFT = fft(RData, nfft, 1);
    ShiftData = RFFT .* ((exp(-1i * delay * fftBin)).'); 
    ShiftData_time = ifft(ShiftData, nfft, 1);
    ShiftData_time = real(ShiftData_time);
end

Here is the CUDA implementation using MEX :

#include "mex.h"
#include <cuda_runtime.h>
#include <cufft.h>
#include <math.h>
#define M_PI 3.14159

// Macro for cuFFT error checking.
#define CUFFT_CHECK_ERROR(call) { cufftResult err = call; if(err != CUFFT_SUCCESS) { \
    mexErrMsgIdAndTxt("CUDA:CUFFTError", "CUFFT error: %d", err); } }

// Macro for CUDA runtime error checking.
#define CUDA_CHECK_ERROR(call) { cudaError_t err = call; if(err != cudaSuccess) { \
    mexErrMsgIdAndTxt("CUDA:RuntimeError", "CUDA error: %s", cudaGetErrorString(err)); } }

// Updated CUDA kernel to apply the phase shift (delay) in the frequency domain.
// Instead of launching one block per column with nfft threads, we now use a 2D grid.
// grid.x corresponds to the column index, and grid.y is the number of blocks needed
// to cover all frequency bins (with a fixed block size).
__global__ void applyDelay(cufftDoubleComplex *data, const double *fftBin, double delay, int nfft, int nCols)
{
    int col = blockIdx.x;          // Column index.
    int blockId = blockIdx.y;      // Block index within the column.
    int threadsPerBlock = blockDim.x;
    int threadId = threadIdx.x;

    // Compute global frequency bin index within this column.
    int index = blockId * threadsPerBlock + threadId;
    
    if (col < nCols && index < nfft)
    {
        int idx = col * nfft + index;
        double phase = -delay * fftBin[index];
        double cosPhase = cos(phase);
        double sinPhase = sin(phase);
        
        cufftDoubleComplex factor;
        factor.x = cosPhase;
        factor.y = sinPhase;
        
        cufftDoubleComplex orig = data[idx];
        cufftDoubleComplex result;
        result.x = orig.x * factor.x - orig.y * factor.y;
        result.y = orig.x * factor.y + orig.y * factor.x;
        data[idx] = result;
    }
}

// The MEX gateway function.
void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, const mxArray *prhs[])
{
    // Check input count: RData, delay, fs.
    if(nrhs != 3)
        mexErrMsgIdAndTxt("MATLAB:simpledelayupdated:invalidNumInputs", 
                          "Three inputs required: RData, delay, fs.");
    
    // Ensure RData is real and double precision.
    if(!mxIsDouble(prhs[0]) || mxIsComplex(prhs[0]))
        mexErrMsgIdAndTxt("MATLAB:simpledelayupdated:inputNotDouble", 
                          "RData must be real double precision.");
    
    // Get input RData and its dimensions.
    double *RData = mxGetPr(prhs[0]);
    mwSize nfft = mxGetM(prhs[0]);   // Number of rows.
    mwSize nCols = mxGetN(prhs[0]);   // Number of columns.
    
    // Read scalar inputs: delay and fs (fs is provided for compatibility).
    double delay = mxGetScalar(prhs[1]);
    double fs = mxGetScalar(prhs[2]); // Not used directly.
    
    // Create output array (real, same size as input).
    plhs[0] = mxCreateDoubleMatrix(nfft, nCols, mxREAL);
    double *ShiftData_time = mxGetPr(plhs[0]);
    
    // Allocate host memory for complex input data.
    cufftDoubleComplex *h_complex = (cufftDoubleComplex*) mxMalloc(nfft * nCols * sizeof(cufftDoubleComplex));
    for (mwSize col = 0; col < nCols; col++) {
        for (mwSize row = 0; row < nfft; row++) {
            int idx = col * nfft + row;
            h_complex[idx].x = RData[idx];
            h_complex[idx].y = 0.0;
        }
    }
    
    // Precompute the FFT frequency bins.
    // This corresponds to the MATLAB expression:
    // fftBin = (2*pi*ifftshift(((0:nfft-1)-floor(nfft/2))))/nfft
    double *h_fftBin = (double*) mxMalloc(nfft * sizeof(double));
    int binStart = (int)(nfft / 2);
    for (mwSize row = 0; row < nfft; row++) {
        int shifted_idx = (row + binStart) % nfft;
        int centered = row - binStart;
        h_fftBin[shifted_idx] = (2.0 * M_PI * centered) / (double)nfft;
    }
    
    // Allocate device memory for the complex data and the fftBin vector.
    cufftDoubleComplex *d_data;
    CUDA_CHECK_ERROR(cudaMalloc((void**)&d_data, nfft * nCols * sizeof(cufftDoubleComplex)));
    CUDA_CHECK_ERROR(cudaMemcpy(d_data, h_complex, nfft * nCols * sizeof(cufftDoubleComplex), cudaMemcpyHostToDevice));
    
    double *d_fftBin;
    CUDA_CHECK_ERROR(cudaMalloc((void**)&d_fftBin, nfft * sizeof(double)));
    CUDA_CHECK_ERROR(cudaMemcpy(d_fftBin, h_fftBin, nfft * sizeof(double), cudaMemcpyHostToDevice));
    
    // Create a cuFFT plan for batched 1D FFTs (each column is one FFT of size nfft).
    cufftHandle plan;
    int rank = 1;
    int n[1] = {(int)nfft};
    int istride = 1, idist = nfft;
    int ostride = 1, odist = nfft;
    CUFFT_CHECK_ERROR(cufftPlanMany(&plan, rank, n,
                                    NULL, istride, idist,
                                    NULL, ostride, odist,
                                    CUFFT_Z2Z, nCols));
    
    // Execute forward FFT (time to frequency domain).
    CUFFT_CHECK_ERROR(cufftExecZ2Z(plan, d_data, d_data, CUFFT_FORWARD));
    
    // Launch the CUDA kernel to apply the phase shift.
    int threadsPerBlock = 256;  // Use a safe block size.
    int blocksPerColumn = (nfft + threadsPerBlock - 1) / threadsPerBlock;
    dim3 grid(nCols, blocksPerColumn, 1);  // grid.x: columns, grid.y: blocks per column.
    dim3 block(threadsPerBlock, 1, 1);
    
    applyDelay<<<grid, block>>>(d_data, d_fftBin, delay, nfft, nCols);
    CUDA_CHECK_ERROR(cudaDeviceSynchronize());
    
    // Execute the inverse FFT to convert back to time domain.
    CUFFT_CHECK_ERROR(cufftExecZ2Z(plan, d_data, d_data, CUFFT_INVERSE));
    
    // Allocate host memory for the inverse FFT result and copy from device.
    cufftDoubleComplex *h_result = (cufftDoubleComplex*) mxMalloc(nfft * nCols * sizeof(cufftDoubleComplex));
    CUDA_CHECK_ERROR(cudaMemcpy(h_result, d_data, nfft * nCols * sizeof(cufftDoubleComplex), cudaMemcpyDeviceToHost));
    
    // Normalize the result by dividing by nfft and take the real part.
    for(mwSize i = 0; i < nfft * nCols; i++) {
        ShiftData_time[i] = h_result[i].x / ((double)nfft);
    }
    
    // Clean up resources.
    cufftDestroy(plan);
    cudaFree(d_data);
    cudaFree(d_fftBin);
    mxFree(h_complex);
    mxFree(h_fftBin);
    mxFree(h_result);
}

This is how I am calling the Matlab function:

[temp] = simpledelayupdated(gather(RF_Arr_gpu), scalarDelay, sampling_Freq);

where RF_arr_gpu is a 2D MATLAB gpuArray, scalarDelay and sampling_Freq are scalars.

The issue I'm facing is that the final output from both functions does not match when run on real data. However, I have tested individual components of the CUDA MEX function, and the differences between the MATLAB and CUDA MEX outputs are minimal.

Can anyone help me identify where there could be issues in the code? I have attached the output from MATLAB and what we are currently getting from CUDA MEX. The MATLAB image is continuous while the CUDA MEX seems to be more discrete.

本文标签： Converting MATLAB FFTbased Delay Function to CUDA MEXStack Overflow

版权声明：本文标题：Converting MATLAB FFT-based Delay Function to CUDA MEX - Stack Overflow 内容由网友自发贡献，该文观点仅代表作者本人，转载请联系作者并注明出处：http://www.betaflare.com/web/1744222172a2595919.html，本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容，一经查实，本站将立刻删除。

编程频道|软件玩家 - 软件改变生活！

Converting MATLAB FFT-based Delay Function to CUDA MEX - Stack Overflow

更多相关文章