admin管理员组

文章数量:1414628

I implemented a kernel that performs a single-pass scan (proposed in the book Programming Massively Parallel Processors):

__global__
void SinglePassKoggeStoneScan(const unsigned int * input, unsigned int * output,
  const unsigned int length, unsigned int * flags, unsigned int * scanValue, unsigned int * blockCounter) {
  __shared__ unsigned int bid_s;
  __shared__ unsigned int XY[SECTION_SIZE];

  if (threadIdx.x == 0) {
    bid_s = atomicAdd(blockCounter, 1);
  }
  __syncthreads();

  int bid = bid_s;
  int idx = bid * blockDim.x + threadIdx.x;

  if (idx < length) {
    XY[threadIdx.x] = input[idx];
  } else {
    XY[threadIdx.x] = 0;
  }
  __syncthreads();

  for (int stride = 1; stride < SECTION_SIZE; stride *= 2) {
    __syncthreads();
    float tmp = 0;
    if (threadIdx.x >= stride) {
      tmp = XY[threadIdx.x] + XY[threadIdx.x - stride];
    }
    __syncthreads();
    if (threadIdx.x >= stride) {
      XY[threadIdx.x] = tmp;
    }
  }
  __syncthreads();

  __shared__ unsigned int previousSum;
  if (threadIdx.x == 0) {
    while (bid >= 1 && atomicAdd( & flags[bid], 0) == 0) {} // Wait for data
    previousSum = scanValue[bid];
    scanValue[bid + 1] = XY[blockDim.x - 1] + previousSum;
    __threadfence();
    atomicAdd( & flags[bid + 1], 1);
  }
  __syncthreads();

  if (idx < length) {
    output[idx] = XY[threadIdx.x] + previousSum;
  }
}

I would like to extend this kernel so that it performs a scan on each row of a matrix independently.

Currently, I can implement a naive solution by executing this code:

#define SECTION_SIZE 1024

unsigned int input[] = {1,2,3,4,5,6,7,8,9};
const unsigned int width = 3;
const unsigned int height = 3;
unsigned int* output = new unsigned int[3*3];

unsigned int *deviceInput, *deviceOutput, *flags, *scanValue, *blockCounter;

const size_t imageSize = width * height * sizeof(unsigned int);
const unsigned int scanValueNum = (width + SECTION_SIZE - 1) / SECTION_SIZE;
const size_t scanValueSize = scanValueNum * sizeof(unsigned int);
cudaMalloc(reinterpret_cast<void**>(&deviceInput), imageSize);
cudaMalloc(reinterpret_cast<void**>(&deviceOutput), imageSize);
cudaMalloc(reinterpret_cast<void**>(&flags), scanValueSize);
cudaMalloc(reinterpret_cast<void**>(&scanValue), scanValueSize);
cudaMalloc(reinterpret_cast<void**>(&blockCounter), sizeof(unsigned int));

cudaMemcpy(deviceInput, input, imageSize, cudaMemcpyHostToDevice);
dim3 blockDim(SECTION_SIZE);
dim3 gridDim(scanValueSize);
for(int i = 0; i < height; i++){
   cudaMemset(flags, 0, scanValueSize);
   cudaMemset(blockCounter, 0, sizeof(unsigned int));
   SinglePassKoggeStoneScan<<<gridDim, blockDim>>>(deviceInput + i*width, deviceOutput + i*width, width, flags, scanValue, blockCounter);
}

cudaMemcpy(output, deviceOutput, imageSize, cudaMemcpyDeviceToHost);

cudaFree(deviceInput);
cudaFree(deviceOutput);
cudaFree(flags);
cudaFree(scanValue);
cudaFree(blockCounter);

However, I am having trouble adapting the code to directly handle this case. Any help or guidance would be greatly appreciated. Thanks in advance!

本文标签: parallel processingExtending a SinglePass Scan Kernel for Independent Rowwise Scan in CUDAStack Overflow