admin管理员组文章数量:1414628
I implemented a kernel that performs a single-pass scan (proposed in the book Programming Massively Parallel Processors):
/**
 * Single-pass inclusive prefix sum over `length` unsigned integers.
 *
 * Each block scans one section of the input in shared memory (Kogge-Stone),
 * then waits for the running total of all preceding sections, adds it to its
 * own results, and publishes the updated running total for the next block.
 *
 * Launch layout: 1D grid, 1D block. blockDim.x must not exceed SECTION_SIZE
 * (the shared buffer holds SECTION_SIZE elements) and
 * gridDim.x * blockDim.x must cover `length`.
 *
 * @param input        device pointer to the values to scan
 * @param output       device pointer receiving the inclusive scan
 * @param length       number of elements in input/output
 * @param flags        device array with at least gridDim.x entries, all zero
 *                     at launch; flags[b] != 0 signals that scanValue[b] is ready
 * @param scanValue    device array with at least gridDim.x entries; scanValue[b]
 *                     carries the sum of all sections before section b
 * @param blockCounter device counter, zero at launch; hands out section ids in
 *                     the order blocks actually start running
 */
__global__
void SinglePassKoggeStoneScan(const unsigned int * input, unsigned int * output,
                              const unsigned int length, unsigned int * flags,
                              unsigned int * scanValue, unsigned int * blockCounter) {
    __shared__ unsigned int bid_s;
    __shared__ unsigned int previousSum;
    __shared__ unsigned int XY[SECTION_SIZE];

    // Take a dynamic section id instead of blockIdx.x: a block only ever waits
    // on a section id that was handed out before its own.
    if (threadIdx.x == 0) {
        bid_s = atomicAdd(blockCounter, 1);
    }
    __syncthreads();
    const unsigned int bid = bid_s;
    const unsigned int idx = bid * blockDim.x + threadIdx.x;

    // Load this section; pad past the end of the input with the additive identity.
    XY[threadIdx.x] = (idx < length) ? input[idx] : 0u;

    // Kogge-Stone inclusive scan of the section in shared memory.
    for (unsigned int stride = 1; stride < SECTION_SIZE; stride *= 2) {
        __syncthreads();  // every element of the previous step is written
        // Keep the partial sum in the element type: a float temporary would
        // round sums above 2^24.
        unsigned int tmp = 0;
        if (threadIdx.x >= stride) {
            tmp = XY[threadIdx.x] + XY[threadIdx.x - stride];
        }
        __syncthreads();  // every read of this step is done before overwriting
        if (threadIdx.x >= stride) {
            XY[threadIdx.x] = tmp;
        }
    }
    __syncthreads();

    // Hand-off between sections, done by one thread per block.
    if (threadIdx.x == 0) {
        unsigned int carryIn = 0;  // the first section has no predecessor
        if (bid > 0) {
            while (atomicAdd(&flags[bid], 0) == 0) {}  // Wait for data
            __threadfence();  // order the carry read after the flag observation
            carryIn = scanValue[bid];
        }
        previousSum = carryIn;

        // The last section has no successor, so there is nothing to publish
        // (and no slot bid + 1 to write to).
        if (bid + 1 < gridDim.x) {
            scanValue[bid + 1] = XY[blockDim.x - 1] + carryIn;
            __threadfence();  // make the carry visible before raising the flag
            atomicAdd(&flags[bid + 1], 1);
        }
    }
    __syncthreads();

    if (idx < length) {
        output[idx] = XY[threadIdx.x] + previousSum;
    }
}
I would like to extend this kernel so that it performs an independent scan on each row of a matrix.
Currently, I have a naive solution that launches the kernel once per row:
// Naive row-wise scan: one kernel launch per matrix row, reusing the same
// inter-block synchronisation buffers for every row.
#define SECTION_SIZE 1024
unsigned int input[] = {1,2,3,4,5,6,7,8,9};
const unsigned int width = 3;
const unsigned int height = 3;
// Host result buffer sized from the matrix dimensions; still owned by this
// code after the copy back.
unsigned int* output = new unsigned int[width * height];
unsigned int *deviceInput, *deviceOutput, *flags, *scanValue, *blockCounter;
const size_t imageSize = width * height * sizeof(unsigned int);
// Number of SECTION_SIZE-wide blocks needed to cover one row.
const unsigned int scanValueNum = (width + SECTION_SIZE - 1) / SECTION_SIZE;
// The kernel indexes flags/scanValue at [bid] and [bid + 1], so each buffer
// needs one slot more than there are blocks.
const size_t scanValueSize = (scanValueNum + 1) * sizeof(unsigned int);
cudaMalloc(reinterpret_cast<void**>(&deviceInput), imageSize);
cudaMalloc(reinterpret_cast<void**>(&deviceOutput), imageSize);
cudaMalloc(reinterpret_cast<void**>(&flags), scanValueSize);
cudaMalloc(reinterpret_cast<void**>(&scanValue), scanValueSize);
cudaMalloc(reinterpret_cast<void**>(&blockCounter), sizeof(unsigned int));
cudaMemcpy(deviceInput, input, imageSize, cudaMemcpyHostToDevice);
// scanValue[0] is the carry-in of the first block and must be 0; the kernel
// only ever writes slots 1 and above, so zeroing once is enough for all rows.
cudaMemset(scanValue, 0, scanValueSize);
dim3 blockDim(SECTION_SIZE);
// The grid size is a block count, not a byte count.
dim3 gridDim(scanValueNum);
for (unsigned int i = 0; i < height; i++) {
    // Reset the per-launch synchronisation state before scanning the next row.
    cudaMemset(flags, 0, scanValueSize);
    cudaMemset(blockCounter, 0, sizeof(unsigned int));
    SinglePassKoggeStoneScan<<<gridDim, blockDim>>>(deviceInput + i*width, deviceOutput + i*width, width, flags, scanValue, blockCounter);
}
// Blocking copy on the default stream: returns after all row kernels finish.
cudaMemcpy(output, deviceOutput, imageSize, cudaMemcpyDeviceToHost);
cudaFree(deviceInput);
cudaFree(deviceOutput);
cudaFree(flags);
cudaFree(scanValue);
cudaFree(blockCounter);
However, I am having trouble adapting the code to directly handle this case. Any help or guidance would be greatly appreciated. Thanks in advance!
版权声明:本文标题:parallel processing - Extending a Single-Pass Scan Kernel for Independent Row-wise Scan in CUDA - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1745155336a2645137.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论