int sum = 0;
for (int i = 0; i < N; i++) {
    sum += a[i];   // each iteration depends on the sum from the previous one
}
To parallelize this loop we have to work around the data dependency: each iteration needs the sum produced by the one before it. The fix is to add elements pairwise instead of one at a time. With N = 8, the additions can be arranged as a tree of pairwise sums that needs only log2(8) = 3 iterations:
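To make the pattern concrete, here is a minimal sequential C sketch (illustration only, not part of the CUDA program that follows) performing the same pairwise additions. Each pass of the outer loop is one of the three iterations, and all additions within a pass are independent of one another, so they could execute in parallel:

#include <stdio.h>

// Tree reduction for N = 8, simulated sequentially:
//   pass 1: d = 4 -> a[i] += a[i+4] for i = 0..3
//   pass 2: d = 2 -> a[i] += a[i+2] for i = 0..1
//   pass 3: d = 1 -> a[0] += a[1]
// After log2(8) = 3 passes, a[0] holds the total.
int main(void)
{
    int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    for (int d = 8 / 2; d > 0; d >>= 1)   // d = 4, 2, 1
        for (int i = 0; i < d; i++)       // independent within a pass
            a[i] += a[i + d];
    printf("sum = %d\n", a[0]);           // prints 36
    return 0;
}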
The program below is simplified: it assumes N = BlockSize*ThreadSize, where ThreadSize must be a power of two, and it computes the sum 1+2+3+...+N. Each block computes one partial sum, and the host program then adds all the per-block results together to obtain the final answer. Because globalReduce modifies the contents of a[], sharedReduce is run first. The code is as follows:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define BlockSize 8
#define ThreadSize 1024
#define ArraySize (BlockSize*ThreadSize)
__device__ void __syncthreads();
__global__ void
globalReduceBlockSum(int *b, int *a)
{
int id = blockIdx.x *
blockDim.x + threadIdx.x;
for (int d = blockDim.x /
2; d > 0; d >>= 1)
{
if (threadIdx.x <
d)
{
a[id] += a[id + d];
}
__syncthreads();
}
if (threadIdx.x ==
0) {
b[blockIdx.x] = a[id];
}
}
// Reduction using per-block shared memory; leaves a[] unchanged.
__global__ void sharedReduceBlockSum(int *b, int *a)
{
    __shared__ int x[ThreadSize];
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    x[threadIdx.x] = a[id];   // copy this block's slice to shared memory
    __syncthreads();          // wait until all threads have copied
    for (int d = blockDim.x / 2; d > 0; d >>= 1)
    {
        if (threadIdx.x < d)
        {
            x[threadIdx.x] += x[threadIdx.x + d];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        b[blockIdx.x] = x[0];  // thread 0 writes the block's partial sum
    }
}
int main()
{
    int host_a[ArraySize];
    int host_b[BlockSize];
    int *dev_a = 0;
    int *dev_b = 0;
    int sum = 0;
    float elapsedTime;

    // set up the performance meter from CUDA ----------
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaSetDevice(0);
    cudaMalloc((void**)&dev_a, ArraySize * sizeof(int));
    for (int i = 0; i < ArraySize; i++)
        host_a[i] = i + 1;   // fill a[] with 1, 2, ..., N
    cudaMemcpy(dev_a, host_a, ArraySize * sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void**)&dev_b, BlockSize * sizeof(int));
    cudaMemset(dev_b, 0, BlockSize * sizeof(int));

    // run sharedReduce first, because a[] is modified in globalReduce
    cudaEventRecord(start, 0);   // keep start time
    sharedReduceBlockSum<<<BlockSize, ThreadSize>>>(dev_b, dev_a);  // calculate
    cudaEventRecord(stop, 0);    // keep stop time
    cudaEventSynchronize(stop);  // wait for the stop event
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaMemcpy(host_b, dev_b, BlockSize * sizeof(int), cudaMemcpyDeviceToHost);

    // print result
    int answer = (ArraySize + 1) * ArraySize / 2;   // closed form N(N+1)/2
    printf("shared:\n");
    sum = 0;
    for (int i = 0; i < BlockSize; i++) {
        sum += host_b[i];       // add up the per-block partial sums
        printf("%d ", host_b[i]);
    }
    printf("sum=%d answer=%d t=%f\n\n", sum, answer, elapsedTime);

    // run globalReduce
    cudaEventRecord(start, 0);   // keep start time
    globalReduceBlockSum<<<BlockSize, ThreadSize>>>(dev_b, dev_a);  // calculate
    cudaEventRecord(stop, 0);    // keep stop time
    cudaEventSynchronize(stop);  // wait for the stop event
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaMemcpy(host_b, dev_b, BlockSize * sizeof(int), cudaMemcpyDeviceToHost);

    // print result
    printf("global:\n");
    sum = 0;
    for (int i = 0; i < BlockSize; i++) {
        sum += host_b[i];
        printf("%d ", host_b[i]);
    }
    printf("sum=%d answer=%d t=%f\n\n", sum, answer, elapsedTime);

    getchar();
    cudaFree(dev_a);
    cudaFree(dev_b);
    return 0;
}
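Before looking at the results, one caveat about the power-of-two requirement: the halving loop (d >>= 1) only visits every element when blockDim.x is a power of two. If N does not come in the form BlockSize*ThreadSize, one common workaround, sketched here with a hypothetical helper nextPow2 that is not part of the program above, is to zero-pad the input up to the next power of two; the padding is harmless because zeros do not change a sum:

// Hypothetical helper (illustration only): smallest power of two >= n.
int nextPow2(int n)
{
    int p = 1;
    while (p < n)
        p <<= 1;
    return p;
}

// Usage sketch: allocate the padded buffer, clear it, then copy the real data.
//   int padded = nextPow2(n);
//   cudaMalloc((void**)&dev_a, padded * sizeof(int));
//   cudaMemset(dev_a, 0, padded * sizeof(int));   // zeros are neutral for addition
//   cudaMemcpy(dev_a, host_a, n * sizeof(int), cudaMemcpyHostToDevice);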
Execution results:
Although shared memory is faster than global memory, in this example the per-element computation is trivial, and sharedReduce pays for an extra copy from global into shared memory, so for small N it actually comes out slower.
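One way to make such a comparison more robust for a kernel this short, where launch overhead can dominate a single timed run, is to time many repeated launches and average. A minimal sketch (REPS is a value chosen here purely for illustration); note that sharedReduceBlockSum leaves dev_a unchanged, so it can be relaunched as-is, whereas globalReduceBlockSum would need dev_a restored with cudaMemcpy before every run:

#define REPS 100   // assumed repetition count, for illustration

cudaEventRecord(start, 0);
for (int r = 0; r < REPS; r++)
    sharedReduceBlockSum<<<BlockSize, ThreadSize>>>(dev_b, dev_a);  // a[] stays intact
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);   // wait for all REPS launches to finish
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("shared avg t=%f ms\n", elapsedTime / REPS);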