2018年8月25日 星期六

[CUDA] 平行化之 Reduce 演算法

考慮一個典型的總和運算 :


     int sum = 0;
     for (i = 1; i < N; i++) {
         sum += a[i];
     }




假設要做平行化, 必須考慮 data dependency 的問題. 以 N=8 為例, 可以安排成 log2(8) = 3 個 iteration 來做, 如下圖:







下面這個程式是簡化的, 假設 N = BlockSize*ThreadSize, 且 ThreadSize 必須為 2 的次方, 計算 1+2+3...+N 的總和, 每個 Block 負責計算一個總和, 在主程式中再把所有 Block 總和結果加起來, 得到數列的計算結果. 由於 globalReduce 會改變 a[] 的值, 所以先做 sharedReduce. 程式碼如下 :




#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

// Launch geometry: BlockSize blocks of ThreadSize threads each.
#define BlockSize 8
// ThreadSize must be a power of two (the reduction halves it each round);
// 1024 is also the max threads-per-block on most devices.
#define ThreadSize 1024
// Total element count; every element maps to exactly one thread.
#define ArraySize (BlockSize*ThreadSize)

// Forward declaration so Visual Studio IntelliSense accepts __syncthreads();
// nvcc supplies the real intrinsic, so this is only an IDE workaround.
__device__ void __syncthreads();
// In-place tree reduction over global memory: each block sums its own
// blockDim.x-sized segment of a[] and writes the partial sum to b[blockIdx.x].
// Destructive: the contents of a[] are overwritten during the reduction.
// Precondition: blockDim.x is a power of two (the stride halves each round).
__global__ void globalReduceBlockSum(int *b, int *a)
{
     int gid = blockIdx.x * blockDim.x + threadIdx.x;

     for (int stride = blockDim.x >> 1; stride != 0; stride >>= 1)
     {
          if (threadIdx.x < stride)
               a[gid] = a[gid] + a[gid + stride];
          // Barrier sits OUTSIDE the divergent branch: every thread in the
          // block must reach it before the next round reads the new values.
          __syncthreads();
     }

     // After the last round, the first element of the block's segment holds
     // the block total; thread 0 publishes it.
     if (threadIdx.x == 0)
          b[blockIdx.x] = a[gid];
}

// Same per-block reduction as globalReduceBlockSum, but staged through
// shared memory: each block copies its segment of a[] into x[], reduces x[]
// in place, and stores the block total in b[blockIdx.x]. a[] is not modified.
// Preconditions: blockDim.x == ThreadSize and blockDim.x is a power of two.
__global__ void sharedReduceBlockSum(int *b, int *a)
{
     __shared__ int x[ThreadSize];

     int gid = blockIdx.x * blockDim.x + threadIdx.x;
     int tid = threadIdx.x;

     // Stage this block's slice into shared memory, then wait until every
     // thread has finished its copy before anyone reads a neighbour's slot.
     x[tid] = a[gid];
     __syncthreads();

     // Halve the active range each round; the barrier is outside the branch
     // so that all threads of the block reach it.
     for (int half = blockDim.x >> 1; half > 0; half >>= 1)
     {
          if (tid < half)
               x[tid] += x[tid + half];
          __syncthreads();
     }

     if (tid == 0)
          b[blockIdx.x] = x[0];
}

// Abort with a readable message when a CUDA runtime call fails.
// Keeps main() free of repetitive if-error boilerplate.
static void checkCuda(cudaError_t err, const char *what)
{
     if (err != cudaSuccess) {
          fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
          exit(EXIT_FAILURE);
     }
}

// Host driver: fills a[] with 1..ArraySize, times both reduction kernels,
// sums the per-block partial results on the CPU, and compares against the
// closed-form answer N*(N+1)/2.
int main()
{
     int host_a[ArraySize];
     int host_b[BlockSize];
     int *dev_a = 0;
     int *dev_b = 0;
     int sum = 0;
     float elapsedTime;

     // CUDA events give device-side timestamps for kernel timing.
     cudaEvent_t start, stop;
     checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
     checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

     checkCuda(cudaSetDevice(0), "cudaSetDevice");

     checkCuda(cudaMalloc((void**)&dev_a, ArraySize * sizeof(int)), "cudaMalloc(dev_a)");
     for (int i = 0; i < ArraySize; i++)
          host_a[i] = i + 1;    // sequence 1..N, so the expected total is N*(N+1)/2
     checkCuda(cudaMemcpy(dev_a, host_a, ArraySize * sizeof(int), cudaMemcpyHostToDevice),
               "cudaMemcpy H2D dev_a");

     checkCuda(cudaMalloc((void**)&dev_b, BlockSize * sizeof(int)), "cudaMalloc(dev_b)");
     checkCuda(cudaMemset(dev_b, 0, BlockSize * sizeof(int)), "cudaMemset(dev_b)");

     // Run sharedReduce first: globalReduce destroys a[], so it must go last.

     checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
     sharedReduceBlockSum << <BlockSize, ThreadSize >> > (dev_b, dev_a);
     checkCuda(cudaGetLastError(), "sharedReduceBlockSum launch");   // catch bad launch config
     checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
     checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");  // wait for kernel + stop event
     checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");

     checkCuda(cudaMemcpy(host_b, dev_b, BlockSize * sizeof(int), cudaMemcpyDeviceToHost),
               "cudaMemcpy D2H host_b");

     // Closed-form answer for 1+2+...+ArraySize, used to validate both kernels.
     int answer = (ArraySize + 1) * ArraySize / 2;

     printf("shared:\n");
     sum = 0;
     for (int i = 0; i < BlockSize; i++) {
          sum += host_b[i];
          printf("%d ", host_b[i]);
     }
     printf("sum=%d answer=%d t=%f\n\n", sum, answer, elapsedTime);

     // globalReduce: same measurement, but it reduces a[] in place.
     checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
     globalReduceBlockSum << <BlockSize, ThreadSize >> > (dev_b, dev_a);
     checkCuda(cudaGetLastError(), "globalReduceBlockSum launch");
     checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
     checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
     checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");

     checkCuda(cudaMemcpy(host_b, dev_b, BlockSize * sizeof(int), cudaMemcpyDeviceToHost),
               "cudaMemcpy D2H host_b");

     printf("global:\n");
     sum = 0;
     for (int i = 0; i < BlockSize; i++) {
          sum += host_b[i];
          printf("%d ", host_b[i]);
     }
     printf("sum=%d answer=%d t=%f\n\n", sum, answer, elapsedTime);

     getchar();   // keep the console window open (Visual Studio convenience)

     // Release device resources; the original leaked both events.
     cudaEventDestroy(start);
     cudaEventDestroy(stop);
     cudaFree(dev_a);
     cudaFree(dev_b);
     return 0;
}




執行結果:


雖然 shared memory 的速度比 global memory 快, 但是在這個例子中, 因為計算太過簡單, 且 sharedReduce 多了一個搬移資料至 shared memory 的動作, 所以在 N 值較小時反而比較慢.


參考資料:

[1] John Owens, "GPU Reduce, Scan, and Sort", CS223 Guest Lecture, Electrical and Computer Engineering, http://web.cs.ucdavis.edu/~amenta/f15/amenta-reduce-scan-sort.pdf
[2] Bedřich Beneš, "CUDA Application Reduction Algorithm ", Department of Computer Graphics , Purdue University, http://hpcg.purdue.edu/bbenes/classes/CGT620/lectures/CGT%20620-CUDA-09-CUDA%20Parallel%20Reduction.pdf













沒有留言:

張貼留言