Friday, August 24, 2018

[CUDA] Measuring Performance


Two timing methods are used here:
  1. The Windows performance counter (QueryPerformanceCounter), available simply by including <Windows.h>.
  2. CUDA events (cudaEvent_t).
arraySize is limited by the host and GPU memory capacity and can be adjusted for your own tests; repeat is the number of times the computation is repeated. If addWithCuda() were called repeatedly from main(), the host-device memory transfers would eat most of the time and the GPU version could end up slower than the CPU, so instead the kernel is launched repeatedly inside addWithCuda(). The data arrays can be declared at global scope; if they are local, they should be declared static, otherwise a large arraySize will overflow the stack. A minimal sketch of the two timing idioms is shown below; the full program is listed further down.
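
Here is a minimal sketch of both idioms applied to a single kernel launch; the empty kernel work and the <<<1, 1>>> launch are placeholders for illustration only, not part of the program below:

#include "cuda_runtime.h"
#include <stdio.h>
#include <Windows.h>

__global__ void work() {} // placeholder kernel to be timed

int main()
{
    work<<<1, 1>>>(); // warm-up launch so context creation is not timed
    cudaDeviceSynchronize();

    // Method 1: Windows performance counter (host-side wall clock).
    LARGE_INTEGER freq, t1, t2;
    QueryPerformanceFrequency(&freq); // ticks per second
    QueryPerformanceCounter(&t1);
    work<<<1, 1>>>();
    cudaDeviceSynchronize(); // kernel launches return immediately, so wait
    QueryPerformanceCounter(&t2);
    printf("QPC: %f ms\n", (t2.QuadPart - t1.QuadPart) * 1000.0 / freq.QuadPart);

    // Method 2: CUDA events, recorded on the same stream as the kernel.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    work<<<1, 1>>>();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop); // block until the stop event has occurred
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop); // elapsed time in milliseconds
    printf("cudaEvent: %f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}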



In the screenshot of this run, the GPU version is roughly 19x faster; actual performance depends on arraySize and on the system configuration.




Code:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "cooperative_groups_helpers.h"
#include <iostream>
#include <memory>
#include <string>
#include <Windows.h>

const int repeat = 10000; // number of times the addition is repeated
const int arraySize = 1024 * 1024; // limited by host and GPU memory size
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
void addWithCpu(int *c, const int *a, const int *b, unsigned int size);
__global__ void addKernel(int *c, const int *a, const int *b, int size);


// Test data arrays: global (or local static) storage keeps them off the stack
// and prevents stack overflow when arraySize is large.
/*
int a[arraySize];
int b[arraySize];
int c[arraySize];
int d[arraySize];
*/
int main()
{
     // Test data arrays: declared static so they live in static storage
     // rather than on the stack (see the note above).
     static int a[arraySize];
     static int b[arraySize];
     static int c[arraySize];
     static int d[arraySize];

     // setup performance measure from windows ----------
     LARGE_INTEGER frequency;        // ticks per second
     LARGE_INTEGER t1, t2;           // ticks
     float elapsedTime;

     // setup performance measure from windows ---------
     QueryPerformanceFrequency(&frequency);

     // setup performance meter from CUDA ----------
     cudaEvent_t start, stop;
     cudaEventCreate(&start);
     cudaEventCreate(&stop);

     // init array ----------
     for (int i = 0; i < arraySize; i++) {
         a[i] = i;
         b[i] = arraySize + i;
     }

     // Add by CPU ----------
     QueryPerformanceCounter(&t1); //keep start time
     for (int i = 0; i < repeat; i++) {
         addWithCpu(c, a, b, arraySize);
     }
     QueryPerformanceCounter(&t2); //keep stop time
     elapsedTime = (float)((t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart); // ticks -> ms
     printf("c[%d]=%d, cpu t=%f\n",
         arraySize - 1, c[arraySize - 1], elapsedTime);
    

     // Add by CUDA ----------
     // Note: the measured time covers one round of allocation and host-device
     // transfers plus `repeat` kernel launches inside addWithCuda().
     cudaEventRecord(start, 0); //keep start time
     cudaError_t cudaStatus = addWithCuda(d, a, b, arraySize);
     cudaEventRecord(stop, 0); //keep stop time
     cudaEventSynchronize(stop); //wait stop event
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addWithCuda failed!");
         return 1;
     }
     cudaEventElapsedTime(&elapsedTime, start, stop); // milliseconds between events
     printf("d[%d]=%d, gpu t=%f\n",
         arraySize - 1, d[arraySize - 1], elapsedTime);

     // cudaDeviceReset must be called before exiting in order for profiling and
     // tracing tools such as Nsight and Visual Profiler to show complete traces.
     cudaStatus = cudaDeviceReset();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceReset failed!");
         return 1;
     }

     getchar();
     return 0;
}

// Add with CPU ---------
void addWithCpu(int *c, const int *a, const int *b, unsigned int size)
{
     for (unsigned int i = 0; i < size; i++) {
         c[i] = a[i] + b[i];
     }
}

// Add with GPU ---------
__global__ void addKernel(int *c, const int *a, const int *b, int size)
{
     int i = blockIdx.x * blockDim.x + threadIdx.x; // global element index
     if (i < size) {
         c[i] = a[i] + b[i];
     }
}


// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
     int *dev_a = 0;
     int *dev_b = 0;
     int *dev_c = 0;
     cudaError_t cudaStatus;

     // Choose which GPU to run on; change this on a multi-GPU system.
     int dev = 0;
     cudaStatus = cudaSetDevice(dev);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
         goto Error;
     }

     // Allocate GPU buffers for three vectors (two input, one output).
     cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     // Copy input vectors from host memory to GPU buffers.
     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     // Launch the kernel with one thread per element, up to 1024 threads per block.
     int block = (size - 1) / 1024 + 1;        // ceil(size / 1024)
     int thread = (size > 1024) ? 1024 : size; // threads per block

     // Launch repeatedly so the run time is measurable; the data stays on the
     // device, so the repeats add no extra host-device transfers.
     for (int i = 0; i < repeat; i++) {
         addKernel<<<block, thread>>>(dev_c, dev_a, dev_b, size);
     }

     // Check for any errors launching the kernel
     cudaStatus = cudaGetLastError();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
         goto Error;
     }

     // cudaDeviceSynchronize waits for the kernel to finish, and returns
     // any errors encountered during the launch.
     cudaStatus = cudaDeviceSynchronize();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
         goto Error;
     }

     // Copy output vector from GPU buffer to host memory.
     cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

Error:
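     // cudaFree(0) is a no-op, so pointers that were never allocated can be
     // freed safely when an early error jumps here.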
     cudaFree(dev_c);
     cudaFree(dev_a);
     cudaFree(dev_b);

     return cudaStatus;
}
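
To build the listing as a single .cu file, a plain nvcc command line should be enough (the file name perf_test.cu is just an example):

nvcc perf_test.cu -o perf_test.exe

Keep in mind that the two timers measure different things: QueryPerformanceCounter reads host-side wall-clock time, while cudaEventElapsedTime reports the time between two events recorded on the GPU stream, so the two numbers are comparable but not identical in nature.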
