2018年8月24日 星期五

[CUDA] host memory與device memory


以下程式碼展示 host memory與device memory之間的關係, 變數命名 host 開頭表示是在 host memory, dev 開頭表示是在 device memory. 一開始初始化 host_a[], 然後用 cudaMemcpy() 複製至 dev_a[]. 平行呼叫 kernel 函數 shared() 時, kernel 內再將 dev_a[] 複製至 x[]; x 是 shared memory, 同一個 block 內的所有 thread 都可以存取.






#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

#define BlockSize  2
#define ThreadSize 10
#define ArraySize (BlockSize*ThreadSize)

// Kernel: stage a[] into block-shared x[], shift the first ThreadSize/2
// entries of each block left by one (x[t] takes x[t+1]'s value), and write
// the result back to global memory.
// Launch contract: <<<BlockSize, ThreadSize>>>; blockDim.x must equal
// ThreadSize, and a[] must hold at least gridDim.x * ThreadSize ints.
__global__ void shared(int* a) //running on device
{
     __shared__ int x[ThreadSize]; //shared in the same block

     int i = blockIdx.x*ThreadSize+threadIdx.x;
     x[threadIdx.x] = a[i]; //copy global to shared memory
     __syncthreads(); // fix: every write must be visible before neighbours read

     // fix: the original did x[t] = x[t+1] in place — thread t read x[t+1]
     // while thread t+1 was (possibly) overwriting it, a data race that was
     // only accidentally benign under lockstep warp execution. Read the
     // neighbour into a register, barrier, then write.
     int v = x[threadIdx.x];
     if (threadIdx.x < ThreadSize/2) {
         v = x[threadIdx.x + 1];
     }
     __syncthreads(); // all reads done before anyone overwrites x[]
     x[threadIdx.x] = v;

     a[i] = x[threadIdx.x];
}

// Check every CUDA runtime call; the original ignored all return codes, so
// any failure (no device, bad alloc, kernel fault) would print garbage.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Host entry point: initialize host_a[] = {0..ArraySize-1}, copy it to
// device global memory, run the shared-memory shift kernel, copy the
// result into host_b[] and print both arrays.
int main() //running on host
{
     int host_a[ArraySize], host_b[ArraySize]; //memory in host
     int *dev_a = 0; //global memory on device

     CUDA_CHECK(cudaSetDevice(0)); //select a device

     // init array values
     for (int i = 0; i < ArraySize; i++) {
         host_a[i] = i;
         printf("a[%d]=%d ", i, host_a[i]);
         if (i%ThreadSize == ThreadSize-1) printf("\n");
     }
     printf("\n");

     // init device memory array values
     CUDA_CHECK(cudaMalloc((void**)&dev_a, ArraySize * sizeof(int)));
     CUDA_CHECK(cudaMemcpy(dev_a, host_a, ArraySize * sizeof(int), cudaMemcpyHostToDevice));

     // running kernel in parallel
     shared << < BlockSize, ThreadSize >> > (dev_a);
     CUDA_CHECK(cudaGetLastError());      // launch-configuration errors
     // fix: sync right after the launch so in-kernel faults surface here
     // (the original synced after the blocking D2H copy, where it was a no-op)
     CUDA_CHECK(cudaDeviceSynchronize());

     //copy result back
     CUDA_CHECK(cudaMemcpy(host_b, dev_a, ArraySize * sizeof(int), cudaMemcpyDeviceToHost));

     //output
     for (int i = 0; i < ArraySize; i++) {
         printf("b[%d]=%d ", i, host_b[i]);
         if (i%ThreadSize == ThreadSize-1) printf("\n");
     }
     printf("\n");

     CUDA_CHECK(cudaFree(dev_a)); // fix: original leaked the device buffer

     getchar(); //wait keypressed

     return 0;
}

沒有留言:

張貼留言