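以下程式碼展示 host memory 與 device memory 之間的關係。變數命名以 host 開頭表示位於 host memory,以 dev 開頭表示位於 device memory。程式一開始先初始化 host_a[],再用 cudaMemcpy() 複製到 dev_a[];接著平行啟動 kernel 函數 shared(),在 kernel 內每個 thread 把 dev_a[] 中自己負責的元素複製到 x[]。x 是 shared memory,同一個 block 內的所有 thread 都可存取。每個 block 的前半 thread 會改取右邊相鄰 thread 的值,結果寫回 dev_a[],最後再用 cudaMemcpy() 複製到 host_b[] 並印出。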
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define BlockSize 2
#define ThreadSize 10
#define ArraySize (BlockSize*ThreadSize)
// Kernel (runs on the device): each block stages its ThreadSize-element slice
// of a[] in shared memory; the first ThreadSize/2 threads of the block then
// replace their element with the value held by their right-hand neighbour,
// and every thread writes its result back to a[].
//
// Launch layout: 1-D grid of 1-D blocks with exactly ThreadSize threads each.
// a[] must hold at least gridDim.x * ThreadSize ints (there is no bounds check).
// Shared memory: ThreadSize ints per block, statically allocated.
__global__ void shared(int* a) //running on device
{
    __shared__ int x[ThreadSize]; // shared by all threads of the same block
    const int tid = threadIdx.x;
    const int i = blockIdx.x * ThreadSize + tid;

    x[tid] = a[i]; // copy global to shared memory

    // Barrier: x[tid + 1] is stored by a different thread, so all stores above
    // must be complete before any thread reads its neighbour's slot.
    __syncthreads();

    // Select the value into a local variable and write it straight to global
    // memory rather than updating x[] in place: an in-place update would let
    // thread t overwrite x[t] while thread t-1 may still be reading it.
    const int value = (tid < ThreadSize / 2) ? x[tid + 1] : x[tid];
    a[i] = value;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host entry point: fills host_a[], copies it to device memory, runs the
// shared() kernel on it, copies the result back into host_b[] and prints both
// arrays. Returns 0 on success, 1 if any CUDA call fails.
int main() //running on host
{
    int host_a[ArraySize], host_b[ArraySize]; //memory in host
    int *dev_a = 0; //global memory on device
    const size_t bytes = ArraySize * sizeof(int);

    //select a device
    if (!cudaOk(cudaSetDevice(0), "cudaSetDevice"))
        return 1;

    // init array values
    for (int i = 0; i < ArraySize; i++) {
        host_a[i] = i;
        printf("a[%d]=%d ", i, host_a[i]);
        if (i%ThreadSize == ThreadSize-1)
            printf("\n");
    }
    printf("\n");

    // init device memory array values
    bool ok = cudaOk(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc");
    ok = ok && cudaOk(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy (host to device)");

    if (ok) {
        // running kernel in parallel
        shared<<<BlockSize, ThreadSize>>>(dev_a);
        // A launch returns no status itself: cudaGetLastError() reports a bad
        // launch configuration, and the synchronize waits for the kernel to
        // finish and reports any fault raised while it was executing.
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaDeviceSynchronize(), "kernel execution");
    }

    //copy result back
    ok = ok && cudaOk(cudaMemcpy(host_b, dev_a, bytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy (device to host)");

    //output
    if (ok) {
        for (int i = 0; i < ArraySize; i++) {
            printf("b[%d]=%d ", i, host_b[i]);
            if (i%ThreadSize == ThreadSize-1)
                printf("\n");
        }
        printf("\n");
    }

    // Release the device buffer on every path; dev_a is still 0 if
    // cudaMalloc failed.
    cudaFree(dev_a);

    getchar(); //wait keypressed
    return ok ? 0 : 1;
}
|
沒有留言:
張貼留言