2018年8月30日 星期四

[CUDA] 與 OpenCV 共舞

取得 CUDA 版的 OpenCV 後, 使用 OpenCV 前, 先建立環境變數 OPENCV_DIR, 使其指向 opencv 目錄 ( C:\opencv\build\ ):






建立名稱為 OpenCvExample 的 CUDA 專案, CUDA精靈會自動建立一個 kernel.cu 檔 :



在專案中增加 main.cpp  與 OpenCvExample.h檔:





專案預設會編譯為 x64 版本, 因此在設定專案的 VC++ include 與 lib 目錄屬性時, 請將 include 目錄設定為 $(OPENCV_DIR)\include, lib 目錄設定為 $(OPENCV_DIR)\x64\vc15\lib :






設定 VC 的連結程式庫名稱, 加入  opencv_world343d.lib (Debug版) 或 opencv_world343.lib (Release版):





本例只有一個很簡單的複製功能, 主要是展示如何在 kernel 存取 Mat 物件的影像資料 data,  這樣便能利用 OpenCV 的檔案讀寫功能來進行檔案處理, 配合改變 kernel 函數即可做出不同的影像處理結果. 專案程式的函數說明: 


 
main()
1.      建立 srcMat 並讀取影像檔
2.      建立 destMat 準備放置處理結果
3.      呼叫在 kernel.cu的處理函數 CopyImage()
4.      顯示處理結果
CopyImage()
1.      準備 CUDA device memory
2.      呼叫 kernel 函數 copy()
3.      複製處理結果至輸出 dest
__global__ void copy()
以平行處理的方式, 將來源影像 src 複製至輸出影像 dest


3個程式檔案的內容如下 :

OpenCvExample.h :

#pragma once

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include "opencv2/imgproc.hpp"
#include <opencv2/core/cuda.hpp>
#include <Windows.h>

using namespace cv;
using namespace cuda;

// Device kernel (defined in kernel.cu). Each thread copies `channels`
// consecutive bytes from `src` to `dest`, starting at byte offset
// (global 1D thread index * channels). The kernel performs no bounds check.
__global__ void copy(byte *dest, byte *src, int channels);
// Host-side wrapper (defined in kernel.cu): uploads the data of `src` to the
// GPU, runs copy() on it and downloads the result into the data of `dest`.
void CopyImage(Mat *dest, Mat *src);



kernel.cu :

#include "OpenCvExample.h"

#define WindowRadius   1

// Copies one group of `channels` consecutive bytes per thread from `src` to
// `dest`.
//
// Launch layout: 1D grid of 1D blocks. The thread with global index
// i = blockIdx.x*blockDim.x + threadIdx.x handles bytes
// [i*channels, i*channels + channels). There is no bounds check, so the
// launch must not create more threads than there are byte groups in the
// buffers. Uses no shared memory.
__global__ void copy(byte *dest, byte *src, int channels)
{
     // Byte offset of the first element owned by this thread
     const int first = (blockIdx.x*blockDim.x + threadIdx.x)*channels;

     byte *out = dest + first;
     byte *in = src + first;

     // Walk both buffers in lock-step, one byte at a time
     for (int remaining = channels; remaining > 0; --remaining)
     {
          *out++ = *in++;
     }
}

namespace {

// Threads per block used for the copy launches: a multiple of the warp size
// and far below the 1024-threads-per-block hardware limit.
const unsigned int kCopyBlockSize = 256;

// Owns a single cudaMalloc allocation and frees it when it goes out of scope,
// so every early return in CopyImage() releases its device memory.
struct DeviceBytes
{
     byte *ptr;
     DeviceBytes() : ptr(0) {}
     ~DeviceBytes() { if (ptr) cudaFree(ptr); }
private:
     // Non-copyable: two owners would free the same allocation twice
     DeviceBytes(const DeviceBytes &);
     DeviceBytes &operator=(const DeviceBytes &);
};

// Reports a failed CUDA call on stderr; returns true when `status` is an error.
bool cudaFailed(cudaError_t status, const char *what)
{
     if (status == cudaSuccess)
          return false;
     fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
     return true;
}

} // namespace

// Copies the image `src` into `dest` by round-tripping it through the GPU and
// the copy() kernel.
//
// dest : output image; (re)allocated to the size and type of `src` if needed.
// src  : non-empty 2D input image of any depth / channel count; may be a
//        non-continuous Mat (ROI or padded rows).
//
// Errors are reported on stderr and the function returns early; the content
// of `dest` is unspecified in that case. Device buffers are always released.
void CopyImage(Mat *dest, Mat *src)
{
     if (dest == 0 || src == 0 || src->empty() || src->dims > 2) {
          fprintf(stderr, "CopyImage: source must be a non-empty 2D image\n");
          return;
     }

     // Guarantee the host output buffer is large enough for the download
     // (no-op when dest already has the right size and type).
     dest->create(src->rows, src->cols, src->type());

     // The kernel copies a fixed number of BYTES per thread, so it must be
     // given the full pixel size (channels * bytes per channel), not just the
     // channel count, or non-8-bit images would only be partially copied.
     const size_t bytesPerPixel = src->elemSize();
     const size_t rowBytes = (size_t)src->cols * bytesPerPixel;
     const size_t pixelCount = src->total();
     const size_t pixelBytes = pixelCount * bytesPerPixel;

     // copy() computes its byte offset in an int
     if (pixelBytes > (size_t)0x7fffffff) {
          fprintf(stderr, "CopyImage: image too large for the copy kernel\n");
          return;
     }

     // Choose which GPU to run on
     if (cudaFailed(cudaSetDevice(0), "cudaSetDevice"))
          return;

     // Allocate GPU buffers (tightly packed rows)
     DeviceBytes dev_src;
     DeviceBytes dev_dest;
     if (cudaFailed(cudaMalloc((void**)&dev_src.ptr, pixelBytes), "cudaMalloc"))
          return;
     if (cudaFailed(cudaMalloc((void**)&dev_dest.ptr, pixelBytes), "cudaMalloc"))
          return;

     // Upload. cudaMemcpy2D honours the host row stride (Mat::step), so Mats
     // whose rows are not contiguous in memory are packed correctly.
     if (cudaFailed(cudaMemcpy2D(dev_src.ptr, rowBytes,
                                 src->data, src->step[0],
                                 rowBytes, (size_t)src->rows,
                                 cudaMemcpyHostToDevice),
                    "cudaMemcpy2D (host to device)"))
          return;

     // copy() has no bounds check, so launch EXACTLY one thread per pixel:
     // full blocks first, then a single partial block for the remainder.
     // (A <<<rows, cols>>> launch would fail for images wider than 1024 px.)
     const size_t fullBlocks = pixelCount / kCopyBlockSize;
     const unsigned int tailThreads = (unsigned int)(pixelCount % kCopyBlockSize);

     if (fullBlocks > 0) {
          copy<<<(unsigned int)fullBlocks, kCopyBlockSize>>>(dev_dest.ptr, dev_src.ptr, (int)bytesPerPixel);
          if (cudaFailed(cudaGetLastError(), "copy kernel launch"))
               return;
     }
     if (tailThreads > 0) {
          const size_t tailOffset = fullBlocks * kCopyBlockSize * bytesPerPixel;
          copy<<<1, tailThreads>>>(dev_dest.ptr + tailOffset, dev_src.ptr + tailOffset, (int)bytesPerPixel);
          if (cudaFailed(cudaGetLastError(), "copy kernel launch"))
               return;
     }

     // Wait for the kernel(s) to finish and surface any execution error
     if (cudaFailed(cudaDeviceSynchronize(), "cudaDeviceSynchronize after copy kernel"))
          return;

     // Download the result, again respecting the host row stride
     if (cudaFailed(cudaMemcpy2D(dest->data, dest->step[0],
                                 dev_dest.ptr, rowBytes,
                                 rowBytes, (size_t)dest->rows,
                                 cudaMemcpyDeviceToHost),
                    "cudaMemcpy2D (device to host)"))
          return;

     // Device buffers are freed by the DeviceBytes destructors. The device is
     // deliberately NOT reset here: cudaDeviceReset() would destroy every CUDA
     // resource of the process, including ones owned by the caller.
}




main.cpp :


#include <stdio.h>
#include <iostream>
#include "OpenCvExample.h"

using namespace std;

int main(int argc, char** argv)
{
     cudaError_t cudaStatus;

     char *filename = "D:\\LENA.JPG";

     // Load source image
     Mat srcMat = imread(filename, IMREAD_COLOR); // GRAYSCALE); // Read the file and convert to grayscale
     if (srcMat.empty()) // Check for invalid input
     {
         cout << "Could not open or find the image" << std::endl;
         return 1;
     }

     //create output Mat of the same size as srcMt
     Mat destMat;
     destMat.create(srcMat.rows, srcMat.cols, srcMat.type());

     // Run the parallel algorithm which should be in the .cu file
     CopyImage(&destMat, &srcMat);

     namedWindow("Source window", WINDOW_AUTOSIZE); // Create a window for display.
     imshow("Source window", srcMat); // Show our image inside it.

     namedWindow("Dest window", WINDOW_AUTOSIZE); // Create a window for display.
     imshow("Dest window", destMat); // Show our image inside it.

     waitKey(0); // Wait for a keystroke in the window

     return 0;
}


 執行結果畫面 :







沒有留言:

張貼留言