同上一篇,但是把所有 search range 平行化到每個 work-item,結論是沒有比較快…
#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
#include <time.h>
#include <opencv2/opencv.hpp>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // eliminate deprecated warning
#include <CL/cl.hpp>
using namespace cv;
using namespace cl;
using namespace std;
// Tunable parameters.
// Number of horizontal offsets evaluated per pixel; also passed to the kernels
// as the third NDRange dimension and as the getDisparity argument.
const int searchRange = 201;
// Factor applied to both input images by resize() before matching.
const float scale = 1.0;
using VecS = Vec<float, searchRange>; // one float cost per search offset for a single pixel
// Derived parameter: half of the search range (integer division).
// NOTE(review): not referenced by the host code visible here — confirm whether it is still needed.
const int halfSearchRange = searchRange / 2;
// Returns the first OpenCL platform reported by Platform::get().
// When the list comes back empty, prints a message and terminates the
// process with exit status 1.
Platform getPlatform() {
    vector<Platform> found;
    Platform::get( &found );
    if ( !found.empty() ) {
        return found.front();
    }
    printf("no platform found.\n");
    exit(1);
}
// Returns the first device of any type (CL_DEVICE_TYPE_ALL) that the given
// platform reports. When the list comes back empty, prints a message and
// terminates the process with exit status 1.
Device getDevice(Platform platform) {
    vector<Device> found;
    platform.getDevices(CL_DEVICE_TYPE_ALL, &found);
    if ( !found.empty() ) {
        return found.front();
    }
    printf("no device found\n");
    exit(1);
}
// Loads the entire contents of `fileName` (opened in binary mode, so bytes are
// not translated) into `kernelString`, replacing whatever it held before.
// If the file cannot be opened, prints a message and terminates the process
// with exit status 1.
void readKernelFile(const string &fileName, string &kernelString) {
    ifstream input( fileName, ifstream::binary );
    if ( input.is_open() ) {
        // Stream the whole file buffer into memory in one go.
        ostringstream contents;
        contents << input.rdbuf();
        input.close();
        kernelString = contents.str();
        return;
    }
    printf("open kernel file failed: %s\n", fileName.c_str());
    exit(1);
}
// Fills every element of `sadMap` with a vector whose searchRange entries are
// all FLT_MAX, so any cost written later compares as smaller.
void initSadMap( Mat_<VecS> &sadMap ) {
    const VecS worstCost = VecS::all(FLT_MAX);
    for (auto cell = sadMap.begin(); cell != sadMap.end(); ++cell) {
        *cell = worstCost;
    }
}
int main (int argc, char *argv[]) {
const char *fileNameL = argv[1];
const char *fileNameR = argv[2];
const char *fileNameKernel = argv[3];
// read input image
Mat1f imgL, imgR;
imread(fileNameL, 0).convertTo(imgL, CV_32FC1);
imread(fileNameR, 0).convertTo(imgR, CV_32FC1);
// scaling down
resize(imgL, imgL, Size(), scale, scale);
resize(imgR, imgR, Size(), scale, scale);
// get kernel source code
Program::Sources sources;
string kernelString;
readKernelFile(fileNameKernel, kernelString);
sources.push_back({kernelString.c_str(), kernelString.length()});
// get Platform
Platform platform = getPlatform();
cout << platform.getInfo<CL_PLATFORM_NAME>() << endl;
// get device
Device device = getDevice( platform );
cout << device.getInfo<CL_DEVICE_NAME>() << endl;
// get context
Context context(device);
// build kernel program
Program program(context, sources);
if( program.build({device}) != CL_SUCCESS ){
cout<< " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << endl;
exit(1);
}
// command queue
CommandQueue queue(context, device);
// host result
Mat1f disparityL = Mat1f::zeros( imgL.size() );
Mat1f disparityR = Mat1f::zeros( imgR.size() );
Mat_<VecS> sadMapL( imgL.size() ); initSadMap(sadMapL);
Mat_<VecS> sadMapR( imgR.size() ); initSadMap(sadMapR);
// create buffers on the device
Buffer devImgL(context, CL_MEM_READ_ONLY , sizeof(float)*imgL.total());
Buffer devImgR(context, CL_MEM_READ_ONLY , sizeof(float)*imgR.total());
Buffer devDispL(context, CL_MEM_WRITE_ONLY, sizeof(float)*disparityL.total());
Buffer devDispR(context, CL_MEM_WRITE_ONLY, sizeof(float)*disparityR.total());
Buffer devSadMapL(context, CL_MEM_READ_WRITE, sizeof(float)*sadMapL.total()*searchRange);
Buffer devSadMapR(context, CL_MEM_READ_WRITE, sizeof(float)*sadMapR.total()*searchRange);
// copy buffer form host to device
queue.enqueueWriteBuffer(devImgL, CL_TRUE, 0, sizeof(float)*imgL.total(), imgL.data);
queue.enqueueWriteBuffer(devImgR, CL_TRUE, 0, sizeof(float)*imgR.total(), imgR.data);
queue.enqueueWriteBuffer(devSadMapL, CL_TRUE, 0, sizeof(float)*sadMapL.total()*searchRange, sadMapL.data);
queue.enqueueWriteBuffer(devSadMapR, CL_TRUE, 0, sizeof(float)*sadMapR.total()*searchRange, sadMapR.data);
// make kernel function
auto blockMatch = make_kernel<Buffer&, Buffer&, Buffer&>(program, "blockMatch");
auto getDisparity = make_kernel<Buffer&, Buffer&, int>(program, "getDisparity");
// run kernel
EnqueueArgs kernelDim(queue, NDRange(imgL.rows, imgL.cols, searchRange));
EnqueueArgs kernelDim2(queue, NDRange(imgL.rows, imgL.cols));
clock_t t0 = clock();
blockMatch(kernelDim, devImgL, devImgR, devSadMapL);
blockMatch(kernelDim, devImgR, devImgL, devSadMapR);
getDisparity(kernelDim2, devSadMapL, devDispL, searchRange);
getDisparity(kernelDim2, devSadMapR, devDispR, searchRange);
// read buffer from device to host
queue.enqueueReadBuffer(devDispL, CL_TRUE, 0, sizeof(float)*disparityL.total(), disparityL.data);
queue.enqueueReadBuffer(devDispR, CL_TRUE, 0, sizeof(float)*disparityR.total(), disparityR.data);
queue.enqueueReadBuffer(devSadMapL, CL_TRUE, 0, sizeof(float)*sadMapL.total()*searchRange, sadMapL.data);
queue.enqueueReadBuffer(devSadMapR, CL_TRUE, 0, sizeof(float)*sadMapR.total()*searchRange, sadMapR.data);
clock_t t1 = clock();
printf("time: %f sec\n", (double)(t1-t0) / CLOCKS_PER_SEC);
normalize(disparityL, disparityL, 0, 1, NORM_MINMAX);
normalize(disparityR, disparityR, 0, 1, NORM_MINMAX);
imshow("disparityL", disparityL);
imshow("disparityR", disparityR);
imwrite("disparityL.png", disparityL*255);
imwrite("disparityR.png", disparityR*255);
waitKey();
return 0;
}
// NOTE(review): the definitions from here on use OpenCL C qualifiers
// (kernel, global) — presumably this is the separate kernel source file that
// the host program loads at runtime; confirm.
// Side length, in pixels, of the square patch compared for each candidate match.
#define WIN_SIZE 5
// Derived parameter: half the window size (integer division), i.e. how far the
// patch extends from its centre pixel in each direction.
#define HALF_WIN_SIZE (WIN_SIZE/2)
// Copies the square neighbourhood centred on (cx, cy) out of `img` into
// `patch`. `img` is indexed row-major with `cols` elements per row. The copy
// spans HALF_WIN_SIZE pixels on every side of the centre.
// No bounds checking is performed here; the caller must pass a centre whose
// whole neighbourhood lies inside the image.
void getPatch(global const float* img, float patch[WIN_SIZE][WIN_SIZE], int cx, int cy, int cols) {
    const int top  = cy - HALF_WIN_SIZE;
    const int left = cx - HALF_WIN_SIZE;
    const int span = 2 * HALF_WIN_SIZE + 1; // pixels copied per row and per column
    for (int iy = 0; iy < span; ++iy) {
        // first source pixel of this patch row
        global const float* srcRow = img + ((top + iy) * cols + left);
        for (int ix = 0; ix < span; ++ix) {
            patch[iy][ix] = srcRow[ix];
        }
    }
}
// Returns the sum of absolute differences between two WIN_SIZE x WIN_SIZE
// patches, accumulated in row-major order.
float getSad(float patchL[WIN_SIZE][WIN_SIZE], float patchR[WIN_SIZE][WIN_SIZE]){
    float total = 0;
    for (int i = 0; i < WIN_SIZE * WIN_SIZE; ++i) {
        const int row = i / WIN_SIZE;
        const int col = i % WIN_SIZE;
        total += fabs(patchL[row][col] - patchR[row][col]);
    }
    return total;
}
// Computes one SAD matching cost per work-item.
//
// Work-item layout (taken from the global ids/sizes read below):
//   dimension 0 -> patch centre row, size = image height
//   dimension 1 -> patch centre column in imgL, size = image width
//   dimension 2 -> search offset index, size = search range
//
// The patch around (cxL, cy) in imgL is compared with the patch around
// (cxL + cs - searchRange/2, cy) in imgR, and the cost is stored at
// sadMapL[(cy*cols + cxL)*searchRange + cs]. Work-items whose patches would
// not fit inside the image return without writing, leaving that entry as it was.
void kernel blockMatch(global const float* imgL, global const float* imgR, global float* sadMapL) {
    const int cy  = get_global_id(0);   // patch centre row
    const int cxL = get_global_id(1);   // patch centre column in imgL
    const int cs  = get_global_id(2);   // search offset index
    const int rows = get_global_size(0);        // image height
    const int cols = get_global_size(1);        // image width
    const int searchRange = get_global_size(2);
    const int halfSearchRange = searchRange / 2;
    const int cxR = cxL + cs - halfSearchRange; // patch centre column in imgR

    // Both patches must lie completely inside the image.
    const bool rowInside   = (cy  >= HALF_WIN_SIZE) && (cy  < rows - HALF_WIN_SIZE);
    const bool leftInside  = (cxL >= HALF_WIN_SIZE) && (cxL < cols - HALF_WIN_SIZE);
    const bool rightInside = (cxR >= HALF_WIN_SIZE) && (cxR < cols - HALF_WIN_SIZE);
    if ( !(rowInside && leftInside && rightInside) ) {
        return;
    }

    float patchL[WIN_SIZE][WIN_SIZE];
    float patchR[WIN_SIZE][WIN_SIZE];
    getPatch(imgL, patchL, cxL, cy, cols);
    getPatch(imgR, patchR, cxR, cy, cols);

    // searchRange consecutive costs are stored per pixel, pixels in row-major order
    const int rowBase   = cy * cols * searchRange;
    const int pixelBase = rowBase + cxL * searchRange;
    sadMapL[pixelBase + cs] = getSad(patchL, patchR);
}
// Selects, for one pixel per work-item, the search offset with the smallest
// stored cost and writes its distance from the centre of the search range.
//
// Work-item layout: dimension 0 -> row, dimension 1 -> column (its global
// size is used as the image width). `sadMap` holds `searchRange` consecutive
// costs per pixel, pixels in row-major order.
//
// The comparison is strict, so the first minimum wins and entries equal to
// FLT_MAX are never chosen; a pixel with no entry below FLT_MAX gets -1.
void kernel getDisparity(global const float* sadMap, global float* disparity, int searchRange) {
    const int row    = get_global_id(0);
    const int col    = get_global_id(1);
    const int width  = get_global_size(1);
    const int centre = searchRange / 2;

    // first cost belonging to this pixel
    global const float* costs = sadMap + ((row*width*searchRange) + (col*searchRange));

    float lowest = FLT_MAX;
    float chosen = -1;
    for (int k = 0; k < searchRange; ++k) {
        const float cost = costs[k];
        if ( cost < lowest ) {
            lowest = cost;
            chosen = abs(k - centre);
        }
    }
    disparity[row*width+col] = chosen;
}
