Stereo Matching using OpenCL (2)

延續上一篇的做法,但這次把所有 search range 也平行化到每個 work-item;結論是並沒有比較快…

#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
#include <time.h>

#include <opencv2/opencv.hpp>

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // eliminate deprecated warning
#include <CL/cl.hpp>

using namespace cv;
using namespace cl;
using namespace std;

// User-tunable parameters.
// searchRange: number of candidate horizontal offsets evaluated per pixel. It is
// used as the third NDRange dimension of the blockMatch launch, as the channel
// count of VecS, and is passed to the getDisparity kernel.
const int searchRange = 201;
// scale: resize factor applied to both input images (as fx and fy) before matching.
const float scale     = 1.0;

// One float per search offset for a single pixel (element type of the SAD maps).
using VecS = Vec<float, searchRange>; // vector contains all search range

// Derived parameter: searchRange halved with integer division.
const int halfSearchRange = searchRange / 2;

// Returns the first OpenCL platform reported by Platform::get().
// Prints a message and terminates the process with status 1 when the list is empty.
Platform getPlatform() {
	vector<Platform> available;
	Platform::get( &available );
	if ( !available.empty() ) {
		return available.front();
	}
	printf("no platform found.\n");
	exit(1);
}

// Returns the first device of any type (CL_DEVICE_TYPE_ALL) exposed by `platform`.
// Prints a message and terminates the process with status 1 when there is none.
Device getDevice(Platform platform) {
	vector<Device> available;
	platform.getDevices(CL_DEVICE_TYPE_ALL, &available);
	if ( !available.empty() ) {
		return available.front();
	}
	printf("no device found\n");
	exit(1);
}

// Reads the entire contents of `fileName` (opened in binary mode, so bytes are
// copied unmodified) into `kernelString`, replacing its previous value.
// Prints a message and terminates the process with status 1 if the file
// cannot be opened.
void readKernelFile(const string &fileName, string &kernelString) {
	ifstream source( fileName, ifstream::binary );
	if ( source.is_open() ) {
		ostringstream contents;
		contents << source.rdbuf();
		source.close();
		kernelString = contents.str();
		return;
	}
	printf("open kernel file failed: %s\n", fileName.c_str());
	exit(1);
}

// Sets every channel of every element of `sadMap` to FLT_MAX, i.e. marks each
// (pixel, search offset) cost as "not computed yet".
void initSadMap( Mat_<VecS> &sadMap ) {
	const VecS unset = VecS::all(FLT_MAX);
	for (VecS &cell : sadMap) {
		cell = unset;
	}
}

int main (int argc, char *argv[]) {
	const char *fileNameL      = argv[1];
	const char *fileNameR      = argv[2];
	const char *fileNameKernel = argv[3];

	// read input image
	Mat1f imgL, imgR;
	imread(fileNameL, 0).convertTo(imgL, CV_32FC1);
	imread(fileNameR, 0).convertTo(imgR, CV_32FC1);

	// scaling down
	resize(imgL, imgL, Size(), scale, scale);
	resize(imgR, imgR, Size(), scale, scale);

	// get kernel source code
	Program::Sources sources;
	string kernelString;
	readKernelFile(fileNameKernel, kernelString);
	sources.push_back({kernelString.c_str(), kernelString.length()});

	// get Platform
	Platform platform = getPlatform();
	cout << platform.getInfo<CL_PLATFORM_NAME>() << endl;

	// get device
	Device device = getDevice( platform );
	cout << device.getInfo<CL_DEVICE_NAME>() << endl;

	// get context
	Context context(device);

	// build kernel program
	Program program(context, sources);
	if( program.build({device}) != CL_SUCCESS ){
		cout<< " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << endl;
		exit(1);
	}

	// command queue
	CommandQueue queue(context, device);

	// host result
	Mat1f disparityL = Mat1f::zeros( imgL.size() );
	Mat1f disparityR = Mat1f::zeros( imgR.size() );
	Mat_<VecS> sadMapL( imgL.size() );  initSadMap(sadMapL);
	Mat_<VecS> sadMapR( imgR.size() );  initSadMap(sadMapR);

	// create buffers on the device
	Buffer devImgL(context, CL_MEM_READ_ONLY , sizeof(float)*imgL.total());
	Buffer devImgR(context, CL_MEM_READ_ONLY , sizeof(float)*imgR.total());
	Buffer devDispL(context, CL_MEM_WRITE_ONLY, sizeof(float)*disparityL.total());
	Buffer devDispR(context, CL_MEM_WRITE_ONLY, sizeof(float)*disparityR.total());
	Buffer devSadMapL(context, CL_MEM_READ_WRITE, sizeof(float)*sadMapL.total()*searchRange);
	Buffer devSadMapR(context, CL_MEM_READ_WRITE, sizeof(float)*sadMapR.total()*searchRange);

	// copy buffer form host to device
	queue.enqueueWriteBuffer(devImgL, CL_TRUE, 0, sizeof(float)*imgL.total(), imgL.data);
	queue.enqueueWriteBuffer(devImgR, CL_TRUE, 0, sizeof(float)*imgR.total(), imgR.data);
	queue.enqueueWriteBuffer(devSadMapL, CL_TRUE, 0, sizeof(float)*sadMapL.total()*searchRange, sadMapL.data);
	queue.enqueueWriteBuffer(devSadMapR, CL_TRUE, 0, sizeof(float)*sadMapR.total()*searchRange, sadMapR.data);

	// make kernel function
	auto blockMatch   = make_kernel<Buffer&, Buffer&, Buffer&>(program, "blockMatch");
	auto getDisparity = make_kernel<Buffer&, Buffer&, int>(program, "getDisparity");

	// run kernel
	EnqueueArgs kernelDim(queue, NDRange(imgL.rows, imgL.cols, searchRange));
	EnqueueArgs kernelDim2(queue, NDRange(imgL.rows, imgL.cols));
	clock_t t0 = clock();
	blockMatch(kernelDim, devImgL, devImgR, devSadMapL);
	blockMatch(kernelDim, devImgR, devImgL, devSadMapR);
	getDisparity(kernelDim2, devSadMapL, devDispL, searchRange);
	getDisparity(kernelDim2, devSadMapR, devDispR, searchRange);

	// read buffer from device to host
	queue.enqueueReadBuffer(devDispL, CL_TRUE, 0, sizeof(float)*disparityL.total(), disparityL.data);
	queue.enqueueReadBuffer(devDispR, CL_TRUE, 0, sizeof(float)*disparityR.total(), disparityR.data);
	queue.enqueueReadBuffer(devSadMapL, CL_TRUE, 0, sizeof(float)*sadMapL.total()*searchRange, sadMapL.data);
	queue.enqueueReadBuffer(devSadMapR, CL_TRUE, 0, sizeof(float)*sadMapR.total()*searchRange, sadMapR.data);

	clock_t t1 = clock();
	printf("time: %f sec\n", (double)(t1-t0) / CLOCKS_PER_SEC);

	normalize(disparityL, disparityL, 0, 1, NORM_MINMAX);
	normalize(disparityR, disparityR, 0, 1, NORM_MINMAX);
	imshow("disparityL", disparityL);
	imshow("disparityR", disparityR);
	imwrite("disparityL.png", disparityL*255);
	imwrite("disparityR.png", disparityR*255);
	waitKey();

	return 0;
}
// OpenCL C kernel source (uses the global/kernel qualifiers and get_global_id);
// presumably this is the file passed to the host program as its third argument.
// WIN_SIZE: side length, in pixels, of the square matching window.
// NOTE: getPatch copies 2*HALF_WIN_SIZE+1 rows and columns into a
// WIN_SIZE x WIN_SIZE array, so WIN_SIZE needs to be odd.
#define WIN_SIZE           5

// Derived parameter: window radius (integer division).
#define HALF_WIN_SIZE     (WIN_SIZE/2)

// Copies the window of `img` centred on (cx, cy) into `patch`.
//   img   : row-major image data
//   patch : destination; entry [0][0] receives the window's top-left pixel
//   cx,cy : window centre (column, row)
//   cols  : number of elements per image row
// The window spans HALF_WIN_SIZE pixels on every side of the centre. No bounds
// checking is done here; the caller must keep the window inside the image.
void getPatch(global const float* img, float patch[WIN_SIZE][WIN_SIZE], int cx, int cy, int cols) {
	for (int dy = -HALF_WIN_SIZE; dy <= HALF_WIN_SIZE; ++dy) {
		const int rowStart = (cy + dy) * cols;
		for (int dx = -HALF_WIN_SIZE; dx <= HALF_WIN_SIZE; ++dx) {
			patch[dy + HALF_WIN_SIZE][dx + HALF_WIN_SIZE] = img[rowStart + cx + dx];
		}
	}
}

// Returns the sum of absolute differences (SAD) between two
// WIN_SIZE x WIN_SIZE patches, accumulated row by row.
float getSad(float patchL[WIN_SIZE][WIN_SIZE], float patchR[WIN_SIZE][WIN_SIZE]){
	float total = 0;
	for (int row = 0; row < WIN_SIZE; ++row) {
		for (int col = 0; col < WIN_SIZE; ++col) {
			const float diff = patchL[row][col] - patchR[row][col];
			total += fabs(diff);
		}
	}
	return total;
}

// Computes one SAD cost per work-item.
// NDRange layout: dim 0 = image row, dim 1 = image column, dim 2 = search-offset
// index; the global sizes double as image height, image width and search range.
//   imgL    : reference image (row-major)
//   imgR    : image searched along the same row
//   sadMapL : cost volume indexed as [row][column][offset index]
// Work-items whose window would leave the image in either view return without
// writing, so the corresponding entry keeps whatever value it already held.
void kernel blockMatch(global const float* imgL, global const float* imgR, global float* sadMapL) {
	const int row        = get_global_id(0);
	const int colL       = get_global_id(1);
	const int offsetIdx  = get_global_id(2);
	const int height     = get_global_size(0);
	const int width      = get_global_size(1);
	const int numOffsets = get_global_size(2);

	// Offset indices 0..numOffsets-1 map to signed shifts centred on zero.
	const int colR = colL + offsetIdx - numOffsets / 2;

	// The full window must fit inside the image for both patch centres.
	const bool rowInside  = row  >= HALF_WIN_SIZE && row  < height - HALF_WIN_SIZE;
	const bool colLInside = colL >= HALF_WIN_SIZE && colL < width  - HALF_WIN_SIZE;
	const bool colRInside = colR >= HALF_WIN_SIZE && colR < width  - HALF_WIN_SIZE;
	if ( !(rowInside && colLInside && colRInside) ) {
		return;
	}

	// Gather the two windows, then compare them.
	float windowL[WIN_SIZE][WIN_SIZE];
	float windowR[WIN_SIZE][WIN_SIZE];
	getPatch(imgL, windowL, colL, row, width);
	getPatch(imgR, windowR, colR, row, width);

	sadMapL[(row*width*numOffsets) + (colL*numOffsets) + offsetIdx] = getSad(windowL, windowR);
}

// Reduces the cost volume to a disparity map, one pixel per work-item.
// NDRange layout: dim 0 = image row, dim 1 = image column; global size 1 is
// used as the image width.
//   sadMap      : cost volume indexed as [row][column][offset index]
//   disparity   : output, one float per pixel (row-major)
//   searchRange : number of offsets stored per pixel
// The output is |best offset index - searchRange/2| for the first offset with
// the smallest cost, or -1 when no cost is below FLT_MAX.
void kernel getDisparity(global const float* sadMap, global float* disparity, int searchRange) {
	const int row   = get_global_id(0);
	const int col   = get_global_id(1);
	const int width = get_global_size(1);

	// Costs of this pixel are stored contiguously.
	global const float* costs = sadMap + ((row*width*searchRange) + (col*searchRange));

	int   bestIdx  = -1;
	float bestCost = FLT_MAX;
	for (int k = 0; k < searchRange; ++k) {
		// Strict comparison keeps the earliest index on ties.
		if ( costs[k] < bestCost ) {
			bestCost = costs[k];
			bestIdx  = k;
		}
	}

	float result = -1;
	if ( bestIdx >= 0 ) {
		result = abs(bestIdx - searchRange / 2);
	}
	disparity[row*width+col] = result;
}

 

發佈留言

發佈留言必須填寫的電子郵件地址不會公開。 必填欄位標示為 *