OpenCL Box Filtering – Algorithm Notes

有一個很 tricky、跟 CPU 計算不一樣的地方：如果把 main.cpp 中呼叫 kernel 時用到的 devBy 全部換成 devBx（也就是做 in-place filtering），只要 thread 數不超過顯卡 core 的負荷（也就是一個 round 就可以跑完 kernel），結果是不會變的。但如果換成大圖、需要多個 round 才能跑完 kernel 時，結果就會出錯，可以自己嘗試看看。

在一個 round 就能跑完的情況下，其實是因為所有 thread 同步進行 in-place filtering：當任一點被寫入時，其他點所需要的 source 都已經讀取完畢、不會再被用到，所以不會有 propagate 的問題。

讀寫檔案使用 OpenCV；為了減少 memory access 次數，x 和 y 方向的 filtering 分開進行。

#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // eliminate deprecated warning
#include <CL/cl.hpp>

#include <opencv2/opencv.hpp>

using namespace cl;
using namespace cv;
using namespace std;

// Returns the first OpenCL platform reported by Platform::get.
// Prints a message and terminates the process with status 1 when the list is empty.
Platform getPlatform() {
	vector<Platform> found;
	Platform::get( &found );
	if ( !found.empty() ) {
		return found.front();
	}

	// nothing to run on
	printf("no platform found.\n");
	exit(1);
}

// Returns the first device of any type (CL_DEVICE_TYPE_ALL) listed for `platform`.
// Prints a message and terminates the process with status 1 when the list is empty.
Device getDevice(Platform platform) {
	vector<Device> found;
	platform.getDevices(CL_DEVICE_TYPE_ALL, &found);
	if ( !found.empty() ) {
		return found.front();
	}

	// the platform exposes no device at all
	printf("no device found\n");
	exit(1);
}

// Reads the whole file `fileName` (opened in binary mode) into `kernelString`,
// replacing its previous contents.
// Prints a message and terminates the process with status 1 if the file cannot be opened.
void readKernelFile(const string &fileName, string &kernelString) {
	ifstream input( fileName, ifstream::binary );
	if ( input.is_open() ) {
		// drain the file's stream buffer into an in-memory stream in one shot
		stringstream contents;
		contents << input.rdbuf();
		input.close();

		kernelString = contents.str();
		return;
	}

	printf("open kernel file failed: %s\n", fileName.c_str());
	exit(1);
}

int main (int argc, char *argv[]) {
	// get kernel source code
	Program::Sources sources;
	string kernelString;
	readKernelFile(argv[1], kernelString);
	sources.push_back({kernelString.c_str(), kernelString.length()});

	// get Platform
	Platform platform = getPlatform();
	cout << platform.getInfo<CL_PLATFORM_NAME>() << endl;

	// get device
	Device device = getDevice( platform );
	cout << device.getInfo<CL_DEVICE_NAME>() << endl;

	// get context
	Context context(device);

	// build kernel program
	Program program(context, sources);
	if( program.build({device}) != CL_SUCCESS ){
		cout<< " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << endl;
		exit(1);
	}

	// make kernel function
	auto boxBlurX = make_kernel<Buffer&, Buffer&, int&>(program, "boxBlurX");
	auto boxBlurY = make_kernel<Buffer&, Buffer&, int&>(program, "boxBlurY");

	// command queue
	CommandQueue queue(context, device);

	// buffer on host
	Mat3b A = imread(argv[2]);
	Mat3b B( A.size() );
	const int nRows = A.rows;
	const int nCols = A.cols;
	const int vecSize = nRows * nCols;

	// create buffers on the device
	Buffer devA(context, CL_MEM_READ_WRITE, sizeof(uchar)*3*vecSize);
	Buffer devBx(context, CL_MEM_READ_WRITE, sizeof(uchar)*3*vecSize);
	Buffer devBy(context, CL_MEM_READ_WRITE, sizeof(uchar)*3*vecSize);

	// copy buffer form host to device
	queue.enqueueWriteBuffer(devA, CL_TRUE, 0, sizeof(uchar)*3*vecSize, A.data);

	EnqueueArgs kernelDim(queue, NDRange(nRows, nCols));

	int kSize = 3;
	int sign  = 2;
	while( true ) {
		// run kernel
		boxBlurX(kernelDim, devA , devBx, kSize);
		boxBlurY(kernelDim, devBx, devBy, kSize);
		boxBlurX(kernelDim, devBy, devBx, kSize);
		boxBlurY(kernelDim, devBx, devBy, kSize);

		// read buffer from device to host
		queue.enqueueReadBuffer(devBy, CL_TRUE, 0, sizeof(uchar)*3*vecSize, B.data);

		// show result
		printf("kSize %d\n", kSize);
		imshow("result", B);
		int key = waitKey(33);
		if ( key == 'q') break;

		kSize += sign;
		if (kSize >= 21) {
			sign = -2;
		}
		if (kSize <= 3) {
			sign = 2;
		}
	}

	return 0;
}

// Horizontal pass of a box blur over an interleaved 3-channel 8-bit image.
// One work-item per pixel: global id 0 is the row, global id 1 is the column,
// and the image width is taken from the global work size of dimension 1.
//   A     : source pixels, 3 bytes per pixel, row-major
//   B     : destination pixels, same layout as A
//   kSize : window width; the window covers columns cx-pad .. cx+pad, pad = kSize/2
// Columns that fall outside the image are clamped to the nearest edge column.
void kernel boxBlurX(global const unsigned char* A, global unsigned char* B, int kSize) {
	int cy   = get_global_id(0);
	int cx   = get_global_id(1);
	int cols = get_global_size(1);
	
	// a non-positive kSize degrades to a 1-tap copy
	int pad      = max(kSize >> 1, 0);
	// number of samples the loop below really adds up; equals kSize for odd
	// kSize, and keeps the average correct for even or zero kSize
	int taps     = 2*pad + 1;
	int3 sum     = 0;
	int offsetYY = 3 * cy * cols;
	for (int x = cx-pad; x <= cx+pad; ++x) {
		int xx = clamp(x, 0, cols-1);
		int offsetXX = 3*xx;
		int offsetP  = offsetYY + offsetXX;
		sum += (int3) (A[offsetP+0], A[offsetP+1], A[offsetP+2]);
	}
	sum /= taps;
	
	int offset = 3*(cy * cols + cx);
	B[offset+0] = clamp(sum.x, 0, 255);
	B[offset+1] = clamp(sum.y, 0, 255);
	B[offset+2] = clamp(sum.z, 0, 255);
}

// Vertical pass of a box blur over an interleaved 3-channel 8-bit image.
// One work-item per pixel: global id 0 is the row, global id 1 is the column,
// and the image height/width are taken from the global work sizes.
//   A     : source pixels, 3 bytes per pixel, row-major
//   B     : destination pixels, same layout as A
//   kSize : window height; the window covers rows cy-pad .. cy+pad, pad = kSize/2
// Rows that fall outside the image are clamped to the nearest edge row.
void kernel boxBlurY(global const unsigned char* A, global unsigned char* B, int kSize) {
	int cy   = get_global_id(0);
	int cx   = get_global_id(1);
	int rows = get_global_size(0);
	int cols = get_global_size(1);
	
	// a non-positive kSize degrades to a 1-tap copy
	int pad      = max(kSize >> 1, 0);
	// number of samples the loop below really adds up; equals kSize for odd
	// kSize, and keeps the average correct for even or zero kSize
	int taps     = 2*pad + 1;
	int3 sum     = 0;
	int offsetXX = 3*cx;
	for (int y = cy-pad; y <= cy+pad; ++y) {
		int yy = clamp(y, 0, rows-1);
		int offsetYY = 3 * yy * cols;
		int offsetP  = offsetYY + offsetXX;
		sum += (int3) (A[offsetP+0], A[offsetP+1], A[offsetP+2]);
	}
	sum /= taps;
	
	int offset = 3*(cy * cols + cx);
	B[offset+0] = clamp(sum.x, 0, 255);
	B[offset+1] = clamp(sum.y, 0, 255);
	B[offset+2] = clamp(sum.z, 0, 255);
}

發佈留言 取消回覆

發佈留言取消回覆