有一個很 tricky、跟 CPU 計算不一樣的地方:如果把 main.cpp 中呼叫 kernel 時用到的 devBy 全部換成 devBx,只要 thread 數不超過顯卡 core 的負荷(也就是一個 round 就可以跑完 kernel),結果是不會變的。可以用大圖試試看:當需要多個 round 才能跑完 kernel 時,結果就會出錯。
這其實是因為所有 thread 同步進行 in-place filtering 的寫入:當寫入任一點時,其他點的 source 已經不需要再用到,所以不會有 propagate 的問題。
讀寫檔案使用 OpenCV。為了減少 memory access 次數,x 和 y 方向的 filtering 分開進行。
#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // eliminate deprecated warning
#include <CL/cl.hpp>
#include <opencv2/opencv.hpp>
using namespace cl;
using namespace cv;
using namespace std;
/**
 * Selects the OpenCL platform used by the program.
 *
 * Asks the OpenCL runtime for the list of platforms and hands back the
 * first entry. When the list is empty, a message is printed and the
 * process terminates with exit status 1.
 *
 * @return the first platform in the list filled in by Platform::get.
 */
Platform getPlatform() {
    vector<Platform> available;
    Platform::get( &available );
    if ( !available.empty() ) {
        return available.front();
    }
    // Nothing to run on: report and bail out.
    printf("no platform found.\n");
    exit(1);
}
/**
 * Selects the OpenCL device used by the program.
 *
 * Lists every device type (CL_DEVICE_TYPE_ALL) on the given platform and
 * hands back the first entry. When the list is empty, a message is printed
 * and the process terminates with exit status 1.
 *
 * @param platform platform whose devices are enumerated.
 * @return the first device in the list filled in by getDevices.
 */
Device getDevice(Platform platform) {
    vector<Device> available;
    platform.getDevices(CL_DEVICE_TYPE_ALL, &available);
    if ( !available.empty() ) {
        return available.front();
    }
    // Nothing to run on: report and bail out.
    printf("no device found\n");
    exit(1);
}
/**
 * Loads an entire file into a string.
 *
 * The file is opened in binary mode, so its bytes are copied verbatim.
 * If the file cannot be opened, a message naming it is printed and the
 * process terminates with exit status 1.
 *
 * @param fileName     path of the file to read.
 * @param kernelString receives the file contents; any previous value is
 *                     replaced.
 */
void readKernelFile(const string &fileName, string &kernelString) {
    ifstream source( fileName, ifstream::binary );
    if ( source.is_open() ) {
        // Drain the whole stream buffer in one go.
        ostringstream contents;
        contents << source.rdbuf();
        source.close();
        kernelString = contents.str();
        return;
    }
    printf("open kernel file failed: %s\n", fileName.c_str());
    exit(1);
}
/**
 * Entry point: animated separable box blur computed on an OpenCL device.
 *
 * Usage: <program> <kernel source file> <image file>
 *
 * Builds the OpenCL program from the kernel source named by argv[1],
 * uploads the image named by argv[2], then loops: each frame runs two
 * X-then-Y box-blur passes, reads the result back, and shows it in a window
 * titled "result". The kernel size sweeps 3 -> 21 -> 3 in steps of 2.
 * Pressing 'q' ends the loop.
 *
 * @return 0 once the user quits. The process exits with status 1 on a
 *         usage error, an unreadable kernel or image file, a missing
 *         platform/device, or a kernel build failure.
 */
int main (int argc, char *argv[]) {
    // Both positional arguments are dereferenced below; reject short
    // command lines instead of reading past the end of argv.
    if ( argc < 3 ) {
        printf("usage: %s <kernel file> <image file>\n", argc > 0 ? argv[0] : "program");
        exit(1);
    }

    // get kernel source code
    Program::Sources sources;
    string kernelString;
    readKernelFile(argv[1], kernelString);
    sources.push_back({kernelString.c_str(), kernelString.length()});

    // get Platform
    Platform platform = getPlatform();
    cout << platform.getInfo<CL_PLATFORM_NAME>() << endl;

    // get device
    Device device = getDevice( platform );
    cout << device.getInfo<CL_DEVICE_NAME>() << endl;

    // get context
    Context context(device);

    // build kernel program
    Program program(context, sources);
    if( program.build({device}) != CL_SUCCESS ){
        cout<< " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << endl;
        exit(1);
    }

    // make kernel function
    auto boxBlurX = make_kernel<Buffer&, Buffer&, int&>(program, "boxBlurX");
    auto boxBlurY = make_kernel<Buffer&, Buffer&, int&>(program, "boxBlurY");

    // command queue
    CommandQueue queue(context, device);

    // buffer on host
    Mat3b A = imread(argv[2]);
    // imread signals failure with an empty matrix; without this check the
    // device buffers below would be created with a size of zero bytes.
    if ( A.empty() ) {
        printf("read image failed: %s\n", argv[2]);
        exit(1);
    }
    Mat3b B( A.size() );
    const int nRows = A.rows;
    const int nCols = A.cols;
    // Widen before multiplying so the pixel count is not computed in int.
    const size_t vecSize = static_cast<size_t>(nRows) * static_cast<size_t>(nCols);
    // Three 8-bit channels per pixel.
    const size_t bufferBytes = sizeof(uchar) * 3 * vecSize;

    // create buffers on the device
    Buffer devA(context, CL_MEM_READ_WRITE, bufferBytes);
    Buffer devBx(context, CL_MEM_READ_WRITE, bufferBytes);
    Buffer devBy(context, CL_MEM_READ_WRITE, bufferBytes);

    // copy buffer from host to device (blocking write)
    queue.enqueueWriteBuffer(devA, CL_TRUE, 0, bufferBytes, A.data);

    // One work-item per pixel: dimension 0 spans rows, dimension 1 columns.
    EnqueueArgs kernelDim(queue, NDRange(nRows, nCols));

    int kSize = 3;
    int sign = 2;
    while( true ) {
        // run kernel: two X/Y passes, alternating between devBx and devBy
        // so no pass reads from the buffer it writes to
        boxBlurX(kernelDim, devA , devBx, kSize);
        boxBlurY(kernelDim, devBx, devBy, kSize);
        boxBlurX(kernelDim, devBy, devBx, kSize);
        boxBlurY(kernelDim, devBx, devBy, kSize);

        // read buffer from device to host (blocking read)
        queue.enqueueReadBuffer(devBy, CL_TRUE, 0, bufferBytes, B.data);

        // show result
        printf("kSize %d\n", kSize);
        imshow("result", B);
        int key = waitKey(33);
        if ( key == 'q') break;

        // sweep the kernel size back and forth between 3 and 21
        kSize += sign;
        if (kSize >= 21) {
            sign = -2;
        }
        if (kSize <= 3) {
            sign = 2;
        }
    }
    return 0;
}
/*
 * Horizontal box blur over an interleaved 3-channel, 8-bit image.
 *
 * One work-item produces one output pixel: global id 0 is used as the row
 * (cy) and global id 1 as the column (cx). The column count is taken from
 * global size 1, so the kernel indexes the buffers as if the NDRange were
 * exactly rows x cols.
 *
 * A      source pixels, 3 bytes per pixel, row after row.
 * B      destination pixels, same layout as A. Each work-item reads its
 *        neighbours from A and writes only its own pixel of B.
 * kSize  requested window width. The window is centred on the pixel and
 *        reaches kSize >> 1 samples to each side; columns outside the image
 *        are clamped to the nearest edge column.
 */
void kernel boxBlurX(global const unsigned char* A, global unsigned char* B, int kSize) {
    const int cy = get_global_id(0);
    const int cx = get_global_id(1);
    const int cols = get_global_size(1);
    const int pad = kSize >> 1;
    // The loop below always visits 2*pad+1 samples. Dividing by that count
    // rather than by kSize keeps the mean correct for an even kSize and
    // avoids a division by zero when kSize is 0.
    const int taps = 2 * pad + 1;
    const int rowBase = 3 * cy * cols;
    int3 sum = (int3)(0);
    for (int x = cx - pad; x <= cx + pad; ++x) {
        const int p = rowBase + 3 * clamp(x, 0, cols - 1);
        sum += (int3)(A[p + 0], A[p + 1], A[p + 2]);
    }
    sum /= taps;
    const int offset = rowBase + 3 * cx;
    B[offset + 0] = clamp(sum.x, 0, 255);
    B[offset + 1] = clamp(sum.y, 0, 255);
    B[offset + 2] = clamp(sum.z, 0, 255);
}
/*
 * Vertical box blur over an interleaved 3-channel, 8-bit image.
 *
 * One work-item produces one output pixel: global id 0 is used as the row
 * (cy) and global id 1 as the column (cx). Row and column counts are taken
 * from global sizes 0 and 1, so the kernel indexes the buffers as if the
 * NDRange were exactly rows x cols.
 *
 * A      source pixels, 3 bytes per pixel, row after row.
 * B      destination pixels, same layout as A. Each work-item reads its
 *        neighbours from A and writes only its own pixel of B.
 * kSize  requested window height. The window is centred on the pixel and
 *        reaches kSize >> 1 samples up and down; rows outside the image are
 *        clamped to the nearest edge row.
 */
void kernel boxBlurY(global const unsigned char* A, global unsigned char* B, int kSize) {
    const int cy = get_global_id(0);
    const int cx = get_global_id(1);
    const int rows = get_global_size(0);
    const int cols = get_global_size(1);
    const int pad = kSize >> 1;
    // The loop below always visits 2*pad+1 samples. Dividing by that count
    // rather than by kSize keeps the mean correct for an even kSize and
    // avoids a division by zero when kSize is 0.
    const int taps = 2 * pad + 1;
    const int colBase = 3 * cx;
    int3 sum = (int3)(0);
    for (int y = cy - pad; y <= cy + pad; ++y) {
        const int p = 3 * clamp(y, 0, rows - 1) * cols + colBase;
        sum += (int3)(A[p + 0], A[p + 1], A[p + 2]);
    }
    sum /= taps;
    const int offset = 3 * (cy * cols + cx);
    B[offset + 0] = clamp(sum.x, 0, 255);
    B[offset + 1] = clamp(sum.y, 0, 255);
    B[offset + 2] = clamp(sum.z, 0, 255);
}
