c - How to merge the row-processing filter kernel and the column-processing filter kernel into a single OpenCL kernel -
I have an image processing filter implemented using row and column processing in OpenCL, and much of the processing time gets wasted in launching the multiple kernels themselves.
So how can I merge these 2 kernels into a single kernel with the same functionality that performs better on an Intel HD 4600 graphics card? The details of the code are given below:
Assumptions: 1. Both horizontal and vertical padding are done on the host (C programming). 2. n (the filter length) is 8, the width and height are 1024 x 1024, and the filter coefficients are generated using generic filters. 3. First the row kernel and then the column kernel is launched, each using the API call below: ret |= clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_ws (1024x1024), NULL, 0, NULL, NULL);
//code:
/*
 * Row pass: each work-item writes one output pixel, computed as the weighted
 * sum of the 2*n+1 input pixels centred on it along the same row.
 *
 * ip_img         input image; read at offsets -n..+n around the current pixel
 * op_img         output image; one element is written per work-item
 * width, height  not used by this kernel
 * pitch          element stride between consecutive rows
 * n              number of taps on each side of the centre pixel
 * w              2*n+1 coefficients; w[0] pairs with offset -n
 */
__kernel void filter_rows(__global float *ip_img, __global float *op_img,
                          int width, int height, int pitch, int n,
                          __constant float *w)
{
    __private int i = get_global_id(0);
    __private int j = get_global_id(1);
    __private int k;
    __private float a;
    /* Presumably skips the n padded rows and n padded columns that the host
     * adds around the image -- confirm against the host padding code. */
    __private int image_offset = n * pitch + n;
    __private int curr_pix = j * pitch + i + image_offset;

    // apply filter
    for (k = -n, a = 0.0f; k <= n; k++) {
        a += ip_img[curr_pix + k] * w[k + n];
    }
    op_img[curr_pix] = a;
}

/*
 * Column pass: same as filter_rows, but the 2*n+1 taps are taken along the
 * column, i.e. at a stride of `pitch` elements.
 *
 * ip_img         input image; read at row offsets -n..+n around the pixel
 * op_img         output image; one element is written per work-item
 * width, height  not used by this kernel
 * pitch          element stride between consecutive rows
 * n              number of taps on each side of the centre pixel
 * w              2*n+1 coefficients; w[0] pairs with row offset -n
 */
__kernel void filter_col(__global float *ip_img, __global float *op_img,
                         int width, int height, int pitch, int n,
                         __constant float *w)
{
    __private int i = get_global_id(0);
    __private int j = get_global_id(1);
    __private int k;
    __private float a;
    /* Presumably skips the n padded rows and n padded columns that the host
     * adds around the image -- confirm against the host padding code. */
    __private int image_offset = n * pitch + n;
    __private int curr_pix = j * pitch + i + image_offset;

    // apply filter
    for (k = -n, a = 0.0f; k <= n; k++) {
        a += ip_img[k * pitch + curr_pix] * w[k + n];
    }
    op_img[curr_pix] = a;
}

/* Horizontal padding of the image (body elided in the post). */
void padd_hor(float *ip_img, int pad_leng)
{
    //...using simple c programming
}

/* Vertical padding of the image (body elided in the post). */
void padd_ver(float *ip_img, int pad_leng)
{
    //...using simple c programming
}

/*
 * Sketch of the intended processing order: pad horizontally, filter rows
 * into temp_image, pad temp_image vertically, then filter columns into
 * out_image.
 *
 * NOTE(review): this reads as pseudo-code for the desired flow rather than
 * compilable device code -- the padding helpers are described as host-side C,
 * and this function shares its name with the kernel below, so the two cannot
 * coexist in one program as written.
 */
void generic_filter(__global float *in_image, __global float *out_image,
                    __global float *temp_image, int width, int height,
                    int pitch, int n,
                    __constant float *wr, __constant float *wc)
{
    padd_hor(in_image, n);
    filter_rows(in_image, temp_image, width, height, pitch, n, wr);
    padd_ver(temp_image, n);
    filter_col(temp_image, out_image, width, height, pitch, n, wc);
}

/*
 * Placeholder for the single kernel that should perform the row pass and the
 * column pass in one launch.
 */
__kernel void generic_filter(__global float *in_image, __global float *out_image,
                             __global float *temp_image, int width, int height,
                             int pitch, int n,
                             __constant float *wr, __constant float *wc)
{
    // ... here I need a suggestion on how to implement this kernel so that it
    // does the same as generic_filter
}
Your help would be appreciated to optimize this filter and get the best possible result. Please let me know how much maximum gain I can get with respect to C code that runs on an Intel CPU.
Thanks and regards, vijayky88
c optimization opencl
No comments:
Post a Comment