Breedlove: c - How to merge a row-processing filter kernel and a column-processing filter kernel into a single OpenCL kernel -

Saturday, 15 June 2013

c - How to merge a row-processing filter kernel and a column-processing filter kernel into a single OpenCL kernel -

I have an image processing filter implemented using separate row and column processing in OpenCL, but part of the processing time gets wasted in launching the multiple kernels themselves.

So, can I merge these two kernels into a single kernel with the same functionality, so that performance improves on an Intel HD 4600 graphics card? Details of the code are given below:

Assumptions: 1. Both horizontal and vertical padding are done on the host (C programming). 2. n (the filter length) is 8, the width and height are 1024 x 1024, and the filter coefficients are generated using generic filters. 3. First the row kernel and then the column kernel is launched using the API call below: ret |= clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_ws (1024x1024), NULL, 0, NULL, NULL);

//code:

/*
 * Row (horizontal) pass of a separable filter.
 *
 * Each work-item reads its 2-D global id (i = dimension 0, j = dimension 1)
 * and produces one output sample. The sample index is shifted by
 * image_offset = n*pitch + n, i.e. n rows down and n columns in, so the
 * taps at curr_pix-n .. curr_pix+n address the padded border.
 *
 * ip_img : input image (read at curr_pix-n .. curr_pix+n)
 * op_img : output image (written at curr_pix)
 * width, height : not used by this kernel
 * pitch  : number of elements between the starts of consecutive rows
 * n      : filter half-length; the loop applies 2*n+1 taps
 * w      : filter coefficients, indexed 0 .. 2*n
 */
__kernel void filter_rows(__global float *ip_img, __global float *op_img,
                          int width, int height, int pitch, int n,
                          __constant float *w)
{
    __private int i = get_global_id(0);
    __private int j = get_global_id(1);
    __private int k;
    __private float a;
    __private int image_offset = n * pitch + n;
    /* The column index i must be part of the address; without it every
     * work-item in a row would write the same pixel. */
    __private int curr_pix = j * pitch + i + image_offset;

    // apply filter
    for (k = -n, a = 0.0f; k <= n; k++) {
        a += ip_img[curr_pix + k] * w[k + n];
    }
    op_img[curr_pix] = a;
}

/*
 * Column (vertical) pass of a separable filter.
 *
 * Same indexing as filter_rows, but the taps step by one row (pitch
 * elements) instead of one column: k*pitch + curr_pix for k = -n .. n.
 *
 * ip_img : input image (read at curr_pix - n*pitch .. curr_pix + n*pitch)
 * op_img : output image (written at curr_pix)
 * width, height : not used by this kernel
 * pitch  : number of elements between the starts of consecutive rows
 * n      : filter half-length; the loop applies 2*n+1 taps
 * w      : filter coefficients, indexed 0 .. 2*n
 */
__kernel void filter_col(__global float *ip_img, __global float *op_img,
                         int width, int height, int pitch, int n,
                         __constant float *w)
{
    __private int i = get_global_id(0);
    __private int j = get_global_id(1);
    __private int k;
    __private float a;
    __private int image_offset = n * pitch + n;
    __private int curr_pix = j * pitch + i + image_offset;

    // apply filter
    for (k = -n, a = 0.0f; k <= n; k++) {
        a += ip_img[k * pitch + curr_pix] * w[k + n];
    }
    op_img[curr_pix] = a;
}

/* Horizontal padding of the image; body elided in the original post. */
void padd_hor(float *ip_img, int pad_leng)
{
    //...using simple c programming
}

/* Vertical padding of the image; body elided in the original post. */
void padd_ver(float *ip_img, int pad_leng)
{
    //...using simple c programming
}

/*
 * Reference sequence the merged kernel is expected to reproduce:
 * pad horizontally, run the row pass into temp_image, pad the intermediate
 * vertically, then run the column pass into out_image.
 *
 * NOTE(review): this is illustrative pseudo-code. It shares its name with
 * the kernel stub below, so the two definitions cannot be compiled into the
 * same program as-is, and the padding helpers take plain `float *` while
 * being handed __global pointers here - confirm how the real host code
 * performs these steps.
 */
void generic_filter(__global float *in_image, __global float *out_image,
                    __global float *temp_image, int width, int height,
                    int pitch, int n,
                    __constant float *wr, __constant float *wc)
{
    padd_hor(in_image, n);
    filter_rows(in_image, temp_image, width, height, pitch, n, wr);
    padd_ver(temp_image, n);
    filter_col(temp_image, out_image, width, height, pitch, n, wc);
}

/*
 * Single-kernel version requested by the question: should produce the same
 * result as the generic_filter sequence above. Left unimplemented in the
 * original post.
 */
__kernel void generic_filter(__global float *in_image, __global float *out_image,
                             __global float *temp_image, int width, int height,
                             int pitch, int n,
                             __constant float *wr, __constant float *wc)
{
    // ... a suggestion is needed here: implement this kernel to do the same as generic_filter
}

Your help would be appreciated to optimize this filter and get the best possible result. Please let me know the maximum gain I can expect with respect to C code that runs on an Intel CPU.

Thanks and regards, vijayky88

c optimization opencl

Breedlove

Saturday, 15 June 2013

c - How to merge a row-processing filter kernel and a column-processing filter kernel into a single OpenCL kernel -

No comments:

Post a Comment