Monday 15 April 2013

c - OpenCL Kernel for String Concatenation -



c - OpenCL Kernel for String Concatenation -

I haven't found much literature or many examples of performing operations on strings on the GPU. Specifically, I have two arrays of strings, and I need to concatenate the elements of the second array to the corresponding elements of the first array. I cannot figure out how to write a kernel for this.

An illustration of the concatenation in C would be:

#include <stdio.h>

void concatenate_string(char*, char*, char*);

/*
 * Reads two whitespace-delimited words from stdin, concatenates them and
 * prints the result. Returns 0 on success, 1 if either word cannot be read.
 */
int main()
{
    char original[100], add[100];
    /* Large enough for two maximum-length inputs (99 chars each) plus the
     * terminating NUL, so the concatenation can never overflow. */
    char result[200];

    printf("enter source string\n");
    /* The field width (99) must stay one less than the array size so that
     * scanf cannot write past the end of the buffer. */
    if (scanf("%99s", original) != 1)
        return 1;

    printf("enter string concatenate\n");
    if (scanf("%99s", add) != 1)
        return 1;

    concatenate_string(original, add, result);

    printf("string after concatenation \"%s\"\n", result);

    return 0;
}

/*
 * Writes the characters of `original` followed by the characters of `add`
 * into `result` and NUL-terminates it.
 *
 * original, add: NUL-terminated input strings (not modified).
 * result:        destination buffer; the caller must provide at least
 *                strlen(original) + strlen(add) + 1 bytes, and it must not
 *                overlap either input.
 */
void concatenate_string(char *original, char *add, char *result)
{
    while (*original)
    {
        *result = *original;
        original++;
        result++;
    }
    while (*add)
    {
        *result = *add;
        add++;
        result++;
    }
    *result = '\0';
}

Below is the OpenCL host code containing the kernel. The kernel follows the same flow as the concatenate_string function above. The program executes successfully, but gives me no output.

#include <stdio.h> #include <stdlib.h> #include <string.h> #ifdef __apple__ #include <opencl/cl.h> #else #include <cl/cl.h> #endif #include <ocl_macros.h> #include <iostream> #include <string> //common defines #define vendor_name "amd" #define device_type cl_device_type_gpu #define vector_size 1024 using namespace std; //opencl kernel run every work item created. //the below const char string compiled runtime complier //when programme object created clcreateprogramwithsource //and built clbuildprogram. const char *concat_kernel = "__kernel \n" "void concat_kernel( \n" " __global uchar *d, \n" " __global uchar *e, \n" " __global uchar *f) \n" "{ \n" " //get index of work-item \n" " int index = get_global_id(0); \n" " while(d[index]) \n" " { \n" " *f[index] = *d[index]; \n" " d[index]++; \n" " f[index]++; \n" " } \n" " while(e[index]) \n" " { \n" " *f[index] = *e[index]; \n" " e[index]++; \n" " f[index]++; \n" " } \n" " *f[index] = '\0'; \n" "} \n"; int main(void) { cl_int clstatus; //keeps track of error values returned. // platform , device info cl_platform_id * platforms = null; // set platform. take @ macros used in file. // these defined in common/ocl_macros.h ocl_create_platforms( platforms ); // devices list , take type of device want run on cl_device_id *device_list = null; ocl_create_device( platforms[0], device_type, device_list); // create opencl context devices in device_list cl_context context; cl_context_properties props[3] = { cl_context_platform, (cl_context_properties)platforms[0], 0 }; // opencl context can associated multiple devices, either cpu or gpu // based on value of device_type defined above. context = clcreatecontext( null, num_devices, device_list, null, null, &clstatus); log_ocl_error(clstatus, "clcreatecontext failed..." ); // create command queue first device in device_list cl_command_queue command_queue = clcreatecommandqueue(context, device_list[0], 0, &clstatus); log_ocl_error(clstatus, "clcreatecommandqueue failed..." 
); // allocate space vectors d, e, , f string *d = (string*)malloc(sizeof(string)*vector_size); string *e = (string*)malloc(sizeof(string)*vector_size); string *f = (string*)malloc(sizeof(string)*vector_size); for(int = 0; < vector_size; i++) { d[i] = ".25_numstring"; } for(int = 0; < vector_size; i++) { e[i] = "string_2"; f[i] = "0"; } // create memory buffers on device each vector cl_mem d_clmem = clcreatebuffer(context, cl_mem_read_only, vector_size * sizeof(string), null, &clstatus); cl_mem e_clmem = clcreatebuffer(context, cl_mem_read_only, vector_size * sizeof(string), null, &clstatus); cl_mem f_clmem = clcreatebuffer(context, cl_mem_write_only, vector_size * sizeof(string), null, &clstatus); // re-create buffer d , e device. blocking write device buffer. clstatus = clenqueuewritebuffer(command_queue, d_clmem, cl_true, 0, vector_size * sizeof(string), d, 0, null, null); log_ocl_error(clstatus, "clenqueuewritebuffer failed..." ); clstatus = clenqueuewritebuffer(command_queue, e_clmem, cl_true, 0, vector_size * sizeof(string), e, 0, null, null); log_ocl_error(clstatus, "clenqueuewritebuffer failed..." ); // create programme kernel source cl_program programme = clcreateprogramwithsource(context, 1, (const char **)&concat_kernel, null, &clstatus); log_ocl_error(clstatus, "clcreateprogramwithsource failed..." ); // build programme clstatus = clbuildprogram(program, 1, device_list, null, null, null); if(clstatus != cl_success) log_ocl_compiler_error(program, device_list[0]); // create opencl kernel cl_kernel kernel = clcreatekernel(program, "concat_kernel", &clstatus); // set arguments of kernel. take @ kernel definition in concat_kernel // variable. first parameter constant , other 3 buffers. clstatus |= clsetkernelarg(kernel, 0, sizeof(cl_mem), (void *)&d_clmem); clstatus |= clsetkernelarg(kernel, 1, sizeof(cl_mem), (void *)&e_clmem); clstatus |= clsetkernelarg(kernel, 2, sizeof(cl_mem), (void *)&f_clmem); log_ocl_error(clstatus, "clsetkernelarg failed..." 
); // execute opencl kernel on list size_t global_size = vector_size; // process 1 vector element in each work item size_t local_size = 64; // process in work groups of size 64. cl_event concat_event; clstatus = clenqueuendrangekernel(command_queue, kernel, 1, null, &global_size, &local_size, 0, null, &concat_event); log_ocl_error(clstatus, "clenqueuendrangekernel failed..." ); // read memory buffer f_clmem on device host allocated buffer c // task invoked after completion of event concat_event clstatus = clenqueuereadbuffer(command_queue, f_clmem, cl_true, 0, vector_size * sizeof(string), f, 1, &concat_event, null); log_ocl_error(clstatus, "clenqueuereadbuffer failed..." ); // clean , wait comands complete. clstatus = clfinish(command_queue); // display result screen for(int = 0; < vector_size; i++) printf("%s + %s = %s\n", d[i].c_str(), e[i].c_str(), f[i].c_str()); // release opencl objects , release host buffers. clstatus = clreleasekernel(kernel); clstatus = clreleaseprogram(program); clstatus = clreleasememobject(d_clmem); clstatus = clreleasememobject(e_clmem); clstatus = clreleasememobject(f_clmem); clstatus = clreleasecommandqueue(command_queue); clstatus = clreleasecontext(context); free(d); free(e); free(f); free(platforms); free(device_list); homecoming 0; }

I don't think you will see much of a gain from offloading the concat operation to the GPU, but here is how I would do it:

/*
 * Concatenates two character arrays into one output array.
 *
 * d, e:  input characters; dsize and esize give how many characters to copy
 *        from each (no search for a NUL terminator is performed).
 * f:     output; must have room for dsize + esize + 1 characters.
 *
 * Every work-item starts at its own global id and advances by the global
 * size, so the kernel works for any number of work-items and for inputs of
 * different lengths.
 */
__kernel void concat_kernel(__global uchar *d,
                            __global uchar *e,
                            __global uchar *f,
                            const int dsize,
                            const int esize)
{
    int gid = get_global_id(0);
    int globalsize = get_global_size(0);
    int i;

    /* Copy d into the front of f. */
    for (i = gid; i < dsize; i += globalsize) {
        f[i] = d[i];
    }

    /* Copy e directly after the dsize characters of d. */
    for (i = gid; i < esize; i += globalsize) {
        f[i + dsize] = e[i];
    }

    if (gid == globalsize - 1) {
        /* Use the last work-item here because it is the one that is idle
         * when (dsize + esize) % globalsize != 0.
         * The terminator goes one past the last copied character; writing it
         * at dsize + esize - 1 would overwrite the final character of e and
         * race with the work-item that copies that character. */
        f[dsize + esize] = '\0';
    }
}

You need to pass in the sizes of the strings you want to concatenate, instead of searching for the null value. This kernel will work with any number of work items, and with different-sized d and e inputs. As usual, f needs to be big enough to hold dsize+esize+1 chars.

Each work item will copy about (dsize+esize)/globalsize chars to the output.

room improvement:

Try different global work sizes to find the optimal value for your device and input size. The global memory access should already be pretty good, but if you want to try a single work group and use local memory, it may help, although you will still be bound by the global read speed.

c string algorithm opencl gpgpu

No comments:

Post a Comment