Breedlove: c - OpenCL Kernel for String Concatenation -

c - OpenCL Kernel for String Concatenation -

I haven't found much literature or many examples of performing operations on strings on the GPU. Specifically, I have 2 arrays of strings, and I need to concatenate the elements of the 2nd array to the corresponding elements of the 1st array. I cannot figure out how to write a kernel for this.

An illustration of the concatenation in C would be:

#include <stdio.h>

void concatenate_string(char*, char*, char*);

/*
 * Demo driver: reads two whitespace-delimited words from stdin, joins them
 * with concatenate_string() and prints the result.
 * Returns 0 on success, 1 if either word could not be read.
 */
int main()
{
   /* Each input holds at most 99 characters plus the terminator, so the
      result buffer is sized for the worst case: 99 + 99 + 1 bytes. */
   char original[100], add[100], result[sizeof original + sizeof add - 1];

   printf("enter source string\n");
   /* Width-limited conversion: a longer word cannot overrun the buffer. */
   if (scanf("%99s", original) != 1)
   {
      fprintf(stderr, "failed to read source string\n");
      return 1;
   }

   printf("enter string concatenate\n");
   if (scanf("%99s", add) != 1)
   {
      fprintf(stderr, "failed to read string to concatenate\n");
      return 1;
   }

   concatenate_string(original, add, result);

   printf("string after concatenation \"%s\"\n", result);

   return 0;
}

/*
 * Writes the characters of `original` followed by the characters of `add`
 * into `result` and terminates it with '\0'.
 *
 * original, add: NUL-terminated input strings (not modified).
 * result:        output buffer; the caller must provide at least
 *                strlen(original) + strlen(add) + 1 bytes, because no
 *                bounds checking is done here.
 */
void concatenate_string(char *original, char *add, char *result)
{
    /* Copy the first string up to, but not including, its terminator. */
    while(*original)
    {
       *result = *original;
       original++;
       result++;
    }
    /* Append the second string right after it. */
    while(*add)
    {
       *result = *add;
       add++;
       result++;
    }
    *result = '\0';
}

Below is the OpenCL host code containing the kernel. The kernel follows the same flow as the concatenate_string function above. The program executes successfully, but gives me no output.

#include <stdio.h> #include <stdlib.h> #include <string.h> #ifdef __apple__ #include <opencl/cl.h> #else #include <cl/cl.h> #endif #include <ocl_macros.h> #include <iostream> #include <string> //common defines  #define vendor_name "amd" #define device_type cl_device_type_gpu #define vector_size 1024  using namespace std;  //opencl kernel run every work item created. //the below const char string compiled runtime complier //when  programme object created clcreateprogramwithsource  //and built clbuildprogram. const char *concat_kernel = "__kernel                                       \n" "void concat_kernel(                            \n" "                  __global uchar *d,           \n" "                  __global uchar *e,           \n" "                  __global uchar *f)           \n" "{                                              \n" "    //get index of work-item           \n" "    int index = get_global_id(0);              \n" "    while(d[index])                            \n" "    {                                          \n" "        *f[index] = *d[index];                 \n" "        d[index]++;                            \n" "        f[index]++;                            \n" "    }                                          \n" "    while(e[index])                            \n" "    {                                          \n" "        *f[index] = *e[index];                 \n" "        e[index]++;                            \n" "        f[index]++;                            \n" "    }                                          \n" "    *f[index] = '\0';                          \n" "}                                              \n";  int main(void) {      cl_int clstatus; //keeps track of error values returned.       // platform , device  info     cl_platform_id * platforms = null;      // set platform. take @ macros used in file.      
// these defined in common/ocl_macros.h     ocl_create_platforms( platforms );      // devices list ,  take type of device want run on     cl_device_id *device_list = null;     ocl_create_device( platforms[0], device_type, device_list);      // create opencl context devices in device_list     cl_context context;     cl_context_properties props[3] =     {         cl_context_platform,         (cl_context_properties)platforms[0],         0     };     // opencl context can associated multiple devices, either cpu or gpu     // based on value of device_type defined above.     context = clcreatecontext( null, num_devices, device_list, null, null, &clstatus);     log_ocl_error(clstatus, "clcreatecontext failed..." );      // create command queue first device in device_list     cl_command_queue command_queue = clcreatecommandqueue(context, device_list[0], 0, &clstatus);     log_ocl_error(clstatus, "clcreatecommandqueue failed..." );      // allocate space vectors d, e, , f      string *d = (string*)malloc(sizeof(string)*vector_size);     string *e = (string*)malloc(sizeof(string)*vector_size);     string *f = (string*)malloc(sizeof(string)*vector_size);     for(int = 0; < vector_size; i++)     {         d[i] = ".25_numstring";     }     for(int = 0; < vector_size; i++)     {         e[i] = "string_2";         f[i] = "0";     }     // create memory buffers on device each vector     cl_mem d_clmem = clcreatebuffer(context, cl_mem_read_only,             vector_size * sizeof(string), null, &clstatus);     cl_mem e_clmem = clcreatebuffer(context, cl_mem_read_only,             vector_size * sizeof(string), null, &clstatus);     cl_mem f_clmem = clcreatebuffer(context, cl_mem_write_only,             vector_size * sizeof(string), null, &clstatus);      //  re-create buffer d , e device. blocking write device buffer.     
clstatus = clenqueuewritebuffer(command_queue, d_clmem, cl_true, 0,             vector_size * sizeof(string), d, 0, null, null);     log_ocl_error(clstatus, "clenqueuewritebuffer failed..." );     clstatus = clenqueuewritebuffer(command_queue, e_clmem, cl_true, 0,             vector_size * sizeof(string), e, 0, null, null);     log_ocl_error(clstatus, "clenqueuewritebuffer failed..." );      // create  programme kernel source     cl_program  programme = clcreateprogramwithsource(context, 1,             (const char **)&concat_kernel, null, &clstatus);     log_ocl_error(clstatus, "clcreateprogramwithsource failed..." );      // build  programme     clstatus = clbuildprogram(program, 1, device_list, null, null, null);     if(clstatus != cl_success)         log_ocl_compiler_error(program, device_list[0]);      // create opencl kernel     cl_kernel kernel = clcreatekernel(program, "concat_kernel", &clstatus);      // set arguments of kernel. take @ kernel  definition in concat_kernel      // variable. first parameter constant , other 3 buffers.     clstatus |= clsetkernelarg(kernel, 0, sizeof(cl_mem), (void *)&d_clmem);     clstatus |= clsetkernelarg(kernel, 1, sizeof(cl_mem), (void *)&e_clmem);     clstatus |= clsetkernelarg(kernel, 2, sizeof(cl_mem), (void *)&f_clmem);     log_ocl_error(clstatus, "clsetkernelarg failed..." );      // execute opencl kernel on list     size_t global_size = vector_size; // process 1 vector element in each work item     size_t local_size = 64;           // process in work groups of size 64.     cl_event concat_event;     clstatus = clenqueuendrangekernel(command_queue, kernel, 1, null,             &global_size, &local_size, 0, null, &concat_event);     log_ocl_error(clstatus, "clenqueuendrangekernel failed..." 
);      // read memory buffer f_clmem on device host allocated buffer c     // task invoked after completion of event concat_event     clstatus = clenqueuereadbuffer(command_queue, f_clmem, cl_true, 0,             vector_size * sizeof(string), f, 1, &concat_event, null);     log_ocl_error(clstatus, "clenqueuereadbuffer failed..." );      // clean , wait comands complete.     clstatus = clfinish(command_queue);      // display result screen     for(int = 0; < vector_size; i++)         printf("%s + %s = %s\n", d[i].c_str(), e[i].c_str(), f[i].c_str());      // release opencl objects , release host buffers.     clstatus = clreleasekernel(kernel);     clstatus = clreleaseprogram(program);     clstatus = clreleasememobject(d_clmem);     clstatus = clreleasememobject(e_clmem);     clstatus = clreleasememobject(f_clmem);     clstatus = clreleasecommandqueue(command_queue);     clstatus = clreleasecontext(context);     free(d);     free(e);     free(f);     free(platforms);     free(device_list);       homecoming 0; }

I don't think you will see much of a gain from offloading the concat operation to the GPU, but here is how I would do it:

/*
 * concat_kernel - copy d followed by e into f and NUL-terminate the result.
 *
 * d, e   : input character buffers.
 * f      : output buffer; must hold at least dsize + esize + 1 bytes.
 * dsize  : number of characters to copy from d.
 * esize  : number of characters to copy from e.
 *          (assumes neither count includes a terminator - confirm with the
 *          host code that sets these arguments)
 *
 * Works with any number of work-items: each one starts at its global id and
 * advances by the global size, so the copy is shared between all of them.
 */
__kernel void concat_kernel(__global uchar *d,
                            __global uchar *e,
                            __global uchar *f,
                            const int dsize,
                            const int esize)
{
    int gid = get_global_id(0);
    int globalsize = get_global_size(0);
    int i;

    /* First string goes to f[0 .. dsize-1]. */
    for (i = gid; i < dsize; i += globalsize) {
        f[i] = d[i];
    }

    /* Second string follows at f[dsize .. dsize+esize-1]. */
    for (i = gid; i < esize; i += globalsize) {
        f[i + dsize] = e[i];
    }

    if (gid == globalsize - 1) {
        /* Exactly one work-item writes the terminator. It goes one past the
           last copied character; index dsize + esize - 1 would overwrite the
           final character of e. */
        f[dsize + esize] = '\0';
    }
}

You need to pass in the sizes of the strings you want to concatenate, instead of searching for the null value. This kernel will work with any number of work items, and with different-sized d and e inputs. As usual, f needs to be big enough to hold dsize+esize+1 chars.

Each work item will copy about (dsize+esize)/globalsize chars to the output.

Room for improvement:

Try different global work sizes to find the optimal value for your device and input size. The global memory access should be pretty good, but if you want to try a single work group and use local memory, it might help, although you will still be bound by the global read speed.

c string algorithm opencl gpgpu

Breedlove

Monday, 15 April 2013

c - OpenCL Kernel for String Concatenation -

No comments:

Post a Comment