Error with 'cuda-memcheck' in cuda 8.0 -


it strange when not add cuda-memcheck before ./main, program runs without warning or error message, however, when add it, have error message following.

========= invalid __global__ write of size 8 =========     @ 0x00000120 in initcurand(curandstatexorwow*, unsigned long) =========     thread (9,0,0) in block (3,0,0) =========     address 0x5005413b0 out of bounds =========     saved host backtrace driver entry point @ kernel launch time =========     host frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (culaunchkernel + 0x2c5) [0x204115] =========     host frame:./main [0x18e11] =========     host frame:./main [0x369b3] =========     host frame:./main [0x3403] =========     host frame:./main [0x308c] =========     host frame:./main [0x30b7] =========     host frame:./main [0x2ebb] =========     host frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830] 

here functions, brief introduction on code, try generate random numbers , save them device variable weights, use vector sample discrete numbers.

#include<iostream> #include<curand.h> #include<curand_kernel.h> #include<time.h>  using namespace std;  #define num 100   __device__ float weights[num];  // function define seed __global__ void initcurand(curandstate *state, unsigned long seed){     int idx = threadidx.x + blockidx.x * blockdim.x;     curand_init(seed, idx, 0, &state[idx]); }   __device__ void sampling(float *weight, float max_weight, int *index, curandstate *state){     int j;     float u;         do{         j = (int)(curand_uniform(state) * (num + 0.999999));          u = curand_uniform(state); //sample uniform distribution;     }while( u > weight[j]/max_weight);     *index  = j; }  __global__ void test(int *dev_sample, curandstate *state){     int idx     = threadidx.x + blockidx.x * blockdim.x;\     // generate random numbers uniform distribution , save them weights     weights[idx]    = curand_uniform(&state[idx]);     // run sampling function, in which, weights input function on each thread     sampling(weights, 1, dev_sample+idx, &state[idx]); }   int main(){      // define seed of random generator     curandstate *devstate;       cudamalloc((void**)&devstate, num*sizeof(curandstate));      int *h_sample;     h_sample    = (int*) malloc(num*sizeof(int));      int *d_sample;     cudamalloc((void**)&d_sample, num*sizeof(float));      initcurand<<<(int)num/32 + 1, 32>>>(devstate, 1);     test<<<(int)num/32 + 1, 32>>>(d_sample, devstate);      cudamemcpy(h_sample, d_sample, num*sizeof(float), cudamemcpydevicetohost);      (int = 0; < num; ++i)     {         cout << *(h_sample + i) << endl;     }      //free memory     cudafree(devstate);     free(h_sample);     cudafree(d_sample);     return 0; } 

just start learn cuda, if methods access global memory incorrect, please me that. thanks

this launching "extra" threads:

initcurand<<<(int)num/32 + 1, 32>>>(devstate, 1); 

num 100, above config launch 4 blocks of 32 threads each, i.e. 128 threads. allocating space 100 curandstate here:

cudamalloc((void**)&devstate, num*sizeof(curandstate)); 

so initcurand kernel have threads (idx = 100-127) attempting initialize curandstate haven't allocated. result when run cuda-memcheck rigorous out-of-bounds checking, error reported.

one possible solution modify initcurand kernel follows:

__global__ void initcurand(curandstate *state, unsigned long seed, int num){     int idx = threadidx.x + blockidx.x * blockdim.x;     if (idx < num)         curand_init(seed, idx, 0, &state[idx]); } 

this prevent out-of-bounds threads doing anything. note need modify kernel call pass num it. also, appears me have similar problem in test kernel. may want similar fix there. common construct in cuda kernels, call "thread check". can find other questions here on tag discussing same concept.


Comments

Popular posts from this blog

aws api gateway - SerializationException in posting new Records via Dynamodb Proxy Service in API -

asp.net - Problems sending emails from forum -