Error with 'cuda-memcheck' in cuda 8.0 -
It is strange: when I do not add cuda-memcheck
before ./main
, the program runs without any warning or error message; however, when I add it, I get the following error message.
========= Invalid __global__ write of size 8
=========     at 0x00000120 in initcurand(curandStateXORWOW*, unsigned long)
=========     by thread (9,0,0) in block (3,0,0)
=========     Address 0x5005413b0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204115]
=========     Host Frame:./main [0x18e11]
=========     Host Frame:./main [0x369b3]
=========     Host Frame:./main [0x3403]
=========     Host Frame:./main [0x308c]
=========     Host Frame:./main [0x30b7]
=========     Host Frame:./main [0x2ebb]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
Here are my functions, with a brief introduction to the code: I try to generate random numbers, save them to the device variable weights
, and then use this vector to sample discrete numbers.
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <time.h>
using namespace std;

#define num 100

// Device-side array of weights; each thread of the test kernel writes one element.
__device__ float weights[num];

// Seeds one cuRAND generator state per thread.
//   state: device array of generator states, indexed by the global thread index
//   seed:  seed shared by all threads; the global thread index is the sequence number
// There is no bounds check: every launched thread writes state[idx], so the
// array must hold at least gridDim.x * blockDim.x elements.
__global__ void initcurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}

// Rejection sampling of a discrete index.
// Repeatedly draws a candidate index j and a uniform value u until
// u <= weight[j] / max_weight, then stores the accepted j in *index.
//   weight:     array of weights, read at index j
//   max_weight: value the weights are divided by before comparing with u
//   index:      output location for the accepted index
//   state:      this thread's cuRAND generator state
__device__ void sampling(float *weight, float max_weight, int *index, curandState *state){
    int j;
    float u;
    do{
        // NOTE(review): curand_uniform is documented to return values in (0, 1],
        // so j can presumably reach num, one past the last element of a
        // num-element weight array -- confirm the intended range.
        j = (int)(curand_uniform(state) * (num + 0.999999));
        u = curand_uniform(state); // sample from the uniform distribution
    }while( u > weight[j]/max_weight);
    *index = j;
}

// Each thread stores one uniform random number in weights[idx], then samples
// an index from weights and writes it to dev_sample[idx].
// There is no bounds check on idx: every launched thread accesses weights[idx],
// state[idx] and dev_sample[idx].
__global__ void test(int *dev_sample, curandState *state){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // generate a random number from the uniform distribution and save it to weights
    weights[idx] = curand_uniform(&state[idx]);
    // run the sampling function, in which weights is the input, on each thread
    sampling(weights, 1, dev_sample+idx, &state[idx]);
}

int main(){
    // define the seed of the random generator
    curandState *devstate;
    cudaMalloc((void**)&devstate, num*sizeof(curandState));

    int *h_sample;
    h_sample = (int*) malloc(num*sizeof(int));

    int *d_sample;
    cudaMalloc((void**)&d_sample, num*sizeof(float));

    // (int)num/32 + 1 is 4 blocks of 32 threads, i.e. 128 threads for num = 100.
    initcurand<<<(int)num/32 + 1, 32>>>(devstate, 1);
    test<<<(int)num/32 + 1, 32>>>(d_sample, devstate);

    cudaMemcpy(h_sample, d_sample, num*sizeof(float), cudaMemcpyDeviceToHost);

    for (int i = 0; i < num; ++i)
    {
        cout << *(h_sample + i) << endl;
    }

    //free memory
    cudaFree(devstate);
    free(h_sample);
    cudaFree(d_sample);
    return 0;
}
I have just started to learn CUDA; if my methods for accessing global memory are incorrect, please help me with that. Thanks.
This is launching "extra" threads:
initcurand<<<(int)num/32 + 1, 32>>>(devstate, 1);
num
is 100, so the above config will launch 4 blocks of 32 threads each, i.e. 128 threads. But you are only allocating space for 100 curandState
here:
cudaMalloc((void**)&devstate, num*sizeof(curandState));
So your initcurand
kernel will have some threads (idx
= 100-127) which are attempting to initialize a curandState
which you haven't allocated. As a result, when you run cuda-memcheck,
which does fairly rigorous out-of-bounds checking, an error is reported.
One possible solution would be to modify your initcurand
kernel as follows:
// Seeds one cuRAND generator state per thread, skipping surplus threads.
//   state: device array holding at least n curandState elements
//   seed:  seed shared by all threads; the global thread index is the sequence number
//   n:     number of valid elements in state
// The grid may contain more threads than n (e.g. 4 blocks of 32 threads for
// n = 100); the guard keeps threads with idx >= n from writing past the end.
// The count is named n rather than num so that it cannot be rewritten by the
// program's `#define num 100` macro.
__global__ void initcurand(curandState *state, unsigned long seed, int n){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n)
        curand_init(seed, idx, 0, &state[idx]);
}
This will prevent any out-of-bounds threads from doing anything. Note that you will need to modify your kernel call to pass num
to it. Also, it appears to me that you may have a similar problem in your test
kernel. You may want to do a similar fix there. This is a common construct in CUDA kernels; I call it a "thread check". You can find other questions here on the cuda tag discussing the same concept.
Comments
Post a Comment