题目:用1000000个线程对一个含10个元素的数组进行原子累加写入。
// Atomic-increment demo: num_threads GPU threads each add 1 to one of the
// array_size slots of a single device array, selected by (global index % array_size).
//
// NOTE(review): this listing was recovered from a web page whose extraction
// stripped the angle-bracket include targets and cut the kernel launch short.
// The two system headers below and the launch configuration / timer.Stop()
// call in main() are reconstructions -- confirm against the original source.
#include <stdio.h>
#include <stdlib.h>        // exit(), used by CUDA_CHECK (added with the error checks)
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "gputimer.h"

#define num_threads 1000000
#define block_width 1000
#define array_size 10

// Abort with a readable message when a CUDA runtime call fails, instead of
// letting a sticky error surface later in an unrelated call.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

void print_array(int * array, int size);

// Kernel: each thread computes its flat global index, folds it into
// [0, array_size) with a modulo, and atomically adds 1 to that slot of g.
// Expects a 1D grid of 1D blocks; g must hold at least array_size ints.
// The modulo keeps every access in range, so no extra bounds guard is needed.
__global__ void increment_atomic(int * g)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Many threads map to the same slot, hence the atomic update.
    i = i % array_size;
    atomicAdd(&g[i], 1);
}

// Prints the first `size` ints of `array` to stdout, each followed by a space.
void print_array(int * array, int size)
{
    for (int i = 0; i < size; i++)
    {
        printf("%d ", array[i]);
    }
}

// Allocates and zeroes the device array, times one launch of increment_atomic,
// copies the counters back to the host and prints them with the elapsed time.
int main()
{
    GpuTimer timer;
    printf("%d total threads in %d blocks writing into %d arrays\n",
           num_threads, num_threads / block_width, array_size);

    int h_array[array_size];
    const int array_bytes = array_size * sizeof(int);

    int * d_array;
    CUDA_CHECK(cudaMalloc((void **)&d_array, array_bytes));
    CUDA_CHECK(cudaMemset((void *)d_array, 0, array_bytes));

    timer.Start();
    // NOTE(review): launch configuration reconstructed from the block count
    // printed above (num_threads / block_width blocks of block_width threads).
    increment_atomic<<<num_threads / block_width, block_width>>>(d_array);
    // NOTE(review): presumably GpuTimer needs Stop() before Elapsed(); the call
    // was lost in extraction -- verify against gputimer.h.
    timer.Stop();
    // Checked after Stop() so the error query stays out of the timed region.
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(h_array, d_array, array_bytes, cudaMemcpyDeviceToHost));
    print_array(h_array, array_size);
    printf("\nTime elapsed = %g ms\n", timer.Elapsed());

    CUDA_CHECK(cudaFree(d_array));
    return 0;
}

// Build environment: Visual Studio 2013.
// (The source page noted that the listing was truncated for length.)