// Register count assumed for every thread; it scales the block size when the
// register-bound block limit is derived from deviceInfo.regsPerBlock.
29 const int registers_per_thread = 64;
// Properties of the selected device, filled in by cudaGetDeviceProperties and
// read by the reporting and limit computations in this file.
30 cudaDeviceProp deviceInfo;
// NOTE(review): the start of this definition is not visible in this view.
// The visible part forwards to blocks_per_mp with both per-system quantities
// (threads and shared memory) multiplied by chunk_size.
37 ,
int shmem_per_system){
38 return blocks_per_mp( chunk_size * thread_per_system, chunk_size * shmem_per_system)
// Range check: dev must be a valid index in [0, devcnt).
// NOTE(review): the bodies of the branches are not visible in this view.
45 if( dev >= 0 && dev < devcnt )
// Diagnostic written to stderr when the device cannot be selected.
48 std::cerr <<
"Cannot select the CUDA device. GPU integrators are disabled" << std::endl;
// Cache the properties of device `dev` in the global deviceInfo.
// NOTE(review): from the visible lines this query appears to run even when the
// range check above failed — confirm the invalid-index path does not reach it.
50 cudaErrCheck( cudaGetDeviceProperties(&deviceInfo, dev) );
// Ask the runtime to favour L1 cache over shared memory device-wide.
// The returned status goes through cudaErrCheck, like the other runtime calls
// in this file, so a failure is reported instead of being silently dropped.
61 cudaErrCheck( cudaDeviceSetCacheConfig(cudaFuncCachePreferL1) );
// Write a summary of the cached device properties to stderr, one
// tab-separated "label value" row per property.
66 std::cerr <<
"Device:\t" << deviceInfo.name <<
"\n"
68 <<
// Bytes divided by 2^30, as a double so the fraction is kept.
"Global Memory:\t" << deviceInfo.totalGlobalMem/double(1<<30) <<
"GB\n"
69 <<
// Integer division: bytes to whole units of 1024 bytes.
"Shared Memory\t" << deviceInfo.sharedMemPerBlock/1024 <<
"KB\n"
70 <<
"Max Blocksize\t" << deviceInfo.maxThreadsPerBlock <<
"\n"
71 <<
"Warp Size \t" << deviceInfo.warpSize <<
"\n"
72 <<
// NOTE(review): the label says "Registers/MP" but the value printed is
// regsPerBlock (a per-block figure) — confirm which one is intended.
"Registers/MP \t" << deviceInfo.regsPerBlock <<
"\n"
// Derive `limit`, the number of blocks of `blocksize` threads that fit under
// the tightest of three budgets read from deviceInfo: registers, shared memory
// and warps. (Presumably the body of blocks_per_mp; the function header is not
// visible in this view — confirm.)
//
// Preconditions: every divisor used below must be strictly positive.
81 assert(blocksize > 0);
82 assert(registers_per_thread > 0);
83 assert(shmem_per_block > 0);
84 assert(deviceInfo.warpSize > 0 );
// Register budget: registers available divided by what one block consumes.
85 int reg_limit = deviceInfo.regsPerBlock / (blocksize * registers_per_thread);
// Shared-memory budget: bytes available divided by what one block requests.
86 int shm_limit = deviceInfo.sharedMemPerBlock / shmem_per_block ;
// Warps occupied by one block, rounded UP to a whole warp. The "- 1" matters:
// without it a block whose size is an exact multiple of the warp size (e.g. 32
// threads with 32-wide warps) was charged one warp too many, which understated
// warp_limit.
87 int block_warps = (blocksize + deviceInfo.warpSize - 1)/deviceInfo.warpSize;
// NOTE(review): the warp budget is derived from maxThreadsPerBlock; if the
// intent is warps resident per multiprocessor, maxThreadsPerMultiProcessor
// may be the intended field — confirm before changing.
88 int total_warps = deviceInfo.maxThreadsPerBlock / deviceInfo.warpSize;
// Guard kept for safety; block_warps is >= 1 whenever the asserts above hold.
89 int warp_limit = block_warps > 0 ? total_warps / block_warps : 0;
// The binding constraint is the smallest of the three budgets.
91 int limit = std::min( warp_limit, std::min( reg_limit , shm_limit ) );
// Trace the inputs and the three individual limits.
94 $PRINT(
"BS: " << blocksize <<
", SHM" << shmem_per_block <<
" -> "
95 <<
"Limits: reg=" << reg_limit <<
", shm=" << shm_limit
96 <<
", warp=" << warp_limit );