24 #include "../common.hpp"
// Argument bundle handed (by pointer) to number_of_active_systems_kernel:
// the ensemble to inspect plus the counter the kernel increments.
// NOTE(review): the declarations of the members `ens` and `count_running`
// (both named in the constructor's initializer list) and the closing brace
// of the struct are not visible in this excerpt — confirm their types.
30 struct count_systems_t {
// Stores the given ensemble in `ens` and starts `count_running` at 0.
33 count_systems_t(
const ensemble &ens):ens(ens),count_running(0){}
/**
 * Kernel: adds the number of active systems in csys->ens to
 * csys->count_running.
 *
 * Every thread derives one system index from its block and thread
 * coordinates, returns immediately when that index is not below
 * csys->ens.nsys(), and otherwise adds one to the counter when the system
 * reports is_active().
 *
 * Only threadIdx.x takes part in the index, so blocks are expected to be
 * one-dimensional; the grid may use x, y and z.
 *
 * The counter is accumulated, never reset here: the caller supplies its
 * starting value.
 *
 * @param csys  bundle holding the ensemble and the counter; it is
 *              dereferenced from device code.
 */
// __global__ is required because this function is launched with <<< >>>.
__global__ void number_of_active_systems_kernel(count_systems_t* csys) {
    // Flatten the (x, y, z) grid into a single block index, then add the
    // thread offset inside the block. Kept in unsigned arithmetic exactly as
    // the builtin coordinates provide it; converted to int once at the end.
    const unsigned int linear_block =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
    const int sysid = linear_block * blockDim.x + threadIdx.x;

    // Threads whose index falls past the last system have nothing to inspect.
    if (sysid >= csys->ens.nsys()) {
        return;
    }

    if (csys->ens[sysid].is_active()) {
        // All threads update the same counter, hence the atomic increment.
        atomicAdd(&csys->count_running, 1);
    }
}
// Host-side sequence that launches number_of_active_systems_kernel and
// returns the resulting count.
// NOTE(review): the enclosing function header and the declaration of `gD`
// (the grid dimensions used in the launch below) are not visible in this
// excerpt.
// Threads per block; the kernel handles one system per thread.
45 const int system_per_block = 16*16;
// Ceiling division: number of blocks needed to cover ens.nsys() systems.
46 const int nblocks = ( ens.
nsys() + system_per_block - 1 ) / system_per_block;
// Block shape: system_per_block threads along x, a single row in y.
49 dim3 tD; tD.x = system_per_block; tD.y = 1;
// Host-side bundle (counter starts at 0) and the pointer that will refer to
// its device copy.
51 count_systems_t count_systems(ens), *pcount_systems ;
// Allocate room for one bundle on the device.
// NOTE(review): no matching cudaFree(pcount_systems) is visible in this
// excerpt — confirm the allocation is released before the return.
54 cudaErrCheck ( cudaMalloc(&pcount_systems,
sizeof(count_systems_t)) );
// Upload the bundle, including the zeroed counter.
55 cudaErrCheck ( cudaMemcpy(pcount_systems,&count_systems,
sizeof(count_systems_t),cudaMemcpyHostToDevice) );
// NOTE(review): no launch-error check is visible after this launch; the
// checked cudaMemcpy below is presumably where a failure would first be
// reported — confirm.
57 number_of_active_systems_kernel<<< gD, tD >>>( pcount_systems );
// Download the whole bundle again; only count_running is read afterwards.
59 cudaErrCheck ( cudaMemcpy(&count_systems,pcount_systems,
sizeof(count_systems_t),cudaMemcpyDeviceToHost) );
// Result: the counter as incremented by the kernel.
62 return count_systems.count_running;
// Checks the shared-memory demand of a launch and derives the number of
// blocks needed to run `nthreads` threads with `threadsPerBlock` threads per
// block.
//
// Returns false when staticShmemPerBlock + dynShmemPerThread*threadsPerBlock
// exceeds shmemPerMP.
// NOTE(review): the rest of the function — how `gridDim` is filled in from
// `nblocks` and the success return — is not visible in this excerpt.
66 bool configure_grid(dim3 &gridDim,
int threadsPerBlock,
int nthreads,
int dynShmemPerThread,
int staticShmemPerBlock)
// Hard-coded shared-memory budget.
// NOTE(review): presumably bytes of shared memory per multiprocessor for the
// targeted hardware; it is not queried from the device — confirm it is still
// the intended limit.
68 const int shmemPerMP = 16384;
// Dynamic shared memory scales with the block size; static is per block.
70 int dyn_shared_mem_required = dynShmemPerThread*threadsPerBlock;
71 int shared_mem_required = staticShmemPerBlock + dyn_shared_mem_required;
// Refuse configurations that exceed the budget.
72 if(shared_mem_required > shmemPerMP) {
return false; }
// Round the thread count up to the next multiple of threadsPerBlock.
75 int nthreadsEx = nthreads;
76 int over = nthreads % threadsPerBlock;
77 if(over) { nthreadsEx += threadsPerBlock - over; }
// Number of blocks for the rounded-up thread count.
80 int nblocks = nthreadsEx / threadsPerBlock;
// nthreadsEx is already a multiple of threadsPerBlock after the rounding
// above, so this remainder is zero and the increment does not fire.
81 if(nthreadsEx % threadsPerBlock) { nblocks++; }
// Searches candidate factors `bytmp` of nblocks, looking for one that leaves
// the smallest remainder while keeping the quotient below 65535.
// NOTE(review): the enclosing function header, the loop/branch braces, and
// the declarations and initial values of `bx` and `best_r` are not visible
// in this excerpt; any update of `best_r` or of a second output is likewise
// not shown.
// Candidates run from 1 up to and including 65535.
96 for(
int bytmp = 1; bytmp != 65536; bytmp++)
// Remainder left over when nblocks is divided by the candidate.
98 int r = nblocks % bytmp;
// Accept only a strictly smaller remainder whose quotient stays under 65535.
99 if(r < best_r && nblocks / bytmp < 65535)
// Record the quotient for the accepted candidate.
102 bx = nblocks / bytmp;
// An exact division cannot be improved on: stop searching.
105 if(r == 0) {
break; }
// bx == -1 is treated as "no candidate accepted" (presumably its initial
// sentinel value — confirm): report on stderr and terminate the process.
109 if(bx == -1) { std::cerr <<
"Unfactorizable?!\n"; exit(-1); }
/**
 * Kernel: marks every inactive system of the ensemble as active.
 *
 * Every thread derives one system index from its block and thread
 * coordinates, returns immediately when that index is not below ens.nsys(),
 * and otherwise calls set_active() on the system if it reports
 * is_inactive(). Systems that do not report is_inactive() are left untouched.
 *
 * Only threadIdx.x takes part in the index, so blocks are expected to be
 * one-dimensional; the grid may use x, y and z.
 *
 * @param ens  ensemble whose systems are inspected; passed by value.
 */
// __global__ is required because this function is launched with <<< >>>.
__global__ void reactivate_systems_kernel(ensemble ens) {
    // Flatten the (x, y, z) grid into a single block index, then add the
    // thread offset inside the block. Kept in unsigned arithmetic exactly as
    // the builtin coordinates provide it; converted to int once at the end.
    const unsigned int linear_block =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
    const int sysid = linear_block * blockDim.x + threadIdx.x;

    // Threads whose index falls past the last system have nothing to do.
    if (sysid >= ens.nsys()) {
        return;
    }

    if (ens[sysid].is_inactive()) {
        ens[sysid].set_active();
    }
}
// Host wrapper: launches reactivate_systems_kernel over the given ensemble.
// NOTE(review): the declaration of `gD` (the grid dimensions used in the
// launch) and the end of the function are not visible in this excerpt.
123 void reactivate_systems(
ensemble ens) {
// Threads per block; the kernel handles one system per thread.
124 const int system_per_block = 16*16;
// Ceiling division: number of blocks needed to cover ens.nsys() systems.
125 const int nblocks = ( ens.nsys() + system_per_block - 1 ) / system_per_block;
// Block shape: system_per_block threads along x, a single row in y.
128 dim3 tD; tD.x = system_per_block; tD.y = 1;
// NOTE(review): no launch-error check is visible after this launch — confirm
// failures are detected somewhere downstream.
130 reactivate_systems_kernel<<< gD, tD >>>( ens );