00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00029 #include <stdio.h>
00030 #include <stdlib.h>
00031 #include <string.h>
00032 #include <math.h>
00033 #include <cuda.h>
00034
00035 #include "Inform.h"
00036 #include "utilities.h"
00037 #include "WKFThreads.h"
00038 #include "WKFUtils.h"
00039 #include "CUDAKernels.h"
00040
00041
00042
00043
00044 #if 0
00045
00046
00047
00048 #define NBLOCK 128 // size of an RDF data block
00049 #define MAXBIN 64 // maximum number of bins in a histogram
00050 #elif 1
00051
00052
00053
00054 #define NBLOCK 32 // size of an RDF data block
00055 #define MAXBIN 1024 // maximum number of bins in a histogram
00056 #elif 1
00057
00058
00059 #define NBLOCK 320 // size of an RDF data block
00060 #define MAXBIN 3072 // maximum number of bins in a histogram
00061 #define __USEATOMIC 1 // enable atomic increment operations
00062 #elif 1
00063
00064
00065
00066
00067 #define NBLOCK 896 // size of an RDF data block
00068 #define MAXBIN 8192 // maximum number of bins in a histogram
00069 #define __USEATOMIC 1 // enable atomic increment operations
00070 #endif
00071
00072 #define NCUDABLOCKS 256 // the number of blocks to divide the shared memory
00073
00074
00075
00076
00077 #define NBLOCKHIST 64 // the number of threads used in the final histogram
00078
00079
00080
00081 #define NCONSTBLOCK 5440 // the number of atoms in constant memory.
00082
00083
00084 #define THREADSPERWARP 32 // this is correct for G80/GT200/Fermi GPUs
00085 #define WARP_LOG_SIZE 5 // this is also correct for G80/GT200/Fermi GPUs
00086
00087 #define BIN_OVERFLOW_LIMIT 2147483648 // maximum value a bin can store before
00088
00089
00090
00091
00092
00093 __global__ void init_hist(unsigned int* llhistg,
00094 int maxbin)
00095 {
00096
00097 #ifndef __USEATOMIC
00098 int io;
00099 #endif
00100
00101 int iblock;
00102
00103 unsigned int *llhist;
00104
00105
00106 int nofblocks;
00107 int nleftover;
00108
00109 int maxloop;
00110
00111
00112
00113
00114 #ifdef __USEATOMIC
00115 llhist=llhistg+blockIdx.x*maxbin+threadIdx.x;
00116 nofblocks=maxbin/NBLOCK;
00117 nleftover=maxbin-nofblocks*NBLOCK;
00118 maxloop=nofblocks*NBLOCK;
00119
00120 for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
00121 llhist[iblock]=0UL;
00122 }
00123
00124 if (threadIdx.x < nleftover) {
00125 llhist[iblock]=0UL;
00126 }
00127
00128 #else
00129 llhist=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*maxbin+threadIdx.x;
00130 nofblocks=maxbin/NBLOCK;
00131 nleftover=maxbin-nofblocks*NBLOCK;
00132 maxloop=nofblocks*NBLOCK;
00133 int maxloop2=(NBLOCK / THREADSPERWARP)*maxbin;
00134
00135 for (io=0; io < maxloop2; io+=maxbin) {
00136 for (iblock=0; iblock < maxloop; iblock+=NBLOCK) {
00137 llhist[io + iblock]=0UL;
00138 }
00139
00140 if (threadIdx.x < nleftover) {
00141 llhist[io + iblock]=0UL;
00142 }
00143 }
00144 #endif
00145
00146 return;
00147 }
00148
00149
00150
00151
00152
00153
00154 __global__ void init_hist_f(float* histdev) {
00155 histdev[threadIdx.x]=0.0f;
00156 return;
00157 }
00158
00159
00160
00161
00162
00163
00164
00165
00166 __global__ void reimage_xyz(float* xyz,
00167 int natomsi,
00168 int natomsipad,
00169 float3 celld,
00170 float rmax)
00171 {
00172 int iblock;
00173
00174 __shared__ float xyzi[3*NBLOCK];
00175
00176
00177 int nofblocks;
00178
00179
00180
00181 float *pxi;
00182
00183
00184 int threetimesid;
00185
00186 float rtmp;
00187
00188 int ifinalblock;
00189
00190 int loopmax;
00191
00192
00193 nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00194 loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
00195 ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
00196 pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00197 threetimesid=3*threadIdx.x;
00198
00199
00200 for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
00201 __syncthreads();
00202
00203 xyzi[threadIdx.x ]=pxi[iblock ];
00204 xyzi[threadIdx.x+ NBLOCK]=pxi[iblock+ NBLOCK];
00205 xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00206 __syncthreads();
00207
00208
00209
00210
00211 xyzi[threetimesid] = fmodf(xyzi[threetimesid ], celld.x);
00212 if (xyzi[threetimesid ] < 0.0f) {
00213 xyzi[threetimesid ] += celld.x;
00214 }
00215 xyzi[threetimesid+1]=fmodf(xyzi[threetimesid+1], celld.y);
00216 if (xyzi[threetimesid+1] < 0.0f) {
00217 xyzi[threetimesid+1] += celld.y;
00218 }
00219 xyzi[threetimesid+2]=fmodf(xyzi[threetimesid+2], celld.z);
00220 if (xyzi[threetimesid+2] < 0.0f) {
00221 xyzi[threetimesid+2] += celld.z;
00222 }
00223
00224
00225 if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
00226
00227
00228
00229 if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
00230 rtmp = -((float)(threadIdx.x+1))*rmax;
00231 xyzi[threetimesid ] = rtmp;
00232 xyzi[threetimesid+1] = rtmp;
00233 xyzi[threetimesid+2] = rtmp;
00234 }
00235 }
00236
00237 __syncthreads();
00238
00239
00240 pxi[iblock ]=xyzi[threadIdx.x ];
00241 pxi[iblock+ NBLOCK]=xyzi[threadIdx.x+ NBLOCK];
00242 pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];
00243
00244
00245
00246 }
00247 }
00248
00249
00250
00251
00252 __global__ void phantom_xyz(float* xyz,
00253 int natomsi,
00254 int natomsipad,
00255 float minxyz,
00256 float rmax)
00257 {
00258 int iblock;
00259
00260 __shared__ float xyzi[3*NBLOCK];
00261
00262
00263 int nofblocks;
00264
00265
00266
00267 float *pxi;
00268
00269
00270 int threetimesid;
00271
00272 float rtmp;
00273
00274 int ifinalblock;
00275
00276 int loopmax;
00277
00278
00279 nofblocks=((natomsipad/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00280 loopmax=nofblocks*3*NCUDABLOCKS*NBLOCK;
00281 ifinalblock=(natomsipad / NBLOCK - 1) % NCUDABLOCKS;
00282 pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00283 threetimesid=3*threadIdx.x;
00284
00285
00286 for (iblock=0; iblock<loopmax; iblock+=3*NCUDABLOCKS*NBLOCK) {
00287 __syncthreads();
00288
00289 xyzi[threadIdx.x ]=pxi[iblock ];
00290 xyzi[threadIdx.x+ NBLOCK]=pxi[iblock+ NBLOCK];
00291 xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00292 __syncthreads();
00293
00294
00295 if (iblock==loopmax-3*NCUDABLOCKS*NBLOCK && blockIdx.x==ifinalblock) {
00296
00297
00298
00299 if ((blockDim.x-threadIdx.x) <= (natomsipad - natomsi)) {
00300 rtmp = -((float)(threadIdx.x+1))*rmax-minxyz;
00301 xyzi[threetimesid ] = rtmp;
00302 xyzi[threetimesid+1] = rtmp;
00303 xyzi[threetimesid+2] = rtmp;
00304 }
00305 }
00306
00307 __syncthreads();
00308
00309
00310 pxi[iblock ]=xyzi[threadIdx.x ];
00311 pxi[iblock+ NBLOCK]=xyzi[threadIdx.x+ NBLOCK];
00312 pxi[iblock+2*NBLOCK]=xyzi[threadIdx.x+2*NBLOCK];
00313
00314
00315
00316 }
00317 }
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331 __constant__ static float xyzj[3*NCONSTBLOCK];
00332
00333
00334
00335 void copycoordstoconstbuff(int natoms, const float* xyzh) {
00336 cudaMemcpyToSymbol(xyzj, xyzh, 3*natoms*sizeof(float));
00337 }
00338
00339
00340
00341
00342
00343
00344
00345
00346
00347 #ifndef __USEATOMIC
00348 __device__ void addData(volatile unsigned int *s_WarpHist,
00349
00350 unsigned int data,
00351 unsigned int threadTag)
00352 {
00353 unsigned int count;
00354
00355 do {
00356 count = s_WarpHist[data] & 0x07FFFFFFUL;
00357 count = threadTag | (count + 1);
00358 s_WarpHist[data] = count;
00359 } while(s_WarpHist[data] != count);
00360 }
00361 #endif
00362
00363
00364
00365
00366
00367 __global__ void calculate_rdf(int usepbc,
00368 float* xyz,
00369 int natomsi,
00370 int natomsj,
00371 float3 celld,
00372 unsigned int* llhistg,
00373 int nbins,
00374 float rmin,
00375 float delr_inv)
00376 {
00377 int io;
00378 int iblock;
00379
00380 #ifdef __USEATOMIC
00381 unsigned int *llhists1;
00382
00383
00384 __shared__ unsigned int llhists[MAXBIN];
00385 #else
00386 volatile unsigned int *llhists1;
00387
00388
00389 volatile __shared__ unsigned int llhists[(NBLOCK*MAXBIN)/THREADSPERWARP];
00390
00391 #endif
00392
00393 __shared__ float xyzi[3*NBLOCK];
00394
00395
00396 float rxij, rxij2, rij;
00397
00398
00399 int ibin,nofblocksi;
00400 int nleftover;
00401 float *pxi;
00402 unsigned int joffset;
00403 unsigned int loopmax, loopmax2;
00404 float rmin2=.0001/delr_inv;
00405
00406 #ifndef __USEATOMIC
00407
00408
00409 const unsigned int threadTag = ((unsigned int)
00410 ( threadIdx.x & (THREADSPERWARP - 1)))
00411 << (32 - WARP_LOG_SIZE);
00412 #endif
00413
00414
00415
00416
00417
00418
00419 #ifdef __USEATOMIC
00420 llhists1=llhists;
00421 nofblocksi=nbins/NBLOCK;
00422 nleftover=nbins-nofblocksi*NBLOCK;
00423 #else
00424 llhists1=llhists+(threadIdx.x/THREADSPERWARP)*MAXBIN;
00425 nofblocksi=nbins/THREADSPERWARP;
00426 nleftover=nbins-nofblocksi*THREADSPERWARP;
00427 #endif
00428
00429
00430 #ifdef __USEATOMIC
00431 loopmax = nofblocksi*NBLOCK;
00432 for (io=0; io<loopmax; io+=NBLOCK) {
00433 llhists1[io + threadIdx.x]=0UL;
00434 }
00435 if (threadIdx.x < nleftover) {
00436 llhists1[io + threadIdx.x]=0UL;
00437 }
00438
00439 #else
00440 loopmax = nofblocksi*THREADSPERWARP;
00441 for (io=0; io<loopmax; io+=THREADSPERWARP) {
00442 llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
00443 }
00444 if ((threadIdx.x & (THREADSPERWARP - 1)) < nleftover) {
00445 llhists1[io + (threadIdx.x & (THREADSPERWARP - 1))]=0UL;
00446 }
00447 #endif
00448
00449
00450 nofblocksi=((natomsi/NBLOCK)+NCUDABLOCKS-blockIdx.x-1)/NCUDABLOCKS;
00451 pxi=xyz+blockIdx.x*3*NBLOCK+threadIdx.x;
00452
00453 loopmax = 3 * natomsj;
00454 int idxt3 = 3 * threadIdx.x;
00455 int idxt3p1 = idxt3+1;
00456 int idxt3p2 = idxt3+2;
00457
00458 loopmax2 = nofblocksi*3*NCUDABLOCKS*NBLOCK;
00459
00460
00461
00462 if (usepbc) {
00463 for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
00464 __syncthreads();
00465
00466
00467 xyzi[threadIdx.x ]=pxi[iblock ];
00468 xyzi[threadIdx.x+ NBLOCK]=pxi[iblock+ NBLOCK];
00469 xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00470
00471 __syncthreads();
00472
00473 for (joffset=0; joffset<loopmax; joffset+=3) {
00474
00475
00476 rxij=fabsf(xyzi[idxt3 ] - xyzj[joffset ]);
00477 rxij2=celld.x - rxij;
00478 rxij=fminf(rxij, rxij2);
00479 rij=rxij*rxij;
00480 rxij=fabsf(xyzi[idxt3p1] - xyzj[joffset+1]);
00481 rxij2=celld.y - rxij;
00482 rxij=fminf(rxij, rxij2);
00483 rij+=rxij*rxij;
00484 rxij=fabsf(xyzi[idxt3p2] - xyzj[joffset+2]);
00485 rxij2=celld.z - rxij;
00486 rxij=fminf(rxij, rxij2);
00487 rij=sqrtf(rij + rxij*rxij);
00488
00489
00490 ibin=__float2int_rd((rij-rmin)*delr_inv);
00491 if (ibin<nbins && ibin>=0 && rij>rmin2) {
00492 #ifdef __USEATOMIC
00493 atomicAdd(llhists1+ibin, 1U);
00494 #else
00495 addData(llhists1, ibin, threadTag);
00496 #endif
00497 }
00498 }
00499 }
00500 } else {
00501 for (iblock=0; iblock<loopmax2; iblock+=3*NCUDABLOCKS*NBLOCK) {
00502 __syncthreads();
00503
00504
00505 xyzi[threadIdx.x ]=pxi[iblock ];
00506 xyzi[threadIdx.x+ NBLOCK]=pxi[iblock+ NBLOCK];
00507 xyzi[threadIdx.x+2*NBLOCK]=pxi[iblock+2*NBLOCK];
00508
00509 __syncthreads();
00510
00511
00512 for (joffset=0; joffset<loopmax; joffset+=3) {
00513
00514
00515 rxij=xyzi[idxt3 ] - xyzj[joffset ];
00516 rij=rxij*rxij;
00517 rxij=xyzi[idxt3p1] - xyzj[joffset+1];
00518 rij+=rxij*rxij;
00519 rxij=xyzi[idxt3p2] - xyzj[joffset+2];
00520 rij=sqrtf(rij + rxij*rxij);
00521
00522
00523 ibin=__float2int_rd((rij-rmin)*delr_inv);
00524 if (ibin<nbins && ibin>=0 && rij>rmin2) {
00525 #ifdef __USEATOMIC
00526 atomicAdd(llhists1+ibin, 1U);
00527 #else
00528 addData(llhists1, ibin, threadTag);
00529 #endif
00530 }
00531 }
00532 }
00533 }
00534
00535 __syncthreads();
00536
00537
00538
00539
00540 nofblocksi=nbins/NBLOCK;
00541 nleftover=nbins-nofblocksi*NBLOCK;
00542
00543 #ifdef __USEATOMIC
00544
00545 unsigned int *llhistg1=llhistg+blockIdx.x*MAXBIN+threadIdx.x;
00546
00547
00548 loopmax = nofblocksi*NBLOCK;
00549 for (iblock=0; iblock < loopmax; iblock+=NBLOCK) {
00550 llhistg1[iblock] += llhists[iblock+threadIdx.x];
00551 }
00552
00553 if (threadIdx.x < nleftover) {
00554 llhistg1[iblock] += llhists[iblock+threadIdx.x];
00555 }
00556 #else
00557
00558 unsigned int* llhistg1=llhistg+(((blockIdx.x)*NBLOCK)/THREADSPERWARP)*MAXBIN+threadIdx.x;
00559
00560
00561 loopmax = MAXBIN * (NBLOCK / THREADSPERWARP);
00562 loopmax2 = nofblocksi * NBLOCK;
00563 for (io=0; io < loopmax; io+=MAXBIN) {
00564 for (iblock=0; iblock < loopmax2; iblock+=NBLOCK) {
00565 llhistg1[io+iblock] += llhists[io+iblock+threadIdx.x];
00566 }
00567
00568 if (threadIdx.x < nleftover) {
00569 llhistg1[iblock] += llhists[io+iblock+threadIdx.x];
00570 }
00571 }
00572 #endif
00573
00574 return;
00575 }
00576
00577
00578 #define ull2float __uint2float_rn
00579
00580
00581
00582
00583
00584
00585 __global__ void calculate_histogram(float* histdev,
00586 unsigned int* llhistg,
00587
00588
00589 int nbins)
00590 {
00591 int io;
00592 int iblock;
00593 int maxloop;
00594
00595 __shared__ unsigned int llhists[NBLOCKHIST];
00596
00597 __shared__ float xi[NBLOCKHIST];
00598 int nofblocks;
00599
00600 nofblocks=nbins/NBLOCKHIST;
00601 int nleftover=nbins-nofblocks*NBLOCKHIST;
00602 maxloop = nofblocks*NBLOCKHIST;
00603
00604
00605 for (iblock=0;iblock<maxloop;iblock+=NBLOCKHIST) {
00606 xi[threadIdx.x]=0.0f;
00607
00608
00609 #ifdef __USEATOMIC
00610 for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
00611 #else
00612 for (io=0; io < MAXBIN * (NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
00613 #endif
00614
00615 llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];
00616
00617
00618 llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;
00619
00620
00621 xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
00622 }
00623
00624
00625 histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
00626 }
00627
00628
00629 if (threadIdx.x < nleftover) {
00630 xi[threadIdx.x]=0.0f;
00631
00632
00633 #ifdef __USEATOMIC
00634 for (io=0; io < MAXBIN*NCUDABLOCKS; io+=MAXBIN) {
00635 #else
00636 for (io=0; io < MAXBIN *(NCUDABLOCKS*NBLOCK / THREADSPERWARP); io+=MAXBIN) {
00637 #endif
00638
00639 llhists[threadIdx.x]=llhistg[io+iblock+threadIdx.x];
00640
00641
00642 llhists[threadIdx.x]=llhists[threadIdx.x] & 0x07FFFFFFUL;
00643
00644
00645 xi[threadIdx.x]+=ull2float(llhists[threadIdx.x]);
00646 }
00647
00648
00649 histdev[iblock+threadIdx.x]+=xi[threadIdx.x];
00650 }
00651
00652
00653 return;
00654 }
00655
00656
00657
00658
00659
00660
00661 void calculate_histogram_block(int usepbc,
00662 float *xyz,
00663 int natomsi,
00664 int natomsj,
00665 float3 celld,
00666 unsigned int* llhistg,
00667 int nbins,
00668 float rmin,
00669 float delr_inv,
00670 float *histdev,
00671 int nblockhist) {
00672
00673 init_hist<<<NCUDABLOCKS, NBLOCK>>>(llhistg, MAXBIN);
00674
00675
00676 calculate_rdf<<<NCUDABLOCKS, NBLOCK>>>(usepbc, xyz, natomsi, natomsj, celld,
00677 llhistg, nbins, rmin, delr_inv);
00678
00679
00680 calculate_histogram<<<1, nblockhist>>>(histdev, llhistg, nbins);
00681 }
00682
00683
00684
00685
00686
00687
00688 typedef struct {
00689 int usepbc;
00690 int natoms1;
00691 float* xyz;
00692 int natoms2;
00693 float** xyz2array;
00694 float* cell;
00695 float** histarray;
00696 int nbins;
00697 int nblockhist;
00698
00699 float rmin;
00700 float delr;
00701 int nblocks;
00702 int nhblock;
00703 } rdfthrparms;
00704
00705
00706
00707
00708
00709
00710
00711
00712
00713 static void * rdf_thread(void *voidparms) {
00714 rdfthrparms *parms = NULL;
00715 int threadid=0;
00716
00717 wkf_threadpool_worker_getid(voidparms, &threadid, NULL);
00718 wkf_threadpool_worker_getdata(voidparms, (void **) &parms);
00719
00720 #if 0
00721 printf("rdf thread[%d] lanched and running...\n", threadid);
00722 #endif
00723 cudaSetDevice(threadid);
00724
00725
00726
00727
00728 const int natoms1 = parms->natoms1;
00729 const float *xyz = parms->xyz;
00730 const int natoms2 = parms->natoms2;
00731 const float *cell = parms->cell;
00732 const int nbins = parms->nbins;
00733 const int nblockhist = parms->nblockhist;
00734 const float rmin = parms->rmin;
00735 const float delr = parms->delr;
00736 float *xyz2 = parms->xyz2array[threadid];
00737 float *hist = parms->histarray[threadid];
00738 const int nhblock = parms->nhblock;
00739 const int usepbc = parms->usepbc;
00740
00741 float* xyzdev;
00742 float3 celld;
00743 float* histdev;
00744 unsigned int* llhistg;
00745
00746 int nconstblocks;
00747 int nleftover;
00748 int nbigblocks;
00749 int ibigblock;
00750 int nleftoverbig;
00751 int ntmpbig;
00752 int nbinstmp;
00753 int natoms1pad;
00754 int natoms2pad;
00755 float rmax;
00756
00757
00758 natoms1pad=natoms1;
00759 if (natoms1%NBLOCK!=0) {natoms1pad = (natoms1 / NBLOCK + 1) * NBLOCK;}
00760 natoms2pad=natoms2;
00761 if (natoms2%NBLOCK!=0) {natoms2pad = (natoms2 / NBLOCK + 1) * NBLOCK;}
00762
00763
00764
00765
00766 cudaMalloc((void**)&xyzdev, 3*max(natoms1pad,natoms2pad)*sizeof(float));
00767
00768
00769 cudaMalloc((void**)&histdev, nbins*sizeof(float));
00770
00771
00772 int ihblock;
00773 int maxloop=nbins/nblockhist;
00774 if (maxloop*nblockhist!=nbins) {maxloop=maxloop+1;}
00775 for (ihblock=0; ihblock<maxloop; ihblock++) {
00776 init_hist_f<<<1, nblockhist>>>(histdev+ihblock*nblockhist);
00777 }
00778
00779
00780 #ifdef __USEATOMIC
00781 cudaMalloc((void**)&llhistg, (NCUDABLOCKS*MAXBIN*sizeof(int)));
00782 #else
00783 cudaMalloc((void**)&llhistg, (NCUDABLOCKS*NBLOCK*MAXBIN*
00784 sizeof(int))/THREADSPERWARP);
00785 #endif
00786
00787
00788 celld.x=cell[0];
00789 celld.y=cell[1];
00790 celld.z=cell[2];
00791
00792
00793
00794 rmax = 1.1f * (rmin+((float)nbins)*delr);
00795
00796
00797 cudaMemcpy(xyzdev, xyz2, 3*natoms2*sizeof(float), cudaMemcpyHostToDevice);
00798
00799 if (usepbc) {
00800
00801 reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,celld,rmax);
00802 } else {
00803
00804 phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms2,natoms2pad,1.0e38f,rmax);
00805 }
00806
00807
00808
00809 cudaMemcpy(xyz2, xyzdev, 3*natoms2*sizeof(float), cudaMemcpyDeviceToHost);
00810
00811
00812 cudaMemcpy(xyzdev, xyz, 3*natoms1*sizeof(float), cudaMemcpyHostToDevice);
00813
00814 if (usepbc) {
00815
00816 reimage_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,celld,rmax);
00817 } else {
00818
00819 phantom_xyz<<<NCUDABLOCKS,NBLOCK>>>(xyzdev,natoms1,natoms1pad,1.0e38f,rmax);
00820 }
00821
00822
00823 nbinstmp = MAXBIN;
00824
00825
00826 nconstblocks=natoms2/NCONSTBLOCK;
00827 nleftover=natoms2-nconstblocks*NCONSTBLOCK;
00828
00829
00830 ntmpbig = NBLOCK * ((BIN_OVERFLOW_LIMIT / NCONSTBLOCK) / NBLOCK);
00831 nbigblocks = natoms1pad / ntmpbig;
00832 nleftoverbig = natoms1pad - nbigblocks * ntmpbig;
00833
00834 int nbblocks = (nleftoverbig != 0) ? nbigblocks+1 : nbigblocks;
00835
00836
00837 wkf_tasktile_t tile;
00838 while (wkf_threadpool_next_tile(voidparms, 1, &tile) != WKF_SCHED_DONE) {
00839 #if 0
00840 printf("rdf thread[%d] fetched tile: s: %d e: %d\n", threadid, tile.start, tile.end);
00841 #endif
00842 int iconstblock = -1;
00843 ihblock = -1;
00844 int workitem;
00845 for (workitem=tile.start; workitem<tile.end; workitem++) {
00846
00847 ihblock = workitem % nhblock;
00848 int newiconstblock = workitem / nhblock;
00849 int blocksz;
00850
00851
00852 if (iconstblock != newiconstblock) {
00853 iconstblock = newiconstblock;
00854
00855
00856 blocksz = (iconstblock == nconstblocks && nleftover != 0)
00857 ? nleftover : NCONSTBLOCK;
00858
00859
00860 copycoordstoconstbuff(blocksz, xyz2+(3*NCONSTBLOCK*iconstblock));
00861 }
00862
00863 #if 0
00864 printf("rdf thread[%d] iconstblock: %d ihblock: %d\n", threadid,
00865 iconstblock, ihblock);
00866 #endif
00867
00868
00869
00870
00871 float rmintmp=rmin + (MAXBIN * delr * ihblock);
00872
00873 nbinstmp = MAXBIN;
00874
00875
00876 if (ihblock==(nhblock-1)) {
00877 nbinstmp = nbins%MAXBIN;
00878
00879 if (nbinstmp==0) { nbinstmp = MAXBIN;}
00880 }
00881
00882 for (ibigblock=0; ibigblock < nbblocks; ibigblock++) {
00883
00884 int bblocksz = (ibigblock == nbigblocks && nleftoverbig != 0)
00885 ? nleftoverbig : ntmpbig;
00886
00887 calculate_histogram_block(usepbc, (xyzdev+ibigblock*ntmpbig*3),
00888 bblocksz, blocksz, celld,
00889 llhistg, nbinstmp, rmintmp, (1/delr),
00890 histdev+ihblock*MAXBIN, nblockhist);
00891 }
00892
00893 cudaDeviceSynchronize();
00894 }
00895 }
00896
00897
00898 cudaMemcpy(hist, histdev, nbins*sizeof(float), cudaMemcpyDeviceToHost);
00899
00900 cudaFree(xyzdev);
00901 cudaFree(histdev);
00902 cudaFree(llhistg);
00903
00904 #if 0
00905 printf("thread[%d] finished, releasing resources\n", threadid);
00906 #endif
00907
00908 return NULL;
00909 }
00910
00911
00912 int rdf_gpu(wkf_threadpool_t *devpool,
00913 int usepbc,
00914 int natoms1,
00915
00916 float *xyz,
00917
00918 int natoms2,
00919
00920 float *xyz2,
00921
00922 float* cell,
00923 float* hist,
00924
00925 int nbins,
00926 float rmin,
00927 float delr)
00928 {
00929 int i, ibin;
00930
00931 if (devpool == NULL) {
00932 return -1;
00933 }
00934 int numprocs = wkf_threadpool_get_workercount(devpool);
00935
00936
00937 float **xyz2array = (float**) calloc(1, sizeof(float *) * numprocs);
00938 float **histarray = (float**) calloc(1, sizeof(float *) * numprocs);
00939 for (i=0; i<numprocs; i++) {
00940 xyz2array[i] = (float*) calloc(1, sizeof(float) * 3 * (natoms2/NBLOCK + 1) * NBLOCK);
00941 memcpy(xyz2array[i], xyz2, sizeof(float) * 3 * (natoms2/NBLOCK + 1) * NBLOCK);
00942 histarray[i] = (float*) calloc(1, sizeof(float) * nbins);
00943 }
00944
00945 memset(hist, 0, nbins * sizeof(float));
00946
00947
00948 rdfthrparms parms;
00949
00950 parms.usepbc = usepbc;
00951 parms.natoms1 = natoms1;
00952 parms.xyz = xyz;
00953 parms.natoms2 = natoms2;
00954 parms.cell = cell;
00955 parms.nbins = nbins;
00956 parms.nblockhist = NBLOCKHIST;
00957 parms.rmin = rmin;
00958 parms.delr = delr;
00959 parms.xyz2array = xyz2array;
00960 parms.histarray = histarray;
00961
00962
00963 parms.nhblock = (nbins+MAXBIN-1)/MAXBIN;
00964
00965
00966 int nconstblocks=parms.natoms2/NCONSTBLOCK;
00967 int nleftoverconst=parms.natoms2 - nconstblocks*NCONSTBLOCK;
00968
00969
00970 parms.nblocks = (nleftoverconst != 0) ? nconstblocks+1 : nconstblocks;
00971
00972 int parblocks = parms.nblocks * parms.nhblock;
00973 #if 0
00974 printf("master thread: number of parallel work items: %d\n", parblocks);
00975 #endif
00976
00977
00978 wkf_tasktile_t tile;
00979 tile.start=0;
00980 tile.end=parblocks;
00981 wkf_threadpool_sched_dynamic(devpool, &tile);
00982 wkf_threadpool_launch(devpool, rdf_thread, &parms, 1);
00983
00984 memset(hist, 0, nbins * sizeof(float));
00985
00986 for (i=0; i<numprocs; i++) {
00987 for (ibin=0; ibin<nbins; ibin++) {
00988 hist[ibin] += histarray[i][ibin];
00989 }
00990 }
00991
00992 return 0;
00993 }
00994
00995
00996