15 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
16 #include <sys/types.h>
21 #if (CHECK_ASSUME_ALIGNED != 0)
22 #define __ASSUME_ALIGNED(v) assert(((unsigned long long int)(v)) % 64 == 0); __assume_aligned((v), MIC_ALIGN);
24 #define __ASSUME_ALIGNED(v) __assume_aligned((v), MIC_ALIGN);
28 #if (CHECK_ASSUME != 0)
29 #define __ASSUME(s) assert(s); __assume(s);
31 #define __ASSUME(s) __assume(s);
35 #define RESTRICT __restrict
38 #define __thread __declspec(thread)
47 __attribute__((target(mic))) double * device__table_four = NULL;
48 __attribute__((target(mic)))
float * device__table_four_float = NULL;
49 __attribute__((target(mic)))
int device__table_four_n_16 = 0;
51 __attribute__((target(mic)))
double * device__lj_table = NULL;
52 __attribute__((target(mic)))
float * device__lj_table_float = NULL;
53 __attribute__((target(mic)))
int device__lj_table_dim = 0;
54 __attribute__((target(mic)))
int device__lj_table_size = 0;
56 __attribute__((target(mic)))
unsigned int * device__exclusion_bits = NULL;
57 __attribute__((target(mic)))
long int device__exclusion_bits_size = 0;
59 __attribute__((target(mic))) mic_constants * device__constants = NULL;
61 __attribute__((target(mic)))
double * device__table_four_copy = NULL;
62 __attribute__((target(mic)))
float * device__table_four_float_copy = NULL;
63 __attribute__((target(mic)))
double * device__lj_table_copy = NULL;
64 __attribute__((target(mic)))
float * device__lj_table_float_copy = NULL;
65 __attribute__((target(mic)))
unsigned int * device__exclusion_bits_copy = NULL;
66 __attribute__((target(mic))) mic_constants * device__constants_copy = NULL;
68 __attribute__((target(mic))) patch_pair* device__patch_pairs_copy = NULL;
69 __attribute__((target(mic))) force_list* device__force_lists_copy = NULL;
70 __attribute__((target(mic))) atom* device__atoms_copy = NULL;
71 __attribute__((target(mic))) atom_param* device__atom_params_copy = NULL;
72 __attribute__((target(mic)))
int device__patch_pairs_copy_size = 0;
73 __attribute__((target(mic)))
int device__force_lists_copy_size = 0;
74 __attribute__((target(mic)))
int device__atoms_copy_size = 0;
75 __attribute__((target(mic)))
int device__atom_params_copy_size = 0;
83 __thread
int host__pe = -1;
84 __thread
int host__node = -1;
85 __attribute__((target(mic)))
int device__pe = -1;
86 __attribute__((target(mic)))
int device__node = -1;
88 __thread
int singleKernelFlag = 0;
90 __thread patch_pair * host__patch_pairs = NULL;
91 __thread
int host__patch_pairs_size = 0;
92 __thread
int host__patch_pairs_bufSize = 0;
93 __attribute__((target(mic))) const patch_pair * device__patch_pairs = NULL;
94 __attribute__((target(mic)))
int device__patch_pairs_size = 0;
96 __thread force_list * host__force_lists = NULL;
97 __thread
int host__force_lists_size = 0;
98 __thread
int host__force_lists_bufSize = 0;
99 __attribute__((target(mic))) force_list * device__force_lists = NULL;
100 __attribute__((target(mic)))
int device__force_lists_size = 0;
102 __attribute__((target(mic))) uintptr_t device__pairlists = 0;
103 __attribute__((target(mic)))
int device__pairlists_alloc_size = 0;
105 __attribute__((target(mic)))
int** device__pl_array = NULL;
106 __attribute__((target(mic)))
int* device__pl_size = NULL;
107 __attribute__((target(mic)))
double** device__r2_array = NULL;
109 __thread atom * host__atoms = NULL;
110 __thread atom_param * host__atom_params = NULL;
111 __thread double4 * host__forces = NULL;
112 __thread double4 * host__slow_forces = NULL;
113 __thread
int host__atoms_size = 0;
114 __thread
int host__atoms_bufSize = 0;
115 __attribute__((target(mic))) atom * device__atoms = NULL;
116 __attribute__((target(mic))) atom_param * device__atom_params = NULL;
117 __attribute__((target(mic))) double4 * device__forces = NULL;
118 __attribute__((target(mic))) double4 * device__slow_forces = NULL;
119 __attribute__((target(mic)))
int device__atoms_size = 0;
121 __thread
size_t host__force_buffers_req_size = 0;
122 __attribute__((target(mic)))
size_t device__force_buffers_req_size = 0;
123 __attribute__((target(mic)))
size_t device__force_buffers_alloc_size = 0;
124 __attribute__((target(mic))) double4 * device__force_buffers = NULL;
125 __attribute__((target(mic))) double4 * device__slow_force_buffers = NULL;
127 __attribute__((target(mic))) mic_position3_t device__lata;
128 __attribute__((target(mic))) mic_position3_t device__latb;
129 __attribute__((target(mic))) mic_position3_t device__latc;
131 __thread mic_kernel_data * host__kernel_data = NULL;
133 __attribute__((target(mic)))
int device__numOMPThreads = -1;
135 __thread
int tag_atom_params;
136 __thread
int tag_remote_kernel;
137 __thread
int tag_local_kernel;
139 __thread
int patch_pairs_copySize;
140 __thread
int force_lists_copySize;
141 __thread
int atom_params_copySize;
143 #if MIC_DEVICE_FPRINTF != 0
144 __attribute__((target(mic))) FILE * device__fout = NULL;
149 __attribute__((target(mic))) int device__timestep = 0;
153 #include <sys/time.h>
154 __declspec(target(mic))
155 double getCurrentTime() {
157 gettimeofday(&now, NULL);
158 return (
double)(now.tv_sec + (now.tv_usec * 1.0e-6));
162 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
163 #if MIC_DEVICE_TRACING_DETAILED != 0
164 __thread
double * host__device_times_computes = NULL;
165 __thread
double * host__device_times_patches = NULL;
166 __attribute__((target(mic))) double * device__device_times_computes = NULL;
167 __attribute__((target(mic)))
double * device__device_times_patches = NULL;
169 __thread
double * host__device_times_start = NULL;
170 __attribute__((target(mic))) double * device__device_times_start = NULL;
178 __attribute__((target(mic)))
179 void mic_dev_die(const
char * const str) {
181 const char *
const loc =
"on device";
183 const char *
const loc =
"on host";
186 printf(
"[MIC_DIE] :: \"%s\" (%s)\n", str, loc);
188 printf(
"[MIC_DIE] :: mic_dev_die called (%s)\n", loc);
195 __attribute__((target(mic)))
196 void mic_print_config() {
198 #if MIC_PRINT_CONFIG != 0
203 printf(
"device :: MULTIPLE_THREADS (%d)\n", MULTIPLE_THREADS);
204 printf(
"device :: # OpenMP Threads : %d\n", device__numOMPThreads);
206 printf(
"device :: MIC_HANDCODE_FORCE (%d)\n", MIC_HANDCODE_FORCE);
207 printf(
"device :: MIC_HANDCODE_FORCE_PFDIST (%d)\n", MIC_HANDCODE_FORCE_PFDIST);
208 printf(
"device :: MIC_HANDCODE_FORCE_USEGATHER_NBTBL (%d)\n", MIC_HANDCODE_FORCE_USEGATHER_NBTBL);
209 printf(
"device :: MIC_HANDCODE_FORCE_USEGATHER (%d)\n", MIC_HANDCODE_FORCE_USEGATHER);
210 printf(
"device :: MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT (%d)\n", MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT);
211 printf(
"device :: MIC_HANDCODE_FORCE_COMBINE_FORCES (%d)\n", MIC_HANDCODE_FORCE_COMBINE_FORCES);
213 printf(
"device :: MIC_HANDCODE_FORCE_SINGLE (%d)\n", MIC_HANDCODE_FORCE_SINGLE);
214 printf(
"device :: MIC_HANDCODE_FORCE_CALCR2TABLE (%d)\n", MIC_HANDCODE_FORCE_CALCR2TABLE);
215 printf(
"device :: MIC_HANDCODE_FORCE_SOA_VS_AOS (%d)\n", MIC_HANDCODE_FORCE_SOA_VS_AOS);
217 printf(
"device :: MIC_SORT_ATOMS (%d)\n", MIC_SORT_ATOMS);
218 printf(
"device :: MIC_SORT_LISTS (%d)\n", MIC_SORT_LISTS);
220 printf(
"device :: MIC_HANDCODE_PLGEN (%d)\n", MIC_HANDCODE_PLGEN);
221 printf(
"device :: MIC_TILE_PLGEN (%d)\n", MIC_TILE_PLGEN);
222 printf(
"device :: MIC_CONDITION_NORMAL (%d)\n", MIC_CONDITION_NORMAL);
223 printf(
"device :: MIC_PAD_PLGEN (%d)\n", MIC_PAD_PLGEN);
225 printf(
"device :: MIC_SORT_COMPUTES (%d)\n", MIC_SORT_COMPUTES);
227 printf(
"device :: MIC_SYNC_INPUT (%d)\n", MIC_SYNC_INPUT);
229 printf(
"device :: REFINE_PAIRLISTS (%d)\n", REFINE_PAIRLISTS);
230 printf(
"device :: REFINE_PAIRLIST_HANDCODE (%d)\n", REFINE_PAIRLIST_HANDCODE);
231 printf(
"device :: REFINE_PAIRLISTS_XYZ (%d)\n", REFINE_PAIRLISTS_XYZ);
233 printf(
"device :: MIC_FULL_CHECK (%d)\n", MIC_FULL_CHECK);
234 printf(
"device :: MIC_EXCL_CHECKSUM_FULL (%d)\n", MIC_EXCL_CHECKSUM_FULL);
236 printf(
"device :: MIC_ALIGN (%d)\n", MIC_ALIGN);
240 #endif // MIC_PRINT_CONFIG
249 void mic_init_device(
const int pe,
const int node,
const int deviceNum) {
256 host__kernel_data = (mic_kernel_data*)(_MM_MALLOC_WRAPPER(2 *
sizeof(mic_kernel_data), 64,
"mic_kernel_data"));
257 __ASSERT(host__kernel_data != NULL);
258 mic_kernel_data * kernel_data = host__kernel_data;
261 patch_pairs_copySize = 0;
262 force_lists_copySize = 0;
263 atom_params_copySize = 0;
266 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
267 host__device_times_start = (
double*)_MM_MALLOC_WRAPPER(10 *
sizeof(
double), 64,
"host__device_times_start");
268 __ASSERT(host__device_times_start != NULL);
269 double * device_times_start = host__device_times_start;
270 #define DEVICE_TIMES_CLAUSE nocopy(device_times_start[0:10] : alloc_if(1) free_if(0) align(64)) \
271 nocopy(device__device_times_start)
273 #define DEVICE_TIMES_CLAUSE
277 #pragma offload target(mic:deviceNum) \
278 in(pe) in(node) nocopy(device__pe) nocopy(device__node) \
279 in(kernel_data[0:2] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
280 nocopy(device__pl_array) nocopy(device__pl_size) nocopy(device__r2_array) \
281 nocopy(device__numOMPThreads) \
282 DEVICE_TIMES_CLAUSE \
283 DEVICE_FPRINTF_CLAUSE
286 __FULL_CHECK(__ASSERT(node >= 0 && pe >= 0));
291 #if MIC_DEVICE_FPRINTF != 0
293 char filename[128] = { 0 };
294 sprintf(filename,
"/tmp/namd_deviceDebugInfo.%d", device__pe);
295 if (device__node <= 0) {
296 printf(
"[MIC-DEBUG] :: Generating debug output to device file for MICs.\n");
299 device__fout = fopen(filename,
"w");
303 DEVICE_FPRINTF(
"Device on PE %d (node: %d) initializing...\n", device__pe, device__node);
306 #if MULTIPLE_THREADS != 0
307 device__numOMPThreads = omp_get_max_threads();
309 device__numOMPThreads = 1;
314 #if REFINE_PAIRLISTS != 0
315 device__pl_array = (
int**)(_MM_MALLOC_WRAPPER(device__numOMPThreads *
sizeof(
int*), 64,
"device__pl_array")); __ASSERT(device__pl_array != NULL);
316 device__pl_size = (
int*)(_MM_MALLOC_WRAPPER(device__numOMPThreads *
sizeof(
int), 64,
"device__pl_size")); __ASSERT(device__pl_size != NULL);
317 device__r2_array = (
double**)(_MM_MALLOC_WRAPPER(device__numOMPThreads *
sizeof(
double*), 64,
"device__r2_array"); __ASSERT(device__r2_array != NULL);
318 for (
int i = 0; i < device__numOMPThreads; i++) {
319 device__pl_array[i] = NULL;
320 device__pl_size[i] = 0;
321 device__r2_array[i] = NULL;
326 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
327 device__device_times_start = device_times_start;
336 if (node == 0 && deviceNum == 0) { mic_print_config(); }
340 #undef DEVICE_TIMES_CLAUSE
349 __attribute__((target(mic))) int mic_check_internal() {
365 int mic_check(
int deviceNum) {
367 #pragma offload target(mic:deviceNum) inout(dev_ok)
369 #if !defined(__MIC__)
370 mic_dev_die(
"mic_check :: Attempt to execute offload on host (not supported yet)... exiting."); fflush(NULL);
372 dev_ok = mic_check_internal();
380 #if MIC_DATA_STRUCT_VERIFY != 0
383 __declspec(target(mic))
384 void _verify_parallel_memcpy(
void * const dst,
385 const
void * const src,
388 const char*
const src_c = (
const char*
const)src;
389 char*
const dst_c = (
char*
const)dst;
390 #pragma omp parallel for schedule(static, 4096)
391 for (
int i = 0; i < size; i++) { dst_c[i] = src_c[i]; }
395 __declspec(target(mic))
396 void* _verify_remalloc(
int ©Size, const
int size, T* &ptr) {
397 if (copySize < size) {
398 copySize = (int)(1.2f * size);
399 copySize = (copySize + 1023) & (~1023);
400 _MM_FREE_WRAPPER(ptr);
401 ptr = (T*)_MM_MALLOC_WRAPPER(copySize *
sizeof(T), 64,
"_verify_remalloc");
402 __FULL_CHECK(assert(ptr != NULL));
408 __declspec(target(mic))
409 int _verify_pairlist(const
int pe, const
int timestep, const
int isRemote, const
int phase,
410 const
int i_upper, const
int j_upper,
411 const
int * const pairlist_base,
412 const
int ppI, const
char * const plStr,
413 const patch_pair &pp, const
int plTypeIndex
416 const atom_param *
const pExt_0 = device__atom_params + pp.patch1_atom_start;
417 const atom_param *
const pExt_1 = device__atom_params + pp.patch2_atom_start;
419 if (pairlist_base == NULL) {
420 if (timestep > 0) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: pairlist_base == NULL !!!\n", pe, timestep, isRemote, phase);
return 1; }
421 }
else if (i_upper <= 0) {
422 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: i_upper <= 0 !!!\n", pe, timestep, isRemote, phase);
return 1;
423 }
else if (j_upper <= 0) {
424 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: j_upper <= 0 !!!\n", pe, timestep, isRemote, phase);
return 1;
429 int pairlist_size = pairlist_base[1] - 2;
430 int pairlist_allocSize = pairlist_base[0];
431 const int *
const pairlist = pairlist_base + 2;
433 if (pairlist_size + 2 > pairlist_allocSize) {
434 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: pairlist_size + 2 > pairlist_allocSize !!!\n", pe, timestep, isRemote, phase);
return 1;
437 for (
int k = 0; k < pairlist_size; k++) {
439 int somethingWrong = 0;
440 int i = (pairlist[k] >> 16) & 0xFFFF;
441 int j = pairlist[k] & 0xFFFF;
445 const int maxDiff = pExt_1[j].excl_maxdiff;
446 const int indexDiff = pExt_0[i].index - pExt_1[j].index;
447 if (indexDiff < -1 * maxDiff || indexDiff > maxDiff) {
448 if (plTypeIndex != PL_NORM_INDEX) {
449 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: interaction outside maxDiff range in non-NORM pairlist !!!"
450 " - ppI:%d, i:%d, j:%d, indexDiff:%d, maxDiff:%d, plTypeIndex:%d\n",
451 pe, timestep, isRemote, phase,
452 ppI, i, j, indexDiff, maxDiff, plTypeIndex
457 const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
458 const int offset_major = offset / (
sizeof(
unsigned int) * 8);
459 const int offset_minor = offset % (
sizeof(
unsigned int) * 8);
460 const int exclFlags = ((device__exclusion_bits[offset_major]) >> offset_minor) & 0x03;
461 if (exclFlags == 0x3) {
462 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid exclusion bit pattern detected !!!"
463 " - ppI:%d, i:%d, j:%d, indexDiff:%d, maxDiff:%d, plTypeIndex:%d\n",
464 pe, timestep, isRemote, phase,
465 ppI, i, j, indexDiff, maxDiff, plTypeIndex
468 }
else if (exclFlags != plTypeIndex) {
469 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: plTypeIndex/exclution_bits mismatch !!!"
470 " - i:%d, j:%d, indexDiff:%d, maxDiff:%d, plTypeIndex:%d, exclFlags:%d\n",
471 pe, timestep, isRemote, phase,
472 i, j, indexDiff, maxDiff, plTypeIndex, exclFlags
481 if (i < currentI) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: i < currentI !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
482 if (k % 16 != 0) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: i changed when k % 16 != 0 !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
489 if (i < 0 || i >= i_upper) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid i value detected in padded region!!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
490 if (j != 0xFFFF) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid j value detected in padded region !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
494 if (i < 0 || i >= i_upper) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid i value detected !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
498 if (j < 0 || j >= j_upper) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid j value detected !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
502 if (somethingWrong) {
504 int lo = k - 3;
if (lo < 0) { lo = 0; }
505 int hi = k + 3;
if (hi >= pairlist_size) { hi = pairlist_size; }
507 str += sprintf(str,
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: pl.%s[%d:%d->%d,%d:%d] = { ... ", pe, timestep, isRemote, phase, plStr, ppI, lo, hi, k, pairlist_size);
508 for (
int _k = lo; _k < hi; _k++) { str += sprintf(str,
"%s0x%08x%s ", ((_k==k)?(
"<<"):(
"")), pairlist[_k], ((_k==k)?(
">>"):(
""))); }
509 str += sprintf(str,
"}, i_upper:%d, j_upper:%d\n", i_upper, j_upper);
519 __declspec(target(mic))
520 int _verify_forces(const
int pe, const
int timestep, const
int isRemote, const
int phase,
521 const
int index, const double4 * const forceBuffers, const double4 * const
forces,
522 const
char * const typeStr
525 force_list &fl = device__force_lists[index];
528 if (fl.force_list_start < 0 || fl.force_list_start >= device__force_buffers_req_size) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].force_list_start invalid (%d, size:%lu) !!!\n", pe, timestep, isRemote, phase, index, fl.force_list_start, device__force_buffers_req_size);
return 1; }
529 if (fl.force_output_start < 0 || fl.force_output_start >= device__atoms_size) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].force_output_start invalid (%d, size:%d) !!!\n", pe, timestep, isRemote, phase, index, fl.force_output_start, device__atoms_size);
return 1; }
530 if (fl.patch_stride < 0) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].patch_stride(%d) < 0 !!!\n", pe, timestep, isRemote, phase, index, fl.patch_stride);
return 1; }
531 if (fl.force_list_size < 0) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].force_list_size(%d) < 0 !!!\n", pe, timestep, isRemote, phase, index, fl.force_list_size);
return 1; }
532 if (fl.force_list_start + (4 * fl.patch_stride * fl.force_list_size) > device__force_buffers_req_size) {
533 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: fl[%d].force_list_start(%d) + (4 * fl.patch_stride(%d) * fl.force_list_size(%d)) > device__force_buffers_req_size(%lu) !!!\n",
534 pe, timestep, isRemote, phase, index, fl.force_list_start, fl.patch_stride, fl.force_list_size, device__force_buffers_req_size);
539 double* f = (
double*)(forceBuffers + fl.force_list_start);
540 for (
int i = 0; i < 4 * fl.patch_stride * fl.force_list_size; i++) {
541 if (fabsf(f[i]) > 2.5e2) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: Large %sforce detected (1) !!! f[%d]:%le\n", pe, timestep, isRemote, phase, typeStr, i, f[i]); }
545 f = (
double*)(forces + fl.force_output_start);
546 for (
int i = 0; i < 4 * fl.patch_stride; i++) {
547 if (fabsf(f[i]) > 2.5e2) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: Large %sforce detected (2) !!! f[%d]:%le\n", pe, timestep, isRemote, phase, typeStr, i, f[i]); }
553 __declspec(target(mic))
554 int _verify_buffers_match(const
int pe, const
int timestep, const
int isRemote, const
int phase,
555 const
char * const buf, const
char * const golden, const
unsigned int bufSize,
556 const
char * const str
559 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf == NULL (%s) !!!\n", pe, timestep, isRemote, phase, str);
return 1;
560 }
else if (golden == NULL) {
561 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: golden == NULL (%s) !!!\n", pe, timestep, isRemote, phase, str);
return 1;
562 }
else if (bufSize < 0) {
563 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: bufSize <= 0 (%s) !!!\n", pe, timestep, isRemote, phase, str);
return 1;
565 int mismatchCount = 0;
566 #pragma omp parallel for reduction(+:mismatchCount)
567 for (
int i = 0; i < bufSize; i++) {
569 if (buf[i] != golden[i]) {
570 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
571 pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
577 if (buf[i] != golden[i]) {
578 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
579 pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
585 if (buf[i] == golden[i] || i == bufSize - 1) {
586 if (i < bufSize - 1 && buf[i+1] != golden[i+1]) {
587 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
588 pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
591 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf mismatch range %d -> %d (%s) !!!\n",
592 pe, timestep, isRemote, phase, start, ((buf[i] != golden[i] && i == bufSize - 1) ? (i+1) : (i)), str
596 }
else if (buf[i] != golden[i] && i - start < 16) {
597 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
598 pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
604 if (mismatchCount > 0) {
return 1; }
609 __declspec(target(mic))
610 void _verify_tables(const
int pe, const
int timestep, const
int isRemote, const
int phase) {
613 if (_verify_buffers_match(pe, timestep, isRemote, phase, (
char*)(device__exclusion_bits), (
char*)(device__exclusion_bits_copy), device__exclusion_bits_size *
sizeof(
unsigned int),
"exclusion_bits")) {
return; }
614 if (_verify_buffers_match(pe, timestep, isRemote, phase, (
char*)(device__constants), (
char*)(device__constants_copy),
sizeof(mic_constants),
"constants")) {
return; }
615 #if MIC_HANDCODE_FORCE_SINGLE != 0
616 if (_verify_buffers_match(pe, timestep, isRemote, phase, (
char*)(device__table_four_float), (
char*)(device__table_four_float_copy), 61 * device__table_four_n_16 *
sizeof(
float),
"table_four(float)")) {
return; }
617 if (_verify_buffers_match(pe, timestep, isRemote, phase, (
char*)(device__lj_table_float), (
char*)(device__lj_table_float_copy), device__lj_table_size /
sizeof(
double) *
sizeof(
float),
"lj_table(float)")) {
return; }
619 if (_verify_buffers_match(pe, timestep, isRemote, phase, (
char*)(device__table_four), (
char*)(device__table_four_copy), 61 * device__table_four_n_16 *
sizeof(
double),
"table_four(double)")) {
return; }
620 if (_verify_buffers_match(pe, timestep, isRemote, phase, (
char*)(device__lj_table), (
char*)(device__lj_table_copy), device__lj_table_size *
sizeof(
double),
"lj_table(double)")) {
return; }
625 char *p1 = (
char*)(device__constants);
626 char *p2 = (
char*)(device__exclusion_bits);
628 if (p1 >= p2 && p1 < p2 + (device__exclusion_bits_size *
sizeof(
unsigned int))) {
629 printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: !!!! OVERLAP OF EXCLUSIONS AND CONSTANTS DETECTED !!!! - constants:%p(%ld), exclusions:%p(%ld)\n",
630 pe, timestep, isRemote, phase, device__constants,
sizeof(device__constants), device__exclusion_bits, device__exclusion_bits_size *
sizeof(
unsigned int)
638 __declspec(target(mic))
639 void _verify_data_structures(const
int pe, const
int timestep, const
int isRemote, const
int phase,
640 const
int check_lo, const
int check_hi, const
int doSlow
644 if (device__atoms == NULL) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__atoms == NULL !!!\n", pe, timestep, isRemote, phase);
return; }
645 if (device__atom_params == NULL) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__atom_params == NULL !!!\n", pe, timestep, isRemote, phase);
return; }
646 if (device__forces == NULL) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__forces == NULL !!!\n", pe, timestep, isRemote, phase);
return; }
647 if (device__slow_forces == NULL) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__slow_forces == NULL !!!\n", pe, timestep, isRemote, phase);
return; }
648 if (device__patch_pairs == NULL) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs == NULL !!!\n", pe, timestep, isRemote, phase);
return; }
649 if (device__force_lists == NULL) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists == NULL !!!\n", pe, timestep, isRemote, phase);
return; }
650 if (device__pairlists == NULL) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__pairlists == NULL !!!\n", pe, timestep, isRemote, phase);
return; }
653 if (device__patch_pairs_size <= 0) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs_size <= 0 !!!\n", pe, timestep, isRemote, phase);
return; }
654 if (device__force_lists_size <= 0) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists_size <= 0 !!!\n", pe, timestep, isRemote, phase);
return; }
657 _verify_tables(pe, timestep, isRemote, phase);
660 #if MULTIPLE_THREADS != 0
661 #pragma omp parallel for
663 for (
int k = check_lo; k < check_hi; k++) {
665 const patch_pair &pp = device__patch_pairs[k];
668 if (pp.patch1_atom_start < 0 || pp.patch1_atom_start >= device__atoms_size) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch1_atom_start invalid (%d, size:%d) !!!\n", pe, timestep, isRemote, phase, k, pp.patch1_atom_start, device__atoms_size); }
669 if (pp.patch2_atom_start < 0 || pp.patch2_atom_start >= device__atoms_size) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch2_atom_start invalid (%d, size:%d) !!!\n", pe, timestep, isRemote, phase, k, pp.patch2_atom_start, device__atoms_size); }
670 if (pp.patch1_force_start < 0 || pp.patch1_force_start >= device__force_buffers_req_size) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch1_force_start invalid (%d, size:%lu) !!!\n", pe, timestep, isRemote, phase, k, pp.patch1_force_start, device__force_buffers_req_size); }
671 if (pp.patch2_force_start < 0 || pp.patch2_force_start >= device__force_buffers_req_size) { printf(
"[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch2_force_start invalid (%d, size:%lu) !!!\n", pe, timestep, isRemote, phase, k, pp.patch2_force_start, device__force_buffers_req_size); }
674 int i_upper = pp.patch1_size;
675 int j_upper = pp.patch2_size;
676 const int **
const pairlists = (
const int **
const)device__pairlists;
677 _verify_pairlist(pe, timestep, isRemote, phase, i_upper, j_upper, pairlists[NUM_PAIRLIST_TYPES * k + PL_NORM_INDEX], k,
"NORM", pp, PL_NORM_INDEX);
678 _verify_pairlist(pe, timestep, isRemote, phase, i_upper, j_upper, pairlists[NUM_PAIRLIST_TYPES * k + PL_EXCL_INDEX], k,
"EXCL", pp, PL_EXCL_INDEX);
679 _verify_pairlist(pe, timestep, isRemote, phase, i_upper, j_upper, pairlists[NUM_PAIRLIST_TYPES * k + PL_MOD_INDEX], k,
"MOD", pp, PL_MOD_INDEX);
684 #if MULTIPLE_THREADS != 0
685 #pragma omp parallel for
687 for (
int k = 0; k < device__force_lists_size; k++) {
688 _verify_forces(pe, timestep, isRemote, phase, k, device__force_buffers, device__forces,
"");
689 if (doSlow) { _verify_forces(pe, timestep, isRemote, phase, k, device__slow_force_buffers, device__slow_forces,
"slow "); }
696 #endif // MIC_DATA_STRUCT_VERIFY
709 void mic_bind_table_four(
const int deviceNum,
710 const double *table_four,
711 const int table_four_n,
712 const int table_four_n_16
716 __FULL_CHECK(__ASSERT(deviceNum >= 0));
717 __FULL_CHECK(__ASSERT(table_four != NULL));
718 __FULL_CHECK(__ASSERT(table_four_n > 0));
719 __FULL_CHECK(__ASSERT(table_four_n_16 > 0 && table_four_n_16 >= table_four_n));
720 __FULL_CHECK(__ASSERT(table_four_n_16 % 16 == 0));
725 int numTableFourElems = 61 * table_four_n_16;
728 #pragma offload target(mic:deviceNum) \
729 in(table_four_n_16) nocopy(device__table_four_n_16) \
730 in(numTableFourElems) \
731 in(table_four[0:numTableFourElems] : alloc_if(1) free_if(1)) \
732 nocopy(device__table_four) nocopy(device__table_four_float) \
733 nocopy(device__table_four_copy) nocopy(device__table_four_float_copy)
738 device__table_four_n_16 = table_four_n_16;
739 __FULL_CHECK(__ASSERT(device__table_four_n_16 > 0));
743 __FULL_CHECK(__ASSERT(device__table_four == NULL && device__table_four_float == NULL));
744 #if MIC_HANDCODE_FORCE_SINGLE != 0
747 device__table_four_float = (
float*)(_MM_MALLOC_WRAPPER(
sizeof(
float) * numTableFourElems, 64,
"device__table_four_float"));
748 __ASSERT(device__table_four_float != NULL);
749 for (
int i = 0; i < numTableFourElems; i++) {
750 device__table_four_float[i] = (float)(table_four[i]);
754 #if MIC_DATA_STRUCT_VERIFY != 0
755 device__table_four_float_copy = (
float*)(_MM_MALLOC_WRAPPER(
sizeof(
float) * numTableFourElems, 64,
"device__table_four_float_copy"));
756 __ASSERT(device__table_four_float_copy != NULL);
757 memcpy(device__table_four_float_copy, device__table_four_float, numTableFourElems *
sizeof(
float));
760 #else // MIC_HANDCODE_FORCE_SINGLE != 0
763 device__table_four = (
double*)(_MM_MALLOC_WRAPPER(
sizeof(
double) * numTableFourElems, 64,
"device__table_four"));
764 __ASSERT(device__table_four != NULL);
765 for (
int i = 0; i < numTableFourElems; i++) {
766 device__table_four[i] = table_four[i];
768 #if MIC_DATA_STRUCT_VERIFY != 0
769 device__table_four_copy = (
double*)(_MM_MALLOC_WRAPPER(
sizeof(
double) * numTableFourElems, 64,
"device__table_four_copy"));
770 __ASSERT(device__table_four_copy != NULL);
771 memcpy(device__table_four_copy, device__table_four, numTableFourElems *
sizeof(
double));
774 #endif // MIC_HANDCODE_FORCE_SINGLE != 0
789 void mic_bind_lj_table(
const int deviceNum,
791 const int lj_table_dim,
796 __FULL_CHECK(__ASSERT(deviceNum >= 0));
797 __FULL_CHECK(__ASSERT(lj_table != NULL));
798 __FULL_CHECK(__ASSERT(lj_table_dim > 0));
799 __FULL_CHECK(__ASSERT(lj_table_size > 0));
802 #pragma offload target(mic:deviceNum) \
803 in(lj_table_dim) nocopy(device__lj_table_dim) \
804 in(lj_table_size) nocopy(device__lj_table_size) \
805 in(lj_table[0:lj_table_size] : alloc_if(1) free_if(1)) \
806 nocopy(device__lj_table) nocopy(device__lj_table_float) \
807 nocopy(device__lj_table_copy) nocopy(device__lj_table_float_copy) \
813 device__lj_table_dim = lj_table_dim;
815 __FULL_CHECK(__ASSERT(device__lj_table_dim > 0 && device__lj_table_size > 0));
819 __FULL_CHECK(__ASSERT(device__lj_table == NULL && device__lj_table_float == NULL));
820 int numElements = device__lj_table_size *
sizeof(char) /
sizeof(
double);
821 #if MIC_HANDCODE_FORCE_SINGLE != 0
824 device__lj_table_float = (
float*)(_MM_MALLOC_WRAPPER(
sizeof(
float) * numElements, 64,
"device__lj_table_float"));
825 __ASSERT(device__lj_table_float != NULL);
826 for (
int i = 0; i < numElements; i++) {
827 device__lj_table_float[i] = (float)(((
double*)lj_table)[i]);
831 #if MIC_DATA_STRUCT_VERIFY != 0
832 device__lj_table_float_copy = (
float*)(_MM_MALLOC_WRAPPER(
sizeof(
float) * numElements, 64,
"device__lj_table_float_copy"));
833 __ASSERT(device__lj_table_float_copy != NULL);
834 memcpy(device__lj_table_float_copy, device__lj_table_float,
sizeof(
float) * numElements);
837 #else // MIC_HANDCODE_FORCE_SINGLE != 0
840 device__lj_table = (
double*)(_MM_MALLOC_WRAPPER(
sizeof(
double) * numElements, 64,
"device__lj_table"));
841 __ASSERT(device__lj_table_float != NULL);
842 for (
int i = 0; i < numElements; i++) {
843 device__lj_table[i] = ((
double*)lj_table)[i];
847 #if MIC_DATA_STRUCT_VERIFY != 0
848 device__lj_table_copy = (
double*)(_MM_MALLOC_WRAPPER(
sizeof(
double) * numElements, 64,
"device__lj_table_copy"));
849 __ASSERT(device__lj_table_copy != NULL);
850 memcpy(device__lj_table_copy, device__lj_table,
sizeof(
double) * numElements);
853 #endif // MIC_HANDCODE_FORCE_SINGLE != 0
867 void mic_bind_exclusions(
const int deviceNum,
868 unsigned int *exclusion_bits,
869 const long int exclusion_bits_size
873 __FULL_CHECK(__ASSERT(deviceNum >= 0));
874 __FULL_CHECK(__ASSERT(exclusion_bits != NULL));
875 __FULL_CHECK(__ASSERT(exclusion_bits_size > 0));
878 #pragma offload target(mic:deviceNum) \
879 in(exclusion_bits_size) nocopy(device__exclusion_bits_size) \
880 in(exclusion_bits[0:exclusion_bits_size] : alloc_if(1) free_if(1)) \
881 nocopy(device__exclusion_bits) nocopy(device__exclusion_bits_copy)
886 device__exclusion_bits_size = exclusion_bits_size;
887 __FULL_CHECK(__ASSERT(device__exclusion_bits_size > 0));
890 __ASSERT(device__exclusion_bits == NULL);
891 device__exclusion_bits = (
unsigned int *)_MM_MALLOC_WRAPPER(
sizeof(
unsigned int) * device__exclusion_bits_size, 64,
"device__exclusion_bits");
892 __ASSERT(device__exclusion_bits != NULL);
893 memcpy(device__exclusion_bits, exclusion_bits,
sizeof(
unsigned int) * device__exclusion_bits_size);
896 #if MIC_DATA_STRUCT_VERIFY != 0
897 device__exclusion_bits_copy = (
unsigned int*)(_MM_MALLOC_WRAPPER(device__exclusion_bits_size *
sizeof(
unsigned int), 64,
" device__exclusion_bits_copy"));
898 __ASSERT(device__exclusion_bits_copy != NULL);
899 memcpy(device__exclusion_bits_copy, device__exclusion_bits,
sizeof(
unsigned int) * device__exclusion_bits_size);
901 device__exclusion_bits_copy = NULL;
914 void mic_bind_constants(
const int deviceNum,
916 const double dielectric_1,
917 const double scaling,
918 const double scale14,
919 const double r2_delta,
920 const int r2_delta_exp,
925 __FULL_CHECK(__ASSERT(deviceNum >= 0));
930 mic_constants constants;
932 constants.dielectric_1 = dielectric_1;
933 constants.scaling = scaling;
934 constants.scale14 = scale14;
935 constants.modf_mod = 1.0 - scale14;
936 constants.r2_delta = r2_delta;
937 constants.r2_delta_exp = r2_delta_exp;
938 constants.r2_delta_expc = 64 * (r2_delta_exp - 1023);
939 constants.commOnly = commOnly;
940 constants.singleKernelFlag = singleKernelFlag;
943 #pragma offload target(mic:deviceNum) \
944 in(constants) nocopy(device__constants) \
945 nocopy(device__pe) nocopy(device__exclusion_bits) \
946 nocopy(device__exclusion_bits_copy) nocopy(device__exclusion_bits_size) \
947 nocopy(device__table_four_copy) nocopy(device__table_four_float_copy) \
948 nocopy(device__lj_table_copy) nocopy(device__lj_table_float_copy) \
949 nocopy(device__constants_copy)
954 __ASSERT(device__constants == NULL);
955 device__constants = (mic_constants*)_MM_MALLOC_WRAPPER(
sizeof(mic_constants), 64,
"device__constants");
956 __ASSERT(device__constants != NULL);
957 memcpy(device__constants, &constants,
sizeof(mic_constants));
961 #if MIC_HANDCODE_FORCE_SINGLE != 0
962 device__constants->r2_delta_expc = 64 * (device__constants->r2_delta_exp - 127);
966 #if MIC_DATA_STRUCT_VERIFY != 0
967 device__constants_copy = (mic_constants*)_MM_MALLOC_WRAPPER(
sizeof(mic_constants), 64,
"device__constants_copy");
968 __ASSERT(device__constants_copy != NULL);
969 memcpy(device__constants_copy, device__constants,
sizeof(mic_constants));
984 void mic_bind_patch_pairs_only(
const int deviceNum,
987 const int patch_pairs_bufSize
994 __FULL_CHECK(__ASSERT(deviceNum >= 0));
995 __FULL_CHECK(__ASSERT(patch_pairs != NULL));
996 __FULL_CHECK(__ASSERT(patch_pairs_size > 0));
997 __FULL_CHECK(__ASSERT(patch_pairs_bufSize > 0));
998 __FULL_CHECK(__ASSERT(patch_pairs_bufSize >= patch_pairs_size));
1002 if (patch_pairs_bufSize > host__patch_pairs_bufSize) {
1005 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
1006 double * device_times_computes = host__device_times_computes;
1007 #define TIMES_COMPUTES_FREE_CLAUSE nocopy(device_times_computes : alloc_if(0) free_if(1))
1009 #define TIMES_COMPUTES_FREE_CLAUSE
1013 if (host__patch_pairs != NULL) {
1015 const patch_pair * _patch_pairs = host__patch_pairs;
1016 #pragma offload target(mic:deviceNum) \
1017 nocopy(_patch_pairs : alloc_if(0) free_if(1)) \
1018 TIMES_COMPUTES_FREE_CLAUSE
1023 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
1024 if (host__device_times_computes != NULL) { _MM_FREE_WRAPPER(host__device_times_computes); }
1025 host__device_times_computes = (
double*)_MM_MALLOC_WRAPPER(2 * patch_pairs_bufSize *
sizeof(
double), 64,
"host__device_times_computes");
1026 __ASSERT(host__device_times_computes != NULL);
1027 device_times_computes = host__device_times_computes;
1028 #define TIMES_COMPUTES_ALLOC_CLAUSE nocopy(device_times_computes[0:2*patch_pairs_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN))
1030 #define TIMES_COMPUTES_ALLOC_CLAUSE
1034 host__patch_pairs = (patch_pair*)patch_pairs;
1035 host__patch_pairs_bufSize = patch_pairs_bufSize;
1038 #pragma offload target(mic:deviceNum) \
1039 in(patch_pairs_size) nocopy(device__patch_pairs_size) \
1040 in(patch_pairs_bufSize) \
1041 nocopy(patch_pairs[0:patch_pairs_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
1042 nocopy(device__pairlists) nocopy(device__pairlists_alloc_size) \
1043 nocopy(device__numOMPThreads) nocopy(device__pe) \
1044 TIMES_COMPUTES_ALLOC_CLAUSE
1050 __ASSERT(device__patch_pairs_size > 0);
1053 const int numPairlists = NUM_PAIRLIST_TYPES * device__patch_pairs_size;
1054 if (numPairlists > device__pairlists_alloc_size) {
1055 int **new_pairlists = (
int**)(_MM_MALLOC_WRAPPER(numPairlists *
sizeof(
int*), 64,
"device__pairlists"));
1057 if (device__pairlists != 0) {
1058 int **old_pairlists = (
int**)(device__pairlists);
1059 memcpy((
void*)new_pairlists, (
void*)old_pairlists,
sizeof(
int*) * device__pairlists_alloc_size);
1060 _MM_FREE_WRAPPER(old_pairlists);
1061 initStart = device__pairlists_alloc_size;
1063 for (
int i = initStart; i < numPairlists; i++) { new_pairlists[i] = NULL; }
1064 device__pairlists = (uintptr_t)new_pairlists;
1065 device__pairlists_alloc_size = numPairlists;
1070 #undef TIMES_COMPUTES_FREE_CLAUSE
1071 #undef TIMES_COMPUTES_ALLOC_CLAUSE
1088 void mic_bind_force_lists_only(
const int deviceNum,
1089 force_list *force_lists,
1090 const int force_lists_size,
1091 const int force_lists_bufSize
1098 __FULL_CHECK(__ASSERT(deviceNum >= 0));
1099 __FULL_CHECK(__ASSERT(force_lists != NULL));
1100 __FULL_CHECK(__ASSERT(force_lists_size > 0));
1101 __FULL_CHECK(__ASSERT(force_lists_bufSize > 0));
1102 __FULL_CHECK(__ASSERT(force_lists_bufSize >= force_lists_size));
1105 if (force_lists_bufSize > host__force_lists_bufSize) {
1108 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
1109 double * device_times_patches = host__device_times_patches;
1110 #define TIMES_PATCHES_FREE_CLAUSE nocopy(device_times_patches : alloc_if(0) free_if(1))
1112 #define TIMES_PATCHES_FREE_CLAUSE
1116 if (host__force_lists != NULL) {
1117 const force_list * _force_lists = host__force_lists;
1118 #pragma offload target(mic:deviceNum) \
1119 nocopy(_force_lists : alloc_if(0) free_if(1)) \
1120 TIMES_PATCHES_FREE_CLAUSE
1125 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
1126 if (host__device_times_patches != NULL) { _MM_FREE_WRAPPER(host__device_times_patches); }
1127 host__device_times_patches = (
double*)_MM_MALLOC_WRAPPER(2 * force_lists_bufSize *
sizeof(
double), 64,
"host__device_times_patches");
1128 __ASSERT(host__device_times_patches != NULL);
1129 device_times_patches = host__device_times_patches;
1130 #define TIMES_PATCHES_ALLOC_CLAUSE nocopy(device_times_patches[0:2*force_lists_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN))
1132 #define TIMES_PATCHES_ALLOC_CLAUSE
1136 host__force_lists = force_lists;
1137 host__force_lists_bufSize = force_lists_bufSize;
1140 #pragma offload target(mic:deviceNum) \
1141 in(force_lists_size) nocopy(device__force_lists_size) \
1142 nocopy(force_lists[0:force_lists_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
1143 TIMES_PATCHES_ALLOC_CLAUSE
1148 device__force_lists_size = force_lists_size;
1149 __ASSERT(device__force_lists_size > 0);
1152 #undef TIMES_PATCHES_FREE_CLAUSE
1153 #undef TIMES_PATCHES_ALLOC_CLAUSE
1158 host__force_lists_size = force_lists_size;
1159 force_lists_copySize = force_lists_size;
1174 void mic_bind_atoms_only(
const int deviceNum,
1180 const int atoms_bufSize
1187 __FULL_CHECK(__ASSERT(deviceNum >= 0));
1188 __FULL_CHECK(__ASSERT(atoms != NULL));
1189 __FULL_CHECK(__ASSERT(atom_params != NULL));
1190 __FULL_CHECK(__ASSERT(forces != NULL));
1191 __FULL_CHECK(__ASSERT(slow_forces != NULL));
1192 __FULL_CHECK(__ASSERT(atoms_size > 0));
1193 __FULL_CHECK(__ASSERT(atoms_bufSize > 0));
1194 __FULL_CHECK(__ASSERT(atoms_bufSize >= atoms_size));
1197 if (atoms_bufSize > host__atoms_bufSize) {
1200 if (host__atoms != NULL) {
1201 const atom * _atoms = host__atoms;
1202 const atom_param * _atom_params = host__atom_params;
1203 const double4 * _forces = host__forces;
1204 const double4 * _slow_forces = host__slow_forces;
1206 #pragma offload target(mic:deviceNum) \
1207 nocopy(_atoms : alloc_if(0) free_if(1)) \
1208 nocopy(_atom_params : alloc_if(0) free_if(1)) \
1209 nocopy(_forces : alloc_if(0) free_if(1)) \
1210 nocopy(_slow_forces : alloc_if(0) free_if(1))
1215 host__atoms =
atoms;
1220 host__atoms_bufSize = atoms_bufSize;
1223 #pragma offload target(mic:deviceNum) \
1224 in(atoms_size : into(device__atoms_size)) \
1225 nocopy(atoms[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
1226 nocopy(atom_params[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
1227 nocopy(forces[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
1228 nocopy(slow_forces[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN))
1244 void mic_bind_force_buffers_only(
const int deviceNum,
1245 const size_t force_buffers_size
1250 if (host__force_buffers_req_size < force_buffers_size) {
1251 host__force_buffers_req_size = (int)(force_buffers_size * 1.1f);
1252 host__force_buffers_req_size = (host__force_buffers_req_size + 4095) & (~4095);
1257 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1259 void mic_submit_patch_data(
const int deviceNum,
1260 void *
const hostPtr,
1261 void* &hostPtr_prev,
1262 const int transferBytes,
1263 const int allocBytes_host,
1264 int &allocBytes_device,
1265 uint64_t &devicePtr,
1279 if (hostPtr != hostPtr_prev || allocBytes_host > allocBytes_device) {
1282 if (devicePtr != 0) {
1283 char* __hostPtr_prev = (
char*)hostPtr_prev;
1284 #pragma offload target(mic:deviceNum) \
1285 out(__hostPtr_prev[0:0] : alloc_if(0) free_if(1))
1294 char* __hostPtr = (
char*)hostPtr;
1295 if (allocFlag != 0) {
1297 uint64_t __devicePtr = 0;
1298 allocBytes_device = allocBytes_host;
1299 #pragma offload target(mic:deviceNum) \
1300 in(__hostPtr[0:allocBytes_host] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
1302 { __devicePtr = (uint64_t)__hostPtr; }
1305 devicePtr = __devicePtr;
1310 #pragma offload_transfer target(mic:deviceNum) \
1311 in(__hostPtr[0:transferBytes] : alloc_if(0) free_if(0) align(MIC_ALIGN)) \
1319 hostPtr_prev = hostPtr;
1322 #endif // MIC_SUBMIT_ATOMS_ON_ARRIVAL
1333 #define NBTYPE NBPAIR
1346 #define NBTYPE NBSELF
1378 void mic_nonbonded_forces(
const int deviceNum,
1380 const int numLocalAtoms,
1381 const int numLocalComputes,
1382 const int numLocalPatches,
1383 const mic_position3_t
lata,
1384 const mic_position3_t
latb,
1385 const mic_position3_t
latc,
1388 const int usePairlists,
1389 const int savePairlists,
1390 const int atomsChanged
1393 const int numAtoms = host__atoms_size;
1394 const int numPatchPairs = host__patch_pairs_size;
1395 const int numForceLists = host__force_lists_size;
1398 int *tag_kernel = ((isRemote) ? (&tag_remote_kernel) : (&tag_local_kernel));
1402 mic_kernel_data * remote_kernel_data = host__kernel_data + 1;
1403 mic_kernel_data * local_kernel_data = host__kernel_data;
1404 mic_kernel_data * kernel_data = ((isRemote) ? (remote_kernel_data) : (local_kernel_data));
1405 mic_kernel_data * _kernel_data = host__kernel_data;
1406 #if MIC_SYNC_INPUT != 0
1409 if (singleKernelFlag != 0 || isRemote) {
1410 remote_kernel_data->isRemote = 1;
1411 local_kernel_data->isRemote = 0;
1412 remote_kernel_data->numLocalAtoms = local_kernel_data->numLocalAtoms = numLocalAtoms;
1413 remote_kernel_data->numLocalComputes = local_kernel_data->numLocalComputes = numLocalComputes;
1414 remote_kernel_data->numLocalPatches = local_kernel_data->numLocalPatches = numLocalPatches;
1415 remote_kernel_data->doSlow = local_kernel_data->doSlow = doSlow;
1416 remote_kernel_data->doEnergy = local_kernel_data->doEnergy = doEnergy;
1417 remote_kernel_data->usePairlists = local_kernel_data->usePairlists = usePairlists;
1418 remote_kernel_data->savePairlists = local_kernel_data->savePairlists = savePairlists;
1419 remote_kernel_data->lata.x = local_kernel_data->lata.x = lata.x;
1420 remote_kernel_data->lata.y = local_kernel_data->lata.y = lata.y;
1421 remote_kernel_data->lata.z = local_kernel_data->lata.z = lata.z;
1422 remote_kernel_data->latb.x = local_kernel_data->latb.x = latb.x;
1423 remote_kernel_data->latb.y = local_kernel_data->latb.y = latb.y;
1424 remote_kernel_data->latb.z = local_kernel_data->latb.z = latb.z;
1425 remote_kernel_data->latc.x = local_kernel_data->latc.x = latc.x;
1426 remote_kernel_data->latc.y = local_kernel_data->latc.y = latc.y;
1427 remote_kernel_data->latc.z = local_kernel_data->latc.z = latc.z;
1428 remote_kernel_data->numAtoms = local_kernel_data->numAtoms = numAtoms;
1429 remote_kernel_data->numPatchPairs = local_kernel_data->numPatchPairs = numPatchPairs;
1430 remote_kernel_data->numForceLists = local_kernel_data->numForceLists = numForceLists;
1431 remote_kernel_data->forceBuffersReqSize = local_kernel_data->forceBuffersReqSize = host__force_buffers_req_size;
1434 kernel_data->isRemote = isRemote;
1435 kernel_data->numLocalAtoms = numLocalAtoms;
1436 kernel_data->numLocalComputes = numLocalComputes;
1437 kernel_data->numLocalPatches = numLocalPatches;
1438 kernel_data->doSlow = doSlow;
1439 kernel_data->doEnergy = doEnergy;
1440 kernel_data->usePairlists = usePairlists;
1441 kernel_data->savePairlists = savePairlists;
1442 kernel_data->lata.x = lata.x;
1443 kernel_data->lata.y = lata.y;
1444 kernel_data->lata.z = lata.z;
1445 kernel_data->latb.x = latb.x;
1446 kernel_data->latb.y = latb.y;
1447 kernel_data->latb.z = latb.z;
1448 kernel_data->latc.x = latc.x;
1449 kernel_data->latc.y = latc.y;
1450 kernel_data->latc.z = latc.z;
1451 kernel_data->numAtoms = numAtoms;
1452 kernel_data->numPatchPairs = numPatchPairs;
1453 kernel_data->numForceLists = numForceLists;
1454 kernel_data->forceBuffersReqSize = host__force_buffers_req_size;
1460 const int slowForcesNumAtoms = ((doSlow) ? (numAtoms) : (0));
1461 const int _patch_pairs_copySize = patch_pairs_copySize;
1462 const int _force_lists_copySize = force_lists_copySize;
1463 const int _atom_params_copySize = atom_params_copySize;
1464 patch_pairs_copySize = 0;
1465 force_lists_copySize = 0;
1466 atom_params_copySize = 0;
1470 int toCopySize_atoms = (singleKernelFlag != 0 || isRemote) ? (numAtoms) : (0);
1471 int toCopySize_atom_params = (singleKernelFlag != 0 || isRemote) ? (_atom_params_copySize) : (0);
1472 int toCopySize_patch_pairs = (singleKernelFlag != 0 || isRemote) ? (_patch_pairs_copySize) : (0);
1473 int toCopySize_force_lists = (singleKernelFlag != 0 || isRemote) ? (_force_lists_copySize) : (0);
1474 int toCopyStart_forces = (singleKernelFlag != 0) ? (0) : ((isRemote) ? (numLocalAtoms) : (0));
1475 int toCopySize_forces = (singleKernelFlag != 0) ? (numAtoms) : ((isRemote) ? (numAtoms - numLocalAtoms) : (numLocalAtoms));
1476 int toCopyStart_slow_forces = (doSlow) ? (toCopyStart_forces) : (0);
1477 int toCopySize_slow_forces = (doSlow) ? (toCopySize_forces) : (0);
1481 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
1482 double * device_times_start = host__device_times_start;
1483 int toCopySize_device_times_start = ((isRemote) ? (0) : (10));
1484 #if MIC_DEVICE_TRACING_DETAILED != 0
1485 double * device_times_computes = host__device_times_computes;
1486 double * device_times_patches = host__device_times_patches;
1487 int toCopySize_device_times_computes = ((isRemote) ? (0) : (2 * numPatchPairs));
1488 int toCopySize_device_times_patches = ((isRemote) ? (0) : (2 * numForceLists));
1489 #if MIC_SYNC_OUTPUT != 0
1490 #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
1491 in(device_times_computes[0:0] : alloc_if(0) free_if(0)) \
1492 in(device_times_patches[0:0] : alloc_if(0) free_if(0)) \
1493 nocopy(device__device_times_computes) nocopy(device__device_times_patches)
1495 #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
1496 out(device_times_computes[0:toCopySize_device_times_computes] : alloc_if(0) free_if(0)) \
1497 out(device_times_patches[0:toCopySize_device_times_patches] : alloc_if(0) free_if(0)) \
1498 nocopy(device__device_times_computes) nocopy(device__device_times_patches)
1501 #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
1503 #if MIC_SYNC_OUTPUT != 0
1504 #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
1505 MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
1506 in(device_times_start[0:0] : alloc_if(0) free_if(0)) \
1507 nocopy(device__device_times_start)
1509 #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
1510 MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
1511 out(device_times_start[0:toCopySize_device_times_start] : alloc_if(0) free_if(0)) \
1512 nocopy(device__device_times_start)
1515 #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE
1519 atom * atoms = host__atoms;
1520 atom_param * atom_params = host__atom_params;
1521 double4 * forces = host__forces;
1522 double4 * slow_forces = host__slow_forces;
1523 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1524 #define MIC_DEVICE_ATOMS_CLAUSE
1526 #define MIC_DEVICE_ATOMS_CLAUSE \
1527 in(atoms[0:toCopySize_atoms] : alloc_if(0) free_if(0)) \
1528 in(atom_params[0:toCopySize_atom_params] : alloc_if(0) free_if(0))
1534 #if MIC_KERNEL_DATA_TRANSFER_STATS != 0
1535 int transfer_in =
sizeof(isRemote)
1536 + (3 *
sizeof(lata))
1538 + (
sizeof(atom) * toCopySize_atoms)
1539 + (
sizeof(atom_param) * toCopySize_atom_params)
1540 + (
sizeof(patch_pair) * toCopySize_patch_pairs)
1541 + (
sizeof(force_list) * toCopySize_force_lists);
1542 int transfer_inout =
sizeof(mic_kernel_data);
1543 int transfer_out =
sizeof(double4) * (toCopySize_forces + toCopySize_slow_forces)
1546 MICP(
"timestep:%06d - transfering %d bytes (isRemote:%d, in:%d, inout:%d, out:%d)\n", \
1547 host__timestep, transfer_in + transfer_inout + transfer_out, \
1548 isRemote, transfer_in, transfer_inout, transfer_out \
1551 printf(
"[MIC-DEBUG] :: PE %04d-%05d-%1d :: transfering %d bytes (in:%d, inout:%d, out:%d)\n",
1552 host__pe, host__timestep, isRemote,
1553 transfer_in + transfer_inout + transfer_out, transfer_in, transfer_inout, transfer_out
1561 patch_pair * patch_pairs = host__patch_pairs;
1562 force_list * force_lists = host__force_lists;
1563 #if MIC_SYNC_INPUT != 0
1566 if (singleKernelFlag != 0 || isRemote != 0) {
1567 MICP(
"<< pushing input data >>\n"); MICPF;
1568 #if MIC_TRACING != 0
1569 double input_start = CmiWallTimer();
1571 #pragma offload_transfer target(mic:deviceNum) \
1572 in(patch_pairs[0:toCopySize_patch_pairs] : alloc_if(0) free_if(0)) \
1573 in(force_lists[0:toCopySize_force_lists] : alloc_if(0) free_if(0)) \
1574 in(_kernel_data[0:2] : alloc_if(0) free_if(0)) \
1575 MIC_DEVICE_ATOMS_CLAUSE
1577 #if MIC_TRACING != 0
1578 double input_end = CmiWallTimer();
1579 MIC_TRACING_RECORD(MIC_EVENT_SYNC_INPUT_PRAGMA, input_start, input_end);
1584 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1585 #define TMP_SYNC_ATOMS_CLAUSE
1587 #define TMP_SYNC_ATOMS_CLAUSE \
1588 in(atoms[0:0] : alloc_if(0) free_if(0)) \
1589 in(atom_params[0:0] : alloc_if(0) free_if(0))
1592 #if MIC_SYNC_OUTPUT != 0
1593 #define TMP_SYNC_KERNEL_DATA_CLAUSE \
1594 in(_kernel_data[0:0] : alloc_if(0) free_if(0))
1596 #define TMP_SYNC_KERNEL_DATA_CLAUSE \
1597 out(_kernel_data[isRemote:1] : alloc_if(0) free_if(0))
1600 #define MIC_DEVICE_SYNC_INPUT_CLAUSE \
1601 nocopy(device__lata) nocopy(device__latb) nocopy(device__latc) \
1602 nocopy(device__atoms_size) \
1603 in(patch_pairs[0:0] : alloc_if(0) free_if(0)) nocopy(device__patch_pairs_size) \
1604 in(force_lists[0:0] : alloc_if(0) free_if(0)) nocopy(device__force_lists_size) \
1605 TMP_SYNC_KERNEL_DATA_CLAUSE \
1606 TMP_SYNC_ATOMS_CLAUSE
1610 #if MIC_SYNC_OUTPUT != 0
1611 #define TMP_SYNC_KERNEL_DATA_CLAUSE \
1612 in(_kernel_data[isRemote:1] : alloc_if(0) free_if(0))
1614 #define TMP_SYNC_KERNEL_DATA_CLAUSE \
1615 inout(_kernel_data[isRemote:1] : alloc_if(0) free_if(0))
1619 #define MIC_DEVICE_SYNC_INPUT_CLAUSE \
1620 nocopy(device__lata) nocopy(device__latb) nocopy(device__latc) \
1621 in(patch_pairs[0:toCopySize_patch_pairs] : alloc_if(0) free_if(0)) \
1622 in(force_lists[0:toCopySize_force_lists] : alloc_if(0) free_if(0)) \
1623 nocopy(device__atoms_size) nocopy(device__patch_pairs_size) nocopy(device__force_lists_size) \
1624 TMP_SYNC_KERNEL_DATA_CLAUSE \
1625 MIC_DEVICE_ATOMS_CLAUSE
1629 MICP(
"<< issuing %s kernel >>\n", (isRemote) ? (
"remote") : (
"local")); MICPF;
1631 #if MIC_SYNC_OUTPUT != 0
1632 #define MIC_DEVICE_SYNC_OUTPUT_CLAUSE \
1633 in(forces[0:0] : alloc_if(0) free_if(0)) \
1634 in(slow_forces[0:0] : alloc_if(0) free_if(0)) \
1635 nocopy(device__timestep)
1637 #define MIC_DEVICE_SYNC_OUTPUT_CLAUSE \
1638 out(forces[toCopyStart_forces:toCopySize_forces] : alloc_if(0) free_if(0)) \
1639 out(slow_forces[toCopyStart_slow_forces:toCopySize_slow_forces] : alloc_if(0) free_if(0)) \
1640 out(device__timestep : into(host__timestep))
1643 #if MIC_DATA_STRUCT_VERIFY != 0
1644 #define MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE \
1645 nocopy(device__exclusion_bits_copy) nocopy(device__constants_copy) \
1646 nocopy(device__table_four_copy) nocopy(device__table_four_float_copy) \
1647 nocopy(device__lj_table_copy) nocopy(device__lj_table_float_copy) \
1648 nocopy(device__patch_pairs_copy) nocopy(device__force_lists_copy) \
1649 nocopy(device__atoms_copy) nocopy(device__atom_params_copy) \
1650 nocopy(device__patch_pairs_copy_size) nocopy(device__force_lists_copy_size) \
1651 nocopy(device__atoms_copy_size) nocopy(device__atom_params_copy_size)
1653 #define MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE
1656 #if REFINE_PAIRLISTS != 0
1657 #define MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE \
1658 nocopy(device__r2_array) \
1659 nocopy(device__pl_array) \
1660 nocopy(device__pl_size)
1662 #define MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE
1665 #if MIC_TRACING != 0
1666 double pragma_start = CmiWallTimer();
1670 #pragma offload target(mic:deviceNum) \
1672 MIC_DEVICE_SYNC_INPUT_CLAUSE \
1673 MIC_DEVICE_SYNC_OUTPUT_CLAUSE \
1674 MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
1675 nocopy(device__force_buffers) nocopy(device__slow_force_buffers) \
1676 nocopy(device__force_buffers_alloc_size) nocopy(device__force_buffers_req_size) \
1677 nocopy(device__pairlists) nocopy(device__pairlists_alloc_size) \
1678 nocopy(device__table_four) nocopy(device__table_four_float) nocopy(device__table_four_n_16) \
1679 nocopy(device__lj_table) nocopy(device__lj_table_float) nocopy(device__lj_table_dim) \
1680 nocopy(device__exclusion_bits) \
1681 nocopy(device__constants) \
1682 nocopy(device__atoms) nocopy(device__atom_params) \
1683 nocopy(device__forces) nocopy(device__slow_forces) \
1684 nocopy(device__patch_pairs) nocopy(device__force_lists) \
1685 nocopy(device__numOMPThreads) \
1686 MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE \
1687 MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE \
1688 nocopy(device__pe) nocopy(device__node) \
1689 DEVICE_FPRINTF_CLAUSE
1694 __FULL_CHECK(__ASSERT(isRemote == 0 || isRemote == 1));
1695 mic_kernel_data * kernel_data = _kernel_data + isRemote;
1696 __FULL_CHECK(__ASSERT(kernel_data != NULL));
1697 __FULL_CHECK(__ASSERT(device__numOMPThreads > 0));
1699 #if (MIC_DEVICE_FPRINTF != 0) && (MIC_DEVICE_FPRINTF_REOPEN_FREQ > 0)
1700 if ((device__timestep != 0 && device__timestep % MIC_DEVICE_FPRINTF_REOPEN_FREQ == 0) && (kernel_data->isRemote == 0)) {
1701 char filename[128] = { 0 };
1702 sprintf(filename,
"/tmp/namd_deviceDebugInfo.%d", device__pe);
1703 if (device__node <= 0) {
1704 printf(
"[MIC-DEBUG] :: Reopening debug output files on MIC devices (timestep: %d).\n", device__timestep);
1707 fclose(device__fout);
1708 device__fout = fopen(filename,
"w");
1709 DEVICE_FPRINTF(
"Device file on PE %d (node: %d) - reopen (timestep: %d)...\n", device__pe, device__node, device__timestep);
1713 DEVICE_FPRINTF(
"%d %s ", device__timestep, (kernel_data->isRemote != 0 ?
"R" :
"L"));
1718 device__force_lists = force_lists;
1719 device__atoms =
atoms;
1723 __ASSERT(device__patch_pairs != NULL);
1724 __ASSERT(device__force_lists != NULL);
1725 __ASSERT(device__atoms != NULL);
1726 __ASSERT(device__atom_params != NULL);
1727 __ASSERT(device__forces != NULL);
1728 __ASSERT(device__slow_forces != NULL);
1729 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
1730 device__device_times_start = device_times_start;
1731 __ASSERT(device__device_times_start);
1732 #if MIC_DEVICE_TRACING_DETAILED != 0
1733 device__device_times_computes = device_times_computes;
1734 device__device_times_patches = device_times_patches;
1735 __ASSERT(device__device_times_computes);
1736 __ASSERT(device__device_times_patches);
1739 device__lata.x = kernel_data->lata.x;
1740 device__lata.y = kernel_data->lata.y;
1741 device__lata.z = kernel_data->lata.z;
1742 device__latb.x = kernel_data->latb.x;
1743 device__latb.y = kernel_data->latb.y;
1744 device__latb.z = kernel_data->latb.z;
1745 device__latc.x = kernel_data->latc.x;
1746 device__latc.y = kernel_data->latc.y;
1747 device__latc.z = kernel_data->latc.z;
1748 device__atoms_size = kernel_data->numAtoms;
1749 device__patch_pairs_size = kernel_data->numPatchPairs;
1750 device__force_lists_size = kernel_data->numForceLists;
1751 device__force_buffers_req_size = kernel_data->forceBuffersReqSize;
1754 #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
1755 __ASSERT(device__device_times_start != NULL);
1756 device__device_times_start[((isRemote)?(0):(1))] = getCurrentTime();
1762 #if MIC_DATA_STRUCT_VERIFY != 0
1764 _verify_tables(device__pe, device__timestep, isRemote, -1);
1766 _verify_remalloc<patch_pair>(device__patch_pairs_copy_size, device__patch_pairs_size, device__patch_pairs_copy);
1767 _verify_remalloc<force_list>(device__force_lists_copy_size, device__force_lists_size, device__force_lists_copy);
1768 _verify_remalloc<atom>(device__atoms_copy_size, device__atoms_size, device__atoms_copy);
1769 _verify_remalloc<atom_param>(device__atom_params_copy_size, device__atoms_size, device__atom_params_copy);
1770 __ASSERT(device__patch_pairs_copy != NULL && device__force_lists_copy != NULL);
1771 __ASSERT(device__atoms_copy != NULL && device__atom_params_copy != NULL);
1773 _verify_parallel_memcpy(device__patch_pairs_copy, device__patch_pairs,
sizeof(patch_pair) * device__patch_pairs_size);
1774 _verify_parallel_memcpy(device__force_lists_copy, device__force_lists,
sizeof(force_list) * device__force_lists_size);
1775 _verify_parallel_memcpy(device__atoms_copy, device__atoms,
sizeof(atom) * device__atoms_size);
1776 _verify_parallel_memcpy(device__atom_params_copy, device__atom_params,
sizeof(atom_param) * device__atoms_size);
1780 if (device__force_buffers_req_size > device__force_buffers_alloc_size) {
1781 if (device__force_buffers != NULL) { _MM_FREE_WRAPPER(device__force_buffers); }
1782 if (device__slow_force_buffers != NULL) { _MM_FREE_WRAPPER(device__slow_force_buffers); }
1783 device__force_buffers_alloc_size = device__force_buffers_req_size;
1784 size_t reqSize =
sizeof(double) * 4 * device__force_buffers_alloc_size;
1785 device__force_buffers = (double4*)(_MM_MALLOC_WRAPPER(reqSize, 64,
"device__force_buffers"));
1786 device__slow_force_buffers = (double4*)(_MM_MALLOC_WRAPPER(reqSize, 64,
"device__slow_force_buffers"));
1787 __ASSERT(device__force_buffers != NULL && device__slow_force_buffers != NULL);
1798 double fullElectVirial_xx = 0.0;
1799 double fullElectVirial_xy = 0.0;
1800 double fullElectVirial_xz = 0.0;
1801 double fullElectVirial_yy = 0.0;
1802 double fullElectVirial_yz = 0.0;
1803 double fullElectVirial_zz = 0.0;
1804 double vdwEnergy = 0.0;
1806 double fullElectEnergy = 0.0;
1807 #if MIC_EXCL_CHECKSUM_FULL != 0
1808 int exclusionSum = 0;
1812 const int ppI_start = (isRemote) ? (kernel_data->numLocalComputes) : (0);
1813 const int ppI_end = (device__constants->singleKernelFlag != 0 || isRemote) ? (device__patch_pairs_size) : (kernel_data->numLocalComputes);
1814 __FULL_CHECK(__ASSERT(ppI_start >= 0 && ppI_start <= device__patch_pairs_size));
1815 __FULL_CHECK(__ASSERT(ppI_end >= 0 && ppI_end <= device__patch_pairs_size));
1816 __FULL_CHECK(__ASSERT(ppI_start <= ppI_end));
1819 #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
1820 device__device_times_start[((isRemote)?(2):(3))] = getCurrentTime();
1823 DEVICE_FPRINTF(
" C(%d,%d)", ppI_start, ppI_end);
1827 #if MULTIPLE_THREADS != 0
1828 #if MIC_EXCL_CHECKSUM_FULL != 0
1829 #define EXCL_CHECKSUM_CLAUSE reduction(+ : exclusionSum)
1831 #define EXCL_CHECKSUM_CLAUSE
1833 #pragma omp parallel for schedule(dynamic, 1) \
1834 reduction(+ : vdwEnergy, electEnergy, fullElectEnergy) \
1835 reduction(+ : virial_xx, virial_xy, virial_xz, virial_yy, virial_yz, virial_zz) \
1836 reduction(+ : fullElectVirial_xx, fullElectVirial_xy, fullElectVirial_xz, fullElectVirial_yy, fullElectVirial_yz, fullElectVirial_zz ) \
1837 EXCL_CHECKSUM_CLAUSE
1838 #undef EXCL_CHECKSUM_CLAUSE
1840 for (
int ppI = ppI_start; ppI < ppI_end; ppI++) {
1842 DEVICE_FPRINTF(
" C");
1845 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
1846 __FULL_CHECK(__ASSERT(device__device_times_computes != NULL));
1847 device__device_times_computes[ppI * 2] = getCurrentTime();
1854 params.pp = (patch_pair*)(device__patch_pairs + ppI);
1855 __FULL_CHECK(__ASSERT(params.pp != NULL));
1857 #if MIC_SORT_LISTS != 0
1858 const int abSwapFlag = ((params.pp->patch1_size < params.pp->patch2_size) ? (1) : (0));
1859 #define ABSWAP(t, f) ((abSwapFlag)?(t):(f))
1861 #define ABSWAP(t, f) (f)
1866 params.p1 = ABSWAP(params.pp->p2, params.pp->p1);
1867 params.p2 = ABSWAP(params.pp->p1, params.pp->p2);
1868 params.pe = device__pe;
1869 params.timestep = device__timestep;
1877 #if REFINE_PAIRLISTS != 0
1878 const int threadIndex = omp_get_thread_num();
1879 params.plArray_ptr = device__pl_array + threadIndex;
1880 params.plSize_ptr = device__pl_size + threadIndex;
1881 params.r2Array_ptr = device__r2_array + threadIndex;
1886 #if MIC_HANDCODE_FORCE_SINGLE != 0
1887 params.table_four_base_ptr = (
void*)device__table_four_float;
1888 params.lj_table_base_ptr = (
void*)device__lj_table_float;
1890 params.table_four_base_ptr = (
void*)device__table_four;
1891 params.lj_table_base_ptr = (
void*)device__lj_table;
1893 params.table_four_n_16 = device__table_four_n_16;
1894 params.lj_table_dim = device__lj_table_dim;
1895 params.exclusion_bits = device__exclusion_bits;
1896 params.constants = device__constants;
1897 __FULL_CHECK(__ASSERT(params.table_four_base_ptr != NULL));
1898 __FULL_CHECK(__ASSERT(params.lj_table_base_ptr != NULL));
1899 __FULL_CHECK(__ASSERT(params.table_four_n_16 % 16 == 0));
1900 __FULL_CHECK(__ASSERT(params.exclusion_bits != NULL));
1901 __FULL_CHECK(__ASSERT(params.constants != NULL));
1905 params.usePairlists = kernel_data->usePairlists;
1906 params.savePairlists = kernel_data->savePairlists;
1907 params.pairlists_ptr = &(((
int**)device__pairlists)[NUM_PAIRLIST_TYPES * ppI]);
1908 __FULL_CHECK(__ASSERT(device__pairlists != NULL));
1909 __FULL_CHECK(__ASSERT(params.pairlists_ptr != NULL));
1912 int n0 = ABSWAP(params.pp->patch2_size, params.pp->patch1_size);
1913 int n1 = ABSWAP(params.pp->patch1_size, params.pp->patch2_size);
1914 int n0_16 = (n0 + 15) & (~15);
1915 int n1_16 = (n1 + 15) & (~15);
1916 params.numAtoms[0] = n0;
1917 params.numAtoms[1] = n1;
1918 params.numAtoms_16[0] = n0_16;
1919 params.numAtoms_16[1] = n1_16;
1920 __FULL_CHECK(__ASSERT(params.numAtoms[0] >= 0));
1921 __FULL_CHECK(__ASSERT(params.numAtoms[1] >= 0));
1929 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1930 mic_position_t* pBase0 = (mic_position_t*)(params.pp->patch1_atomDataPtr);
1931 mic_position_t* pBase1 = (mic_position_t*)(params.pp->patch2_atomDataPtr);
1932 int *pExtBase0 = (
int*)(pBase0 + n0_16);
1933 int *pExtBase1 = (
int*)(pBase1 + n1_16);
1935 __FULL_CHECK(__ASSERT(device__atoms != NULL));
1936 __FULL_CHECK(__ASSERT(device__atom_params != NULL));
1938 __FULL_CHECK(__ASSERT(params.pp->patch1_atom_start >= 0));
1939 __FULL_CHECK(__ASSERT((params.pp->patch1_atom_start < device__atoms_size) || (params.pp->patch1_atom_start == device__atoms_size && n0 == 0)));
1940 __FULL_CHECK(__ASSERT(params.pp->patch2_atom_start >= 0));
1941 __FULL_CHECK(__ASSERT((params.pp->patch2_atom_start < device__atoms_size) || (params.pp->patch2_atom_start == device__atoms_size && n1 == 0)));
1943 mic_position_t* pBase0 = (mic_position_t*)(device__atoms + ABSWAP(params.pp->patch2_atom_start, params.pp->patch1_atom_start));
1944 mic_position_t* pBase1 = (mic_position_t*)(device__atoms + ABSWAP(params.pp->patch1_atom_start, params.pp->patch2_atom_start));
1945 int* pExtBase0 = (
int*)(device__atom_params + ABSWAP(params.pp->patch2_atom_start, params.pp->patch1_atom_start));
1946 int* pExtBase1 = (
int*)(device__atom_params + ABSWAP(params.pp->patch1_atom_start, params.pp->patch2_atom_start));
1948 __FULL_CHECK(__ASSERT(pBase0 != NULL));
1949 __FULL_CHECK(__ASSERT(pBase1 != NULL));
1950 __FULL_CHECK(__ASSERT(pExtBase0 != NULL));
1951 __FULL_CHECK(__ASSERT(pExtBase1 != NULL));
1952 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1953 params.p[0] = (atom*)pBase0;
1954 params.p[1] = (atom*)pBase1;
1955 params.pExt[0] = (atom_param*)pExtBase0;
1956 params.pExt[1] = (atom_param*)pExtBase1;
1958 params.p_x[0] = pBase0 + (0 * n0_16); params.p_x[1] = pBase1 + (0 * n1_16);
1959 params.p_y[0] = pBase0 + (1 * n0_16); params.p_y[1] = pBase1 + (1 * n1_16);
1960 params.p_z[0] = pBase0 + (2 * n0_16); params.p_z[1] = pBase1 + (2 * n1_16);
1961 params.p_q[0] = pBase0 + (3 * n0_16); params.p_q[1] = pBase1 + (3 * n1_16);
1962 params.pExt_vdwType[0] = pExtBase0 + (0 * n0_16);
1963 params.pExt_index[0] = pExtBase0 + (1 * n0_16);
1964 params.pExt_exclIndex[0] = pExtBase0 + (2 * n0_16);
1965 params.pExt_exclMaxDiff[0] = pExtBase0 + (3 * n0_16);
1966 params.pExt_vdwType[1] = pExtBase1 + (0 * n1_16);
1967 params.pExt_index[1] = pExtBase1 + (1 * n1_16);
1968 params.pExt_exclIndex[1] = pExtBase1 + (2 * n1_16);
1969 params.pExt_exclMaxDiff[1] = pExtBase1 + (3 * n1_16);
1975 __FULL_CHECK(__ASSERT(device__force_lists != NULL));
1976 __FULL_CHECK(__ASSERT(device__force_buffers != NULL));
1977 __FULL_CHECK(__ASSERT(device__slow_force_buffers != NULL));
1978 __FULL_CHECK(__ASSERT(params.pp->patch1_force_list_index < device__force_lists_size));
1979 __FULL_CHECK(__ASSERT(params.pp->patch2_force_list_index < device__force_lists_size));
1980 __FULL_CHECK(__ASSERT(params.pp->patch1_force_start < device__force_buffers_req_size));
1981 __FULL_CHECK(__ASSERT(params.pp->patch2_force_start < device__force_buffers_req_size));
1982 params.doSlow = kernel_data->doSlow;
1983 params.fl[0] = (force_list*)(device__force_lists + ABSWAP(params.pp->patch2_force_list_index, params.pp->patch1_force_list_index));
1984 params.fl[1] = (force_list*)(device__force_lists + ABSWAP(params.pp->patch1_force_list_index, params.pp->patch2_force_list_index));
1985 double *ffBase0 = (
double*)(device__force_buffers + ABSWAP(params.pp->patch2_force_start, params.pp->patch1_force_start));
1986 double *ffBase1 = (
double*)(device__force_buffers + ABSWAP(params.pp->patch1_force_start, params.pp->patch2_force_start));
1987 double *fullfBase0 = (
double*)(device__slow_force_buffers + ABSWAP(params.pp->patch2_force_start, params.pp->patch1_force_start));
1988 double *fullfBase1 = (
double*)(device__slow_force_buffers + ABSWAP(params.pp->patch1_force_start, params.pp->patch2_force_start));
1989 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1990 params.ff[0] = (double4*)ffBase0;
1991 params.ff[1] = (double4*)ffBase1;
1992 params.fullf[0] = (double4*)fullfBase0;
1993 params.fullf[1] = (double4*)fullfBase1;
1994 __ASSERT(params.ff[0] != NULL);
1995 __ASSERT(params.ff[1] != NULL);
1996 __ASSERT(params.fullf[0] != NULL);
1997 __ASSERT(params.fullf[1] != NULL);
1999 params.ff_x[0] = ffBase0 + (0 * n0_16); params.ff_x[1] = ffBase1 + (0 * n1_16);
2000 params.ff_y[0] = ffBase0 + (1 * n0_16); params.ff_y[1] = ffBase1 + (1 * n1_16);
2001 params.ff_z[0] = ffBase0 + (2 * n0_16); params.ff_z[1] = ffBase1 + (2 * n1_16);
2002 params.ff_w[0] = ffBase0 + (3 * n0_16); params.ff_w[1] = ffBase1 + (3 * n1_16);
2003 params.fullf_x[0] = fullfBase0 + (0 * n0_16); params.fullf_x[1] = fullfBase1 + (0 * n1_16);
2004 params.fullf_y[0] = fullfBase0 + (1 * n0_16); params.fullf_y[1] = fullfBase1 + (1 * n1_16);
2005 params.fullf_z[0] = fullfBase0 + (2 * n0_16); params.fullf_z[1] = fullfBase1 + (2 * n1_16);
2006 params.fullf_w[0] = fullfBase0 + (3 * n0_16); params.fullf_w[1] = fullfBase1 + (3 * n1_16);
2007 __ASSERT(params.ff_x[0] != NULL); __ASSERT(params.ff_x[1] != NULL);
2008 __ASSERT(params.ff_y[0] != NULL); __ASSERT(params.ff_y[1] != NULL);
2009 __ASSERT(params.ff_z[0] != NULL); __ASSERT(params.ff_z[1] != NULL);
2010 __ASSERT(params.ff_w[0] != NULL); __ASSERT(params.ff_w[1] != NULL);
2011 __ASSERT(params.fullf_x[0] != NULL); __ASSERT(params.fullf_x[1] != NULL);
2012 __ASSERT(params.fullf_y[0] != NULL); __ASSERT(params.fullf_y[1] != NULL);
2013 __ASSERT(params.fullf_z[0] != NULL); __ASSERT(params.fullf_z[1] != NULL);
2014 __ASSERT(params.fullf_w[0] != NULL); __ASSERT(params.fullf_w[1] != NULL);
2021 params.offset.x = params.pp->offset.x * device__lata.x
2022 + params.pp->offset.y * device__latb.x
2023 + params.pp->offset.z * device__latc.x;
2024 params.offset.y = params.pp->offset.x * device__lata.y
2025 + params.pp->offset.y * device__latb.y
2026 + params.pp->offset.z * device__latc.y;
2027 params.offset.z = params.pp->offset.x * device__lata.z
2028 + params.pp->offset.y * device__latb.z
2029 + params.pp->offset.z * device__latc.z;
2030 #if MIC_SORT_LISTS != 0
2031 params.offset.x *= ABSWAP(-1.0f, 1.0f);
2032 params.offset.y *= ABSWAP(-1.0f, 1.0f);
2033 params.offset.z *= ABSWAP(-1.0f, 1.0f);
2037 float patch1_center_x = params.pp->patch1_center_x * device__lata.x
2038 + params.pp->patch1_center_y * device__latb.x
2039 + params.pp->patch1_center_z * device__latc.x;
2040 float patch1_center_y = params.pp->patch1_center_x * device__lata.y
2041 + params.pp->patch1_center_y * device__latb.y
2042 + params.pp->patch1_center_z * device__latc.y;
2043 float patch1_center_z = params.pp->patch1_center_x * device__lata.z
2044 + params.pp->patch1_center_y * device__latb.z
2045 + params.pp->patch1_center_z * device__latc.z;
2046 float patch2_center_x = params.pp->patch2_center_x * device__lata.x
2047 + params.pp->patch2_center_y * device__latb.x
2048 + params.pp->patch2_center_z * device__latc.x;
2049 float patch2_center_y = params.pp->patch2_center_x * device__lata.y
2050 + params.pp->patch2_center_y * device__latb.y
2051 + params.pp->patch2_center_z * device__latc.y;
2052 float patch2_center_z = params.pp->patch2_center_x * device__lata.z
2053 + params.pp->patch2_center_y * device__latb.z
2054 + params.pp->patch2_center_z * device__latc.z;
2055 params.patch1_center_x = ABSWAP(patch2_center_x, patch1_center_x);
2056 params.patch2_center_x = ABSWAP(patch1_center_x, patch2_center_x);
2057 params.patch1_center_y = ABSWAP(patch2_center_y, patch1_center_y);
2058 params.patch2_center_y = ABSWAP(patch1_center_y, patch2_center_y);
2059 params.patch1_center_z = ABSWAP(patch2_center_z, patch1_center_z);
2060 params.patch2_center_z = ABSWAP(patch1_center_z, patch2_center_z);
2063 params.virial_xx = 0;
2064 params.virial_xy = 0;
2065 params.virial_xz = 0;
2066 params.virial_yy = 0;
2067 params.virial_yz = 0;
2068 params.virial_zz = 0;
2069 params.fullElectVirial_xx = 0;
2070 params.fullElectVirial_xy = 0;
2071 params.fullElectVirial_xz = 0;
2072 params.fullElectVirial_yy = 0;
2073 params.fullElectVirial_yz = 0;
2074 params.fullElectVirial_zz = 0;
2075 params.vdwEnergy = 0.0;
2076 params.electEnergy = 0.0;
2077 params.fullElectEnergy = 0.0;
2078 #if MIC_EXCL_CHECKSUM_FULL != 0
2079 params.exclusionSum = 0;
2084 int isSelf = (params.pp->patch1_force_start == params.pp->patch2_force_start);
2086 int selfBit = ((isSelf) ? (0x01) : (0x00));
2087 int doSlowBit = ((kernel_data->doSlow) ? (0x02) : (0x00));
2088 int doEnergyBit = ((kernel_data->doEnergy) ? (0x04) : (0x00));
2089 int kernelSelect = selfBit | doSlowBit | doEnergyBit;
2090 DEVICE_FPRINTF(
"%d", kernelSelect);
2091 switch (kernelSelect) {
2092 case 0x00: calc_pair(params);
break;
2093 case 0x01: calc_self(params);
break;
2094 case 0x02: calc_pair_fullelect(params);
break;
2095 case 0x03: calc_self_fullelect(params);
break;
2096 case 0x04: calc_pair_energy(params);
break;
2097 case 0x05: calc_self_energy(params);
break;
2098 case 0x06: calc_pair_energy_fullelect(params);
break;
2099 case 0x07: calc_self_energy_fullelect(params);
break;
2101 mic_dev_die(
"!!! INVALID KERNEL SELECTION ON MIC DEVICE !!!\n");
2107 #if MIC_SORT_LISTS != 0
2109 virial_xx -= params.virial_xx;
2110 virial_xy -= params.virial_xy;
2111 virial_xz -= params.virial_xz;
2112 virial_yy -= params.virial_yy;
2113 virial_yz -= params.virial_yz;
2114 virial_zz -= params.virial_zz;
2115 fullElectVirial_xx -= params.fullElectVirial_xx;
2116 fullElectVirial_xy -= params.fullElectVirial_xy;
2117 fullElectVirial_xz -= params.fullElectVirial_xz;
2118 fullElectVirial_yy -= params.fullElectVirial_yy;
2119 fullElectVirial_yz -= params.fullElectVirial_yz;
2120 fullElectVirial_zz -= params.fullElectVirial_zz;
2121 vdwEnergy -= params.vdwEnergy;
2122 electEnergy -= params.electEnergy;
2123 fullElectEnergy -= params.fullElectEnergy;
2126 virial_xx += params.virial_xx;
2127 virial_xy += params.virial_xy;
2128 virial_xz += params.virial_xz;
2129 virial_yy += params.virial_yy;
2130 virial_yz += params.virial_yz;
2131 virial_zz += params.virial_zz;
2132 fullElectVirial_xx += params.fullElectVirial_xx;
2133 fullElectVirial_xy += params.fullElectVirial_xy;
2134 fullElectVirial_xz += params.fullElectVirial_xz;
2135 fullElectVirial_yy += params.fullElectVirial_yy;
2136 fullElectVirial_yz += params.fullElectVirial_yz;
2137 fullElectVirial_zz += params.fullElectVirial_zz;
2138 vdwEnergy += params.vdwEnergy;
2139 electEnergy += params.electEnergy;
2140 fullElectEnergy += params.fullElectEnergy;
2141 #if MIC_SORT_LISTS != 0
2145 #if MIC_EXCL_CHECKSUM_FULL != 0
2146 exclusionSum += params.exclusionSum;
2152 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
2153 device__device_times_computes[ppI * 2 + 1] = getCurrentTime();
2159 #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
2160 device__device_times_start[((isRemote)?(4):(5))] = getCurrentTime();
2171 kernel_data->fullElectVirial_xx = fullElectVirial_xx;
2172 kernel_data->fullElectVirial_xy = fullElectVirial_xy;
2173 kernel_data->fullElectVirial_xz = fullElectVirial_xz;
2174 kernel_data->fullElectVirial_yy = fullElectVirial_yy;
2175 kernel_data->fullElectVirial_yz = fullElectVirial_yz;
2176 kernel_data->fullElectVirial_zz = fullElectVirial_zz;
2177 kernel_data->vdwEnergy = vdwEnergy;
2179 kernel_data->fullElectEnergy = fullElectEnergy;
2180 #if MIC_EXCL_CHECKSUM_FULL != 0
2181 kernel_data->exclusionSum = exclusionSum;
2185 int flI_start = (isRemote) ? (kernel_data->numLocalPatches) : (0);
2186 int flI_end = (device__constants->singleKernelFlag != 0 || isRemote) ? (device__force_lists_size) : (kernel_data->numLocalPatches);
2187 __FULL_CHECK(__ASSERT(flI_start >= 0 && flI_start <= device__force_lists_size));
2188 __FULL_CHECK(__ASSERT(flI_end >= 0 && flI_start <= device__force_lists_size));
2189 __FULL_CHECK(__ASSERT(flI_start <= flI_end));
2193 const int numThreads = device__numOMPThreads;
2194 const int numForceLoopIters = flI_end - flI_start;
2195 int numForceLoopSplits = 1;
2196 if ((numForceLoopIters > 0) && ((2 * numThreads) > numForceLoopIters)) {
2197 numForceLoopSplits = (2 * numThreads) / numForceLoopIters;
2198 if (numForceLoopSplits < 2) { numForceLoopSplits = 2; }
2199 if (numForceLoopSplits > 16) { numForceLoopSplits = 16; }
2201 DEVICE_FPRINTF(
" F(%d,%d:%d)", flI_start, flI_end, numForceLoopSplits);
2202 flI_start *= numForceLoopSplits;
2203 flI_end *= numForceLoopSplits;
2206 #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
2207 device__device_times_start[((isRemote)?(6):(7))] = getCurrentTime();
2212 #if MULTIPLE_THREADS != 0
2213 #pragma omp parallel for schedule(dynamic)
2215 for (
int _flI = flI_start; _flI < flI_end; _flI++) {
2218 int flI = _flI / numForceLoopSplits;
2219 __FULL_CHECK(__ASSERT(flI >= 0 && flI < device__force_lists_size));
2220 int flIPart = _flI % numForceLoopSplits;
2226 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
2228 if (flIPart == flI % numForceLoopSplits) {
2229 device__device_times_patches[flI * 2] = getCurrentTime();
2234 const force_list &fl = device__force_lists[flI];
2235 const int f_len = fl.patch_stride * 4;
2236 __ASSUME(f_len % 16 == 0);
2239 int flIPartJLo = (int)(((
float)(f_len)) * (((float)(flIPart )) / ((
float)(numForceLoopSplits))));
2240 int flIPartJHi = (int)(((
float)(f_len)) * (((float)(flIPart + 1)) / ((
float)(numForceLoopSplits))));
2242 flIPartJLo = (flIPartJLo + 7) & (~7);
2243 flIPartJHi = (flIPartJHi + 7) & (~7);
2244 if (flIPartJHi > f_len) { flIPartJHi = f_len; }
2245 __ASSUME(flIPartJLo % 8 == 0);
2246 __ASSUME(flIPartJHi % 8 == 0);
2251 __FULL_CHECK(__ASSERT(device__forces != NULL));
2252 __FULL_CHECK(__ASSERT(fl.force_output_start >= 0));
2253 __FULL_CHECK(__ASSERT((fl.force_output_start < device__atoms_size) || (fl.force_output_start == device__atoms_size && f_len == 0)));
2254 double * RESTRICT fDst = (
double*)(device__forces + fl.force_output_start);
2255 __FULL_CHECK(__ASSERT(fDst != NULL));
2256 __ASSUME_ALIGNED(fDst);
2260 __FULL_CHECK(__ASSERT(device__force_buffers != NULL));
2261 __FULL_CHECK(__ASSERT(fl.force_list_start >= 0));
2262 __FULL_CHECK(__ASSERT((fl.force_list_start < device__force_buffers_req_size) || (fl.force_list_start == device__force_buffers_req_size && f_len == 0)));
2263 double *fSrcBase = (
double*)(device__force_buffers + fl.force_list_start);
2264 __FULL_CHECK(__ASSERT(fSrcBase != NULL));
2265 __ASSUME_ALIGNED(fSrcBase);
2268 memset(fDst + flIPartJLo, 0,
sizeof(
double) * (flIPartJHi - flIPartJLo));
2269 for (
int i = 0; i < fl.force_list_size; i++) {
2271 for (
int j = flIPartJLo; j < flIPartJHi; j++) {
2272 fDst[j] += fSrcBase[i * f_len + j];
2278 if (kernel_data->doSlow) {
2281 __FULL_CHECK(__ASSERT(device__slow_forces != NULL));
2282 double * RESTRICT fDst = (
double*)(device__slow_forces + fl.force_output_start);
2283 __FULL_CHECK(__ASSERT(fDst != NULL));
2284 __ASSUME_ALIGNED(fDst);
2288 __FULL_CHECK(__ASSERT(device__slow_force_buffers != NULL));
2289 double *fSrcBase = (
double*)(device__slow_force_buffers + fl.force_list_start);
2290 __FULL_CHECK(__ASSERT(fSrcBase != NULL));
2291 __ASSUME_ALIGNED(fSrcBase);
2294 memset(fDst + flIPartJLo, 0,
sizeof(
double) * (flIPartJHi - flIPartJLo));
2295 for (
int i = 0; i < fl.force_list_size; i++) {
2297 for (
int j = flIPartJLo; j < flIPartJHi; j++) {
2298 fDst[j] += fSrcBase[i * f_len + j];
2304 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
2305 device__device_times_patches[flI * 2 + 1] = getCurrentTime();
2311 #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
2312 device__device_times_start[((isRemote)?(8):(9))] = getCurrentTime();
2316 #if MIC_DATA_STRUCT_VERIFY != 0
2317 _verify_data_structures(device__pe, device__timestep, isRemote, 2, ppI_start, ppI_end, kernel_data->doSlow);
2318 _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (
char*)device__patch_pairs, (
char*)device__patch_pairs_copy,
sizeof(patch_pairs) * device__patch_pairs_size,
"device__patch_pairs_copy");
2319 _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (
char*)device__force_lists, (
char*)device__force_lists_copy,
sizeof(force_lists) * device__force_lists_size,
"device__force_lists_copy");
2320 _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (
char*)device__atoms, (
char*)device__atoms_copy,
sizeof(atom) * device__atoms_size,
"device__atoms_copy");
2321 _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (
char*)device__atom_params, (
char*)device__atom_params_copy,
sizeof(atom_param) * device__atoms_size,
"device__atom_params_copy");
2325 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
2326 if (device__timestep % 100 == 0 && isRemote == 0) {
2328 uint64_t plTotalMem =
sizeof(
void*) * device__pairlists_alloc_size;
2329 uint64_t plUsedMem = plTotalMem;
2330 uint64_t plPeakUsedMem = plTotalMem;
2331 if (device__pairlists != NULL) {
2332 for (
int k = 0; k < device__pairlists_alloc_size; k++) {
2333 int * plPtr = ((
int**)device__pairlists)[k];
2334 if (plPtr != NULL) {
2335 plTotalMem += plPtr[0] *
sizeof(int) + 64;
2336 plUsedMem += plPtr[1] *
sizeof(int);
2337 plPeakUsedMem += plPtr[-1] *
sizeof(int);
2342 uint64_t fbsTotalMem = device__force_buffers_alloc_size * 4 *
sizeof(double);
2346 printf(
"[MIC-MEMINFO] :: PE %d :: MIC mem usage (MB) -- ts: %d - "
2347 "f: %.3lf, t: %.3lf, c: %.3lf, a: %.3lf, i: %.3lf - "
2348 "vS: %.3lf, vP: %.3lf - "
2349 "plTM: %.3lf, plUM: %.3lf, plUPM: %.3lf, fbsTM(x2): %.3lf\n",
2350 ((
int)(device__pe)),
2351 ((
int)(device__timestep)),
2352 ((
double)(mi.memFree)) * 1.0e-6,
2353 ((
double)(mi.memTotal)) * 1.0e-6,
2354 ((
double)(mi.cached)) * 1.0e-6,
2355 ((
double)(mi.active)) * 1.0e-6,
2356 ((
double)(mi.inactive)) * 1.0e-6,
2357 ((
double)(mi.vmSize)) * 1.0e-6,
2358 ((
double)(mi.vmPeak)) * 1.0e-6,
2359 ((
double)(plTotalMem)) * 1.0e-6,
2360 ((
double)(plUsedMem)) * 1.0e-6,
2361 ((
double)(plPeakUsedMem)) * 1.0e-6,
2362 ((
double)(fbsTotalMem)) * 1.0e-6
2368 if (kernel_data->isRemote == 0) { device__timestep++; }
2370 DEVICE_FPRINTF(
" |\n");
2374 #if MIC_TRACING != 0
2375 double pragma_end = CmiWallTimer();
2376 MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_PRAGMA, pragma_start, pragma_end);
2379 #if MIC_SYNC_INPUT != 0
2380 #undef TMP_SYNC_ATOMS_CLAUSE
2382 #undef TMP_SYNC_KERNEL_DATA_CLAUSE
2383 #undef MIC_DEVICE_SYNC_INPUT_CLAUSE
2385 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
2386 #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
2388 #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE
2390 #undef MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE
2391 #undef MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE
2394 void mic_transfer_output(
const int deviceNum,
2396 const int numLocalAtoms,
2400 const int numAtoms = host__atoms_size;
2402 mic_kernel_data * kernel_data = host__kernel_data;
2404 int toCopyStart_forces = (singleKernelFlag != 0) ? (0) : ((isRemote) ? (numLocalAtoms) : (0));
2405 int toCopySize_forces = (singleKernelFlag != 0) ? (numAtoms) : ((isRemote) ? (numAtoms - numLocalAtoms) : (numLocalAtoms));
2406 int toCopyStart_slow_forces = (doSlow) ? (toCopyStart_forces) : (0);
2407 int toCopySize_slow_forces = (doSlow) ? (toCopySize_forces) : (0);
2409 double4 * forces = host__forces;
2410 double4 * slow_forces = host__slow_forces;
2412 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
2413 const int numPatchPairs = host__patch_pairs_size;
2414 const int numForceLists = host__force_lists_size;
2415 double * device_times_start = host__device_times_start;
2416 int toCopySize_device_times_start = ((isRemote) ? (0) : (10));
2417 #if MIC_DEVICE_TRACING_DETAILED != 0
2418 double * device_times_computes = host__device_times_computes;
2419 double * device_times_patches = host__device_times_patches;
2420 int toCopySize_device_times_computes = ((isRemote) ? (0) : (2 * numPatchPairs));
2421 int toCopySize_device_times_patches = ((isRemote) ? (0) : (2 * numForceLists));
2422 #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
2423 out(device_times_computes[0:toCopySize_device_times_computes] : alloc_if(0) free_if(0)) \
2424 out(device_times_patches[0:toCopySize_device_times_patches] : alloc_if(0) free_if(0))
2426 #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
2428 #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
2429 MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
2430 out(device_times_start[0:toCopySize_device_times_start] : alloc_if(0) free_if(0))
2432 #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE
2435 #pragma offload_transfer target(mic:deviceNum) \
2436 out(forces[toCopyStart_forces:toCopySize_forces] : alloc_if(0) free_if(0)) \
2437 out(slow_forces[toCopyStart_slow_forces:toCopySize_slow_forces] : alloc_if(0) free_if(0)) \
2438 out(kernel_data[isRemote:1] : alloc_if(0) free_if(0)) \
2439 MIC_DEVICE_TIMING_PRAGMA_CLAUSE
2443 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
2444 #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
2446 #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE
2449 int mic_check_remote_kernel_complete(
const int deviceNum) {
2454 tag_remote_kernel = 0;
2460 int mic_check_local_kernel_complete(
const int deviceNum) {
2465 tag_local_kernel = 0;
2472 void mic_free_device(
const int deviceNum) {
2474 mic_kernel_data * kernel_data = host__kernel_data;
2477 #pragma offload target(mic:deviceNum) \
2478 nocopy(kernel_data : alloc_if(0) free_if(1)) \
2479 nocopy(device__table_four) nocopy(device__table_four_float) \
2480 nocopy(device__lj_table) nocopy(device__lj_table_float) \
2481 nocopy(device__exclusion_bits) nocopy(device__exclusion_bits_copy) \
2482 nocopy(device__constants) \
2483 nocopy(device__force_buffers) nocopy(device__slow_force_buffers) \
2484 nocopy(device__pairlists) nocopy(device__pairlists_alloc_size) \
2485 nocopy(device__pl_array) nocopy(device__pl_size) nocopy(device__r2_array) \
2486 nocopy(device__patch_pairs : alloc_if(0) free_if(1)) \
2487 nocopy(device__force_lists : alloc_if(0) free_if(1)) \
2488 nocopy(device__numOMPThreads) \
2489 DEVICE_FPRINTF_CLAUSE
2492 if (device__pairlists != NULL) {
2493 int **pairlists = (
int**)(device__pairlists);
2494 for (
int i = 0; i < device__pairlists_alloc_size; i++) {
2495 if (pairlists[i] != NULL) { _MM_FREE_WRAPPER(pairlists[i]); }
2497 _MM_FREE_WRAPPER(pairlists);
2498 device__pairlists = 0;
2499 device__pairlists_alloc_size = 0;
2503 #if REFINE_PAIRLISTS != 0
2504 for (
int i = 0; i < device__numOMPThreads; i++) {
2505 if (device__pl_array[i] != NULL) { _MM_FREE_WRAPPER(device__pl_array[i]); }
2506 if (device__r2_array[i] != NULL) { _MM_FREE_WRAPPER(device__r2_array[i]); }
2508 _MM_FREE_WRAPPER(device__pl_array); device__pl_array = NULL;
2509 _MM_FREE_WRAPPER(device__r2_array); device__r2_array = NULL;
2510 _MM_FREE_WRAPPER(device__pl_size); device__pl_size = NULL;
2514 #if MIC_HANDCODE_FORCE_SINGLE != 0
2515 if (device__table_four != NULL) { _MM_FREE_WRAPPER(device__table_four); device__table_four = NULL; }
2516 if (device__table_four_float != NULL) { _MM_FREE_WRAPPER(device__table_four_float); device__table_four_float = NULL; }
2517 if (device__lj_table != NULL) { _MM_FREE_WRAPPER(device__lj_table); device__lj_table = NULL; }
2518 if (device__lj_table_float != NULL) { _MM_FREE_WRAPPER(device__lj_table_float); device__lj_table_float = NULL; }
2520 if (device__exclusion_bits != NULL) { _MM_FREE_WRAPPER(device__exclusion_bits); device__exclusion_bits = NULL; }
2521 if (device__exclusion_bits_copy != NULL) { _MM_FREE_WRAPPER(device__exclusion_bits_copy); device__exclusion_bits_copy = NULL; }
2522 if (device__constants != NULL) { _MM_FREE_WRAPPER(device__constants); device__constants = NULL; }
2525 if (device__force_buffers != NULL) { _MM_FREE_WRAPPER(device__force_buffers); device__force_buffers = NULL; }
2526 if (device__slow_force_buffers != NULL) { _MM_FREE_WRAPPER(device__slow_force_buffers); device__slow_force_buffers = NULL; }
2528 #if MIC_DEVICE_FPRINTF != 0
2529 fclose(device__fout);
2533 if (host__kernel_data != NULL) { _MM_FREE_WRAPPER(host__kernel_data); host__kernel_data = NULL; }
2538 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
2540 __declspec(target(mic))
2541 void printMemInfo(
int device__pe,
int device__timestep, MemInfo * mi) {
2542 printf(
"[MIC-MEMINFO] :: PE %d :: MIC mem usage (MB) -- ts: %d - "
2543 "f: %.3lf, t: %.3lf, c: %.3lf, a: %.3lf, i: %.3lf - "
2544 "vS: %.3lf, vP: %.3lf\n",
2545 ((
int)(device__pe)),
2546 ((
int)(device__timestep)),
2547 ((
double)(mi->memFree)) * 1.0e-6,
2548 ((
double)(mi->memTotal)) * 1.0e-6,
2549 ((
double)(mi->cached)) * 1.0e-6,
2550 ((
double)(mi->active)) * 1.0e-6,
2551 ((
double)(mi->inactive)) * 1.0e-6,
2552 ((
double)(mi->vmSize)) * 1.0e-6,
2553 ((
double)(mi->vmPeak)) * 1.0e-6
2557 __declspec(target(mic))
2558 void readMemInfo_processLine(MemInfo * memInfo,
char * n,
char * v,
char * u) {
2559 assert(n != NULL && v != NULL);
2561 size_t * loc = NULL;
2562 if (strcmp(n,
"MemTotal:") == 0) { loc = &(memInfo->memTotal); }
2563 if (strcmp(n,
"MemFree:") == 0) { loc = &(memInfo->memFree); }
2564 if (strcmp(n,
"Cached:") == 0) { loc = &(memInfo->cached); }
2565 if (strcmp(n,
"Active:") == 0) { loc = &(memInfo->active); }
2566 if (strcmp(n,
"Inactive:") == 0) { loc = &(memInfo->inactive); }
2567 if (strcmp(n,
"VmSize:") == 0) { loc = &(memInfo->vmSize); }
2568 if (strcmp(n,
"VmPeak:") == 0) { loc = &(memInfo->vmPeak); }
2569 if (loc == NULL) {
return; }
2571 *loc = (size_t)(atoi(v));
2573 if (strcmp(u,
"kB") == 0) { (*loc) *= 1024; }
2577 __declspec(target(mic))
2578 bool readMemInfo(MemInfo * memInfo) {
2579 if (memInfo == NULL) {
return false; }
2580 memset(memInfo, 0,
sizeof(MemInfo));
2582 FILE * procMem = fopen(
"/proc/meminfo",
"r");
2583 if (procMem == NULL) {
2584 printf(
"[WARNING] :: Unable to read /proc/meminfo\n");
2587 char line[256], n[128], v[128], u[128];
2588 while (!feof(procMem)) {
2589 int numFilled = fscanf(procMem,
"%[^\n]\n", line);
2590 if (numFilled == 0) { printf(
"empty line\n");
break; }
2591 numFilled = sscanf(line,
"%s%s%s", n, v, u);
2592 if (numFilled == 3) {
2593 readMemInfo_processLine(memInfo, n, v, u);
2594 }
else if (numFilled == 2) {
2595 int vLen = strlen(v);
2596 if (v[vLen-1] ==
':') {
2597 memcpy(n, v, vLen + 1);
2599 readMemInfo_processLine(memInfo, n, v, NULL);
2604 pid_t pid = getpid();
2606 sprintf(filename,
"/proc/%d/status", (
int)pid);
2607 procMem = fopen(filename,
"r");
2608 if (procMem == NULL) {
2609 printf(
"[WARNING] :: Unable to read %s\n", filename);
2612 while (!feof(procMem)) {
2613 int numFilled = fscanf(procMem,
"%[^\n]\n", line);
2614 if (numFilled == 0) { printf(
"empty line\n");
break; }
2615 numFilled = sscanf(line,
"%s%s%s", n, v, u);
2616 if (numFilled == 3) {
2617 readMemInfo_processLine(memInfo, n, v, u);
2618 }
else if (numFilled == 2) {
2619 readMemInfo_processLine(memInfo, n, v, NULL);
2627 #endif // MIC_TRACK_DEVICE_MEM_USAGE != 0
2630 __declspec(target(mic))
2631 void* _mm_malloc_withPrint(
size_t s,
int a,
char * l) {
2632 void * ptr = _mm_malloc(s, a);
2633 printf(
"[MIC-MALLOC] :: _mm_malloc - l: \"%s\", s: %lu (%lu), ptr: %p\n", l, s,
sizeof(s), ptr);
2637 __declspec(target(mic))
2638 void _mm_free_withPrint(
void * ptr) {
2639 printf(
"[MIC-MALLOC] :: _mm_free - ptr: %p\n", ptr);
register BigReal virial_xy
register BigReal virial_xz
register BigReal virial_yz
static __global__ void(const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const int *vdw_types, unsigned int *plist, float4 *tmpforces, float4 *slow_tmpforces, float4 *forces, float4 *slow_forces, float *tmpvirials, float *slow_tmpvirials, float *virials, float *slow_virials, unsigned int *global_counters, int *force_ready_queue, const unsigned int *overflow_exclusions, const int npatches, const int block_begin, const int total_block_count, int *block_order, exclmask *exclmasks, const int lj_table_size, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const float plcutoff2, const int doSlow)
static __thread int patch_pairs_size
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
static __thread int lj_table_size
register BigReal electEnergy
static __thread atom * atoms
static __thread float4 * forces
static __thread float4 * slow_forces
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
register BigReal virial_yy
static __thread patch_pair * patch_pairs
texture< float2, 1, cudaReadModeElementType > lj_table
register BigReal virial_zz
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
register BigReal virial_xx
static __thread atom_param * atom_params
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
static __thread int atoms_size