// NAMD
// ComputeNonbondedMICKernel.C
// (Source listing; refer to the generated documentation for this file.)
1 // NOTE: See ComputeNonbondedMICKernel.h for a general description of NAMD on MIC.
2 
3 // Only compile the contents of this file if this build supports MIC.
4 #ifdef NAMD_MIC
5 
6 
7 #include <stdlib.h>
8 #include <stdio.h>
9 #include <offload.h>
11 #include <assert.h>
12 #include <math.h>
13 
14 // DMK - DEBUG
15 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
16  #include <sys/types.h>
17  #include <unistd.h>
18 #endif
19 
20 // Setup __ASSUME_ALIGNED macro to take the appropriate action based on macro flags
21 #if (CHECK_ASSUME_ALIGNED != 0)
22  #define __ASSUME_ALIGNED(v) assert(((unsigned long long int)(v)) % 64 == 0); __assume_aligned((v), MIC_ALIGN);
23 #else
24  #define __ASSUME_ALIGNED(v) __assume_aligned((v), MIC_ALIGN);
25 #endif
26 
27 // Setup __ASSUME macro to take the appropriate action based on macro flags
28 #if (CHECK_ASSUME != 0)
29  #define __ASSUME(s) assert(s); __assume(s);
30 #else
31  #define __ASSUME(s) __assume(s);
32 #endif
33 
34 // Setup RESTRICT macro
35 #define RESTRICT __restrict
36 
37 #ifdef WIN32
38  #define __thread __declspec(thread)
39 #endif
40 
41 
43 // Global data used during simulation (tables, constants, etc. that are setup
44 // during startup but essentially read-only and constant throughout the
45 // steady-state simulation.
46 
// Interpolation table ("table_four") used during force computation, in both
// double and single precision, plus its length (padded to a multiple of 16).
__attribute__((target(mic))) double * device__table_four = NULL;
__attribute__((target(mic))) float * device__table_four_float = NULL;
__attribute__((target(mic))) int device__table_four_n_16 = 0;

// Lennard-Jones parameter table (double and single precision) with its
// dimension and total size.
__attribute__((target(mic))) double * device__lj_table = NULL;
__attribute__((target(mic))) float * device__lj_table_float = NULL;
__attribute__((target(mic))) int device__lj_table_dim = 0;
__attribute__((target(mic))) int device__lj_table_size = 0;

// Packed exclusion bit patterns (2 bits per pair) and the number of unsigned
// ints in the buffer.
__attribute__((target(mic))) unsigned int * device__exclusion_bits = NULL;
__attribute__((target(mic))) long int device__exclusion_bits_size = 0;

// Simulation constants pushed to the device once at startup.
__attribute__((target(mic))) mic_constants * device__constants = NULL;

// "Golden" copies of the tables above, used only by the data-structure
// verification code (MIC_DATA_STRUCT_VERIFY) to detect corruption.
__attribute__((target(mic))) double * device__table_four_copy = NULL;
__attribute__((target(mic))) float * device__table_four_float_copy = NULL;
__attribute__((target(mic))) double * device__lj_table_copy = NULL;
__attribute__((target(mic))) float * device__lj_table_float_copy = NULL;
__attribute__((target(mic))) unsigned int * device__exclusion_bits_copy = NULL;
__attribute__((target(mic))) mic_constants * device__constants_copy = NULL;

// Verification copies of the per-timestep input structures and their sizes.
__attribute__((target(mic))) patch_pair* device__patch_pairs_copy = NULL;
__attribute__((target(mic))) force_list* device__force_lists_copy = NULL;
__attribute__((target(mic))) atom* device__atoms_copy = NULL;
__attribute__((target(mic))) atom_param* device__atom_params_copy = NULL;
__attribute__((target(mic))) int device__patch_pairs_copy_size = 0;
__attribute__((target(mic))) int device__force_lists_copy_size = 0;
__attribute__((target(mic))) int device__atoms_copy_size = 0;
__attribute__((target(mic))) int device__atom_params_copy_size = 0;
76 
77 
79 // Device variables which exist both on the host and the MIC device and/or are
80 // used to manage moving data betwen the host and the MIC device.
81 
// DMK - DEBUG - PE and node info for printing debug output
// NOTE: host__* variables are __thread (one per host thread driving a device);
// device__* variables live on the MIC device side of the offload pairing.
__thread int host__pe = -1;
__thread int host__node = -1;
__attribute__((target(mic))) int device__pe = -1;
__attribute__((target(mic))) int device__node = -1;

// Nonzero when remote and local work are combined into a single kernel launch.
__thread int singleKernelFlag = 0;

// Patch-pair descriptors (one per compute/pairwise patch interaction) with
// current used size and allocated capacity on the host.
__thread patch_pair * host__patch_pairs = NULL;
__thread int host__patch_pairs_size = 0;
__thread int host__patch_pairs_bufSize = 0;
__attribute__((target(mic))) const patch_pair * device__patch_pairs = NULL;
__attribute__((target(mic))) int device__patch_pairs_size = 0;

// Force-list descriptors (one per patch, describing where that patch's force
// output sub-buffers live) with used size and capacity.
__thread force_list * host__force_lists = NULL;
__thread int host__force_lists_size = 0;
__thread int host__force_lists_bufSize = 0;
__attribute__((target(mic))) force_list * device__force_lists = NULL;
__attribute__((target(mic))) int device__force_lists_size = 0;

__attribute__((target(mic))) uintptr_t device__pairlists = 0; // NOTE: Don't use a global in case the device is shared between multiple threads, so each thread's patch pair lists has its own pairlist pointer
__attribute__((target(mic))) int device__pairlists_alloc_size = 0;

// Per-OpenMP-thread scratch buffers used when refining pairlists each
// timestep (REFINE_PAIRLISTS != 0); allocated in mic_init_device.
__attribute__((target(mic))) int** device__pl_array = NULL;
__attribute__((target(mic))) int* device__pl_size = NULL;
__attribute__((target(mic))) double** device__r2_array = NULL;

// Atom input data (positions/charges and static parameters) and force output
// buffers (regular and slow/full-electrostatics), shared host<->device.
__thread atom * host__atoms = NULL;
__thread atom_param * host__atom_params = NULL;
__thread double4 * host__forces = NULL;
__thread double4 * host__slow_forces = NULL;
__thread int host__atoms_size = 0;
__thread int host__atoms_bufSize = 0;
__attribute__((target(mic))) atom * device__atoms = NULL;
__attribute__((target(mic))) atom_param * device__atom_params = NULL;
__attribute__((target(mic))) double4 * device__forces = NULL;
__attribute__((target(mic))) double4 * device__slow_forces = NULL;
__attribute__((target(mic))) int device__atoms_size = 0;

// Device-resident intermediate force accumulation buffers (per-compute
// sub-lists that later get reduced into device__forces/device__slow_forces).
__thread size_t host__force_buffers_req_size = 0;
__attribute__((target(mic))) size_t device__force_buffers_req_size = 0;
__attribute__((target(mic))) size_t device__force_buffers_alloc_size = 0;
__attribute__((target(mic))) double4 * device__force_buffers = NULL;
__attribute__((target(mic))) double4 * device__slow_force_buffers = NULL;

// Periodic-cell lattice vectors (a, b, c) for position wrapping.
__attribute__((target(mic))) mic_position3_t device__lata;
__attribute__((target(mic))) mic_position3_t device__latb;
__attribute__((target(mic))) mic_position3_t device__latc;

// Per-kernel-invocation bookkeeping data (two entries: remote and local).
__thread mic_kernel_data * host__kernel_data = NULL;

// Number of OpenMP threads available on the device (set during init).
__attribute__((target(mic))) int device__numOMPThreads = -1;

// Offload signal tags used to track outstanding asynchronous offloads.
__thread int tag_atom_params;
__thread int tag_remote_kernel;
__thread int tag_local_kernel;

// Last-transferred sizes for buffers that are only re-sent when they grow.
__thread int patch_pairs_copySize;
__thread int force_lists_copySize;
__thread int atom_params_copySize;

#if MIC_DEVICE_FPRINTF != 0
__attribute__((target(mic))) FILE * device__fout = NULL;
#endif

// DMK - DEBUG - Based on the number of kernels invoked, track the timestep number.
//__thread int host__timestep = 0;
__attribute__((target(mic))) int device__timestep = 0;
150 
151 
152 // DMK - TRACING / TIMING
153 #include <sys/time.h>
154 __declspec(target(mic))
155 double getCurrentTime() {
156  timeval now;
157  gettimeofday(&now, NULL);
158  return (double)(now.tv_sec + (now.tv_usec * 1.0e-6));
159 }
160 
// DMK - TRACING
// Buffers used to ship per-compute / per-patch device timing samples back to
// the host when device-side tracing is enabled at build time.
#if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
 #if MIC_DEVICE_TRACING_DETAILED != 0
 __thread double * host__device_times_computes = NULL;
 __thread double * host__device_times_patches = NULL;
 __attribute__((target(mic))) double * device__device_times_computes = NULL;
 __attribute__((target(mic))) double * device__device_times_patches = NULL;
 #endif
 __thread double * host__device_times_start = NULL;
 __attribute__((target(mic))) double * device__device_times_start = NULL;
#endif
172 
173 
174 // DMK - TODO : A device version of the die function, which does not have access
175 // to the host info. Only to be used within the kernel itself. Perhaps add
176 // some startup code to send host/PE# and device# to device and then target
177 // mic_die to the device as well, removing the need for this separate function.
178 __attribute__((target(mic)))
179 void mic_dev_die(const char * const str) {
180  #ifdef __MIC__
181  const char * const loc = "on device";
182  #else
183  const char * const loc = "on host";
184  #endif
185  if (str != NULL) {
186  printf("[MIC_DIE] :: \"%s\" (%s)\n", str, loc);
187  } else {
188  printf("[MIC_DIE] :: mic_dev_die called (%s)\n", loc);
189  }
190  fflush(NULL);
191  abort();
192 }
193 
194 
// Prints the compile-time configuration flags the device code was built with
// (and the runtime OpenMP thread count). Compiled to a no-op unless
// MIC_PRINT_CONFIG is enabled; intended to run once, on the first device of
// node 0 (see the caller in mic_init_device).
__attribute__((target(mic)))
void mic_print_config() {

 #if MIC_PRINT_CONFIG != 0

 // DMK - TODO | FIXME : Create a mechanism, so that it is only printed once if
 // there are multiple MIC devices being used.

 printf("device :: MULTIPLE_THREADS (%d)\n", MULTIPLE_THREADS);
 printf("device :: # OpenMP Threads : %d\n", device__numOMPThreads);

 printf("device :: MIC_HANDCODE_FORCE (%d)\n", MIC_HANDCODE_FORCE);
 printf("device :: MIC_HANDCODE_FORCE_PFDIST (%d)\n", MIC_HANDCODE_FORCE_PFDIST);
 printf("device :: MIC_HANDCODE_FORCE_USEGATHER_NBTBL (%d)\n", MIC_HANDCODE_FORCE_USEGATHER_NBTBL);
 printf("device :: MIC_HANDCODE_FORCE_USEGATHER (%d)\n", MIC_HANDCODE_FORCE_USEGATHER);
 printf("device :: MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT (%d)\n", MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT);
 printf("device :: MIC_HANDCODE_FORCE_COMBINE_FORCES (%d)\n", MIC_HANDCODE_FORCE_COMBINE_FORCES);

 printf("device :: MIC_HANDCODE_FORCE_SINGLE (%d)\n", MIC_HANDCODE_FORCE_SINGLE);
 printf("device :: MIC_HANDCODE_FORCE_CALCR2TABLE (%d)\n", MIC_HANDCODE_FORCE_CALCR2TABLE);
 printf("device :: MIC_HANDCODE_FORCE_SOA_VS_AOS (%d)\n", MIC_HANDCODE_FORCE_SOA_VS_AOS);

 printf("device :: MIC_SORT_ATOMS (%d)\n", MIC_SORT_ATOMS);
 printf("device :: MIC_SORT_LISTS (%d)\n", MIC_SORT_LISTS);

 printf("device :: MIC_HANDCODE_PLGEN (%d)\n", MIC_HANDCODE_PLGEN);
 printf("device :: MIC_TILE_PLGEN (%d)\n", MIC_TILE_PLGEN);
 printf("device :: MIC_CONDITION_NORMAL (%d)\n", MIC_CONDITION_NORMAL);
 printf("device :: MIC_PAD_PLGEN (%d)\n", MIC_PAD_PLGEN);

 printf("device :: MIC_SORT_COMPUTES (%d)\n", MIC_SORT_COMPUTES);

 printf("device :: MIC_SYNC_INPUT (%d)\n", MIC_SYNC_INPUT);

 printf("device :: REFINE_PAIRLISTS (%d)\n", REFINE_PAIRLISTS);
 printf("device :: REFINE_PAIRLIST_HANDCODE (%d)\n", REFINE_PAIRLIST_HANDCODE);
 printf("device :: REFINE_PAIRLISTS_XYZ (%d)\n", REFINE_PAIRLISTS_XYZ);

 printf("device :: MIC_FULL_CHECK (%d)\n", MIC_FULL_CHECK);
 printf("device :: MIC_EXCL_CHECKSUM_FULL (%d)\n", MIC_EXCL_CHECKSUM_FULL);

 printf("device :: MIC_ALIGN (%d)\n", MIC_ALIGN);

 fflush(NULL);

 #endif // MIC_PRINT_CONFIG
}
242 
243 
244 // Function to initialize a given MIC device by initializing various device
245 // variables, arrays, etc.
246 // Input:
247 // - pe : the PE number for the host core associated with the device
248 // - deviceNum : the device number to be initialized
249 void mic_init_device(const int pe, const int node, const int deviceNum) {
250 
251  // Record the PE and node info associated with the given device
252  host__pe = pe;
253  host__node = node;
254 
255  // Initialize kernel data structures
256  host__kernel_data = (mic_kernel_data*)(_MM_MALLOC_WRAPPER(2 * sizeof(mic_kernel_data), 64, "mic_kernel_data"));
257  __ASSERT(host__kernel_data != NULL);
258  mic_kernel_data * kernel_data = host__kernel_data;
259 
260  // Initialize flags for offload buffers that are not copied every timestep
261  patch_pairs_copySize = 0;
262  force_lists_copySize = 0;
263  atom_params_copySize = 0;
264 
265  // Allocate data for the host__device_times_start buffer (constant size throughout simulation)
266  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
267  host__device_times_start = (double*)_MM_MALLOC_WRAPPER(10 * sizeof(double), 64, "host__device_times_start");
268  __ASSERT(host__device_times_start != NULL);
269  double * device_times_start = host__device_times_start;
270  #define DEVICE_TIMES_CLAUSE nocopy(device_times_start[0:10] : alloc_if(1) free_if(0) align(64)) \
271  nocopy(device__device_times_start)
272  #else
273  #define DEVICE_TIMES_CLAUSE
274  #endif
275 
276  // Initialize the device itself via an offload pragma section
277  #pragma offload target(mic:deviceNum) \
278  in(pe) in(node) nocopy(device__pe) nocopy(device__node) \
279  in(kernel_data[0:2] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
280  nocopy(device__pl_array) nocopy(device__pl_size) nocopy(device__r2_array) \
281  nocopy(device__numOMPThreads) \
282  DEVICE_TIMES_CLAUSE \
283  DEVICE_FPRINTF_CLAUSE
284  {
285  __MUST_BE_MIC;
286  __FULL_CHECK(__ASSERT(node >= 0 && pe >= 0));
287 
288  device__pe = pe;
289  device__node = node;
290 
291  #if MIC_DEVICE_FPRINTF != 0
292  {
293  char filename[128] = { 0 };
294  sprintf(filename, "/tmp/namd_deviceDebugInfo.%d", device__pe);
295  if (device__node <= 0) {
296  printf("[MIC-DEBUG] :: Generating debug output to device file for MICs.\n");
297  fflush(NULL);
298  }
299  device__fout = fopen(filename, "w");
300  }
301  #endif
302 
303  DEVICE_FPRINTF("Device on PE %d (node: %d) initializing...\n", device__pe, device__node);
304 
305  // Get the number of threads available to the code
306  #if MULTIPLE_THREADS != 0
307  device__numOMPThreads = omp_get_max_threads();
308  #else
309  device__numOMPThreads = 1;
310  #endif
311 
312  // Initialize the r2 arrays (scratch buffers used by computes to refine pairlists
313  // each timestep, when pairlist refinement is enabled)
314  #if REFINE_PAIRLISTS != 0
315  device__pl_array = (int**)(_MM_MALLOC_WRAPPER(device__numOMPThreads * sizeof(int*), 64, "device__pl_array")); __ASSERT(device__pl_array != NULL);
316  device__pl_size = (int*)(_MM_MALLOC_WRAPPER(device__numOMPThreads * sizeof(int), 64, "device__pl_size")); __ASSERT(device__pl_size != NULL);
317  device__r2_array = (double**)(_MM_MALLOC_WRAPPER(device__numOMPThreads * sizeof(double*), 64, "device__r2_array"); __ASSERT(device__r2_array != NULL);
318  for (int i = 0; i < device__numOMPThreads; i++) {
319  device__pl_array[i] = NULL;
320  device__pl_size[i] = 0;
321  device__r2_array[i] = NULL;
322  }
323  #endif
324 
325  // Copy the pointer to the device_times_start buffer
326  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
327  device__device_times_start = device_times_start;
328  #endif
329 
330  // DMK : NOTE | TODO | FIXME - This will print the configuration of the MICs, but the condition
331  // assumes that there will be at least one MIC on the first node (node 0). Further, it assumes
332  // that all MICs will be the same (i.e. only prints on one device). If there are cases where
333  // these assumptions do not hold, we need to expand this code to cover those cases. For now,
334  // leaving it this way because it reduces the amount of output during the run (espeically when
335  // scaling to multiple nodes and/or multiple MICs per node) and is likely the case (for now).
336  if (node == 0 && deviceNum == 0) { mic_print_config(); }
337 
338  } // end pragma offload
339 
340  #undef DEVICE_TIMES_CLAUSE
341 }
342 
343 
344 // Function that returns 0 if executed on the host, 1 if executed on the MIC device. Used to
345 // indicate if a target is available or not during application startup.
346 // Input: N/A
347 // Output:
348 // - 0 if executed on host, 1 if executed on device
349 __attribute__((target(mic))) int mic_check_internal() {
350  int retval;
351  #ifdef __MIC__
352  retval = 1;
353  #else
354  retval = 0;
355  #endif
356  return retval;
357 }
358 
359 
360 // Function to check that the given device is available for offloading.
361 // Input:
362 // - dev : The device number (0+) to check
363 // Output:
364 // - 1 if device is available (will call mic_dev_die if not available to prevent further application progress)
// Offloads a small check to the given device and returns its result.
// Returns 1 if the offload executed on an actual device; if the offload
// falls back to host execution, mic_dev_die aborts the run instead.
int mic_check(int deviceNum) {
 int dev_ok = 0;
 #pragma offload target(mic:deviceNum) inout(dev_ok)
 {
 // Host-fallback execution of offload regions is not supported; die rather
 // than silently continuing on the host.
 #if !defined(__MIC__)
 mic_dev_die("mic_check :: Attempt to execute offload on host (not supported yet)... exiting."); fflush(NULL);
 #endif
 dev_ok = mic_check_internal();
 }
 return dev_ok;
}
376 
377 
378 // DMK - NOTE : The following is debug code used for verifying data structures. Should not be used in
379 // production runs.
380 #if MIC_DATA_STRUCT_VERIFY != 0
381 
382 
383 __declspec(target(mic))
384 void _verify_parallel_memcpy(void * const dst,
385  const void * const src,
386  const size_t size
387  ) {
388  const char* const src_c = (const char* const)src;
389  char* const dst_c = (char* const)dst;
390  #pragma omp parallel for schedule(static, 4096)
391  for (int i = 0; i < size; i++) { dst_c[i] = src_c[i]; }
392 }
393 
394 template<class T>
395 __declspec(target(mic))
396 void* _verify_remalloc(int &copySize, const int size, T* &ptr) {
397  if (copySize < size) {
398  copySize = (int)(1.2f * size); // Add some extra buffer room
399  copySize = (copySize + 1023) & (~1023); // Round up to multiple of 1024
400  _MM_FREE_WRAPPER(ptr);
401  ptr = (T*)_MM_MALLOC_WRAPPER(copySize * sizeof(T), 64, "_verify_remalloc");
402  __FULL_CHECK(assert(ptr != NULL));
403  }
404  return ptr;
405 }
406 
407 
408 __declspec(target(mic))
409 int _verify_pairlist(const int pe, const int timestep, const int isRemote, const int phase,
410  const int i_upper, const int j_upper,
411  const int * const pairlist_base,
412  const int ppI, const char * const plStr,
413  const patch_pair &pp, const int plTypeIndex
414  ) {
415 
416  const atom_param * const pExt_0 = device__atom_params + pp.patch1_atom_start;
417  const atom_param * const pExt_1 = device__atom_params + pp.patch2_atom_start;
418 
419  if (pairlist_base == NULL) {
420  if (timestep > 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: pairlist_base == NULL !!!\n", pe, timestep, isRemote, phase); return 1; }
421  } else if (i_upper <= 0) {
422  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: i_upper <= 0 !!!\n", pe, timestep, isRemote, phase); return 1;
423  } else if (j_upper <= 0) {
424  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: j_upper <= 0 !!!\n", pe, timestep, isRemote, phase); return 1;
425  } else {
426 
427  int currentI = 0;
428  int inPadding = 0;
429  int pairlist_size = pairlist_base[1] - 2;
430  int pairlist_allocSize = pairlist_base[0];
431  const int * const pairlist = pairlist_base + 2;
432 
433  if (pairlist_size + 2 > pairlist_allocSize) {
434  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: pairlist_size + 2 > pairlist_allocSize !!!\n", pe, timestep, isRemote, phase); return 1;
435  }
436 
437  for (int k = 0; k < pairlist_size; k++) {
438 
439  int somethingWrong = 0;
440  int i = (pairlist[k] >> 16) & 0xFFFF;
441  int j = pairlist[k] & 0xFFFF;
442 
443  // Verify the interaction is in the correct list
444  if (j != 0xFFFF) { // If this is a valid entry, not padding
445  const int maxDiff = pExt_1[j].excl_maxdiff;
446  const int indexDiff = pExt_0[i].index - pExt_1[j].index;
447  if (indexDiff < -1 * maxDiff || indexDiff > maxDiff) { // Not in patten, so must be NORM
448  if (plTypeIndex != PL_NORM_INDEX) {
449  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: interaction outside maxDiff range in non-NORM pairlist !!!"
450  " - ppI:%d, i:%d, j:%d, indexDiff:%d, maxDiff:%d, plTypeIndex:%d\n",
451  pe, timestep, isRemote, phase,
452  ppI, i, j, indexDiff, maxDiff, plTypeIndex
453  );
454  somethingWrong = 1;
455  }
456  } else { // In pattern, so verify against bits
457  const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
458  const int offset_major = offset / (sizeof(unsigned int) * 8);
459  const int offset_minor = offset % (sizeof(unsigned int) * 8);
460  const int exclFlags = ((device__exclusion_bits[offset_major]) >> offset_minor) & 0x03;
461  if (exclFlags == 0x3) {
462  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid exclusion bit pattern detected !!!"
463  " - ppI:%d, i:%d, j:%d, indexDiff:%d, maxDiff:%d, plTypeIndex:%d\n",
464  pe, timestep, isRemote, phase,
465  ppI, i, j, indexDiff, maxDiff, plTypeIndex
466  );
467  somethingWrong = 1;
468  } else if (exclFlags != plTypeIndex) {
469  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: plTypeIndex/exclution_bits mismatch !!!"
470  " - i:%d, j:%d, indexDiff:%d, maxDiff:%d, plTypeIndex:%d, exclFlags:%d\n",
471  pe, timestep, isRemote, phase,
472  i, j, indexDiff, maxDiff, plTypeIndex, exclFlags
473  );
474  somethingWrong = 1;
475  }
476  }
477  }
478 
479  // Check for a new "i" value
480  if (i != currentI) {
481  if (i < currentI) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: i < currentI !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
482  if (k % 16 != 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: i changed when k % 16 != 0 !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
483  currentI = i;
484  inPadding = 0; // New i means we are transitioning into a non-padding region
485  }
486 
487  // If we are in a padding region...
488  if (inPadding) {
489  if (i < 0 || i >= i_upper) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid i value detected in padded region!!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
490  if (j != 0xFFFF) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid j value detected in padded region !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
491 
492  // Otherwise, we are in a non-padding region...
493  } else {
494  if (i < 0 || i >= i_upper) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid i value detected !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
495  if (j == 0xFFFF) { // Detect transition into a padded region
496  inPadding = 1;
497  } else {
498  if (j < 0 || j >= j_upper) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid j value detected !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
499  }
500  }
501 
502  if (somethingWrong) {
503  char buf[2048];
504  int lo = k - 3; if (lo < 0) { lo = 0; }
505  int hi = k + 3; if (hi >= pairlist_size) { hi = pairlist_size; }
506  char *str = buf;
507  str += sprintf(str, "[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: pl.%s[%d:%d->%d,%d:%d] = { ... ", pe, timestep, isRemote, phase, plStr, ppI, lo, hi, k, pairlist_size);
508  for (int _k = lo; _k < hi; _k++) { str += sprintf(str, "%s0x%08x%s ", ((_k==k)?("<<"):("")), pairlist[_k], ((_k==k)?(">>"):(""))); }
509  str += sprintf(str, "}, i_upper:%d, j_upper:%d\n", i_upper, j_upper);
510  printf(buf);
511  }
512  }
513  }
514 
515  return 0;
516 }
517 
518 
519 __declspec(target(mic))
520 int _verify_forces(const int pe, const int timestep, const int isRemote, const int phase,
521  const int index, const double4 * const forceBuffers, const double4 * const forces,
522  const char * const typeStr
523  ) {
524 
525  force_list &fl = device__force_lists[index];
526 
527  // Verify the indexes / sizes
528  if (fl.force_list_start < 0 || fl.force_list_start >= device__force_buffers_req_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].force_list_start invalid (%d, size:%lu) !!!\n", pe, timestep, isRemote, phase, index, fl.force_list_start, device__force_buffers_req_size); return 1; }
529  if (fl.force_output_start < 0 || fl.force_output_start >= device__atoms_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].force_output_start invalid (%d, size:%d) !!!\n", pe, timestep, isRemote, phase, index, fl.force_output_start, device__atoms_size); return 1; }
530  if (fl.patch_stride < 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].patch_stride(%d) < 0 !!!\n", pe, timestep, isRemote, phase, index, fl.patch_stride); return 1; }
531  if (fl.force_list_size < 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].force_list_size(%d) < 0 !!!\n", pe, timestep, isRemote, phase, index, fl.force_list_size); return 1; }
532  if (fl.force_list_start + (4 * fl.patch_stride * fl.force_list_size) > device__force_buffers_req_size) {
533  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: fl[%d].force_list_start(%d) + (4 * fl.patch_stride(%d) * fl.force_list_size(%d)) > device__force_buffers_req_size(%lu) !!!\n",
534  pe, timestep, isRemote, phase, index, fl.force_list_start, fl.patch_stride, fl.force_list_size, device__force_buffers_req_size);
535  return 1;
536  }
537 
538  // Check sub-lists for large forces
539  double* f = (double*)(forceBuffers + fl.force_list_start);
540  for (int i = 0; i < 4 * fl.patch_stride * fl.force_list_size; i++) {
541  if (fabsf(f[i]) > 2.5e2) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: Large %sforce detected (1) !!! f[%d]:%le\n", pe, timestep, isRemote, phase, typeStr, i, f[i]); } // NOTE: Want to print them all, so don't return here
542  }
543 
544  // Check final list for large forces
545  f = (double*)(forces + fl.force_output_start);
546  for (int i = 0; i < 4 * fl.patch_stride; i++) {
547  if (fabsf(f[i]) > 2.5e2) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: Large %sforce detected (2) !!! f[%d]:%le\n", pe, timestep, isRemote, phase, typeStr, i, f[i]); } // NOTE: Want to print them all, so don't return here
548  }
549 
550  return 0;
551 }
552 
553 __declspec(target(mic))
554 int _verify_buffers_match(const int pe, const int timestep, const int isRemote, const int phase,
555  const char * const buf, const char * const golden, const unsigned int bufSize,
556  const char * const str
557  ) {
558  if (buf == NULL) {
559  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf == NULL (%s) !!!\n", pe, timestep, isRemote, phase, str); return 1;
560  } else if (golden == NULL) {
561  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: golden == NULL (%s) !!!\n", pe, timestep, isRemote, phase, str); return 1;
562  } else if (bufSize < 0) {
563  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: bufSize <= 0 (%s) !!!\n", pe, timestep, isRemote, phase, str); return 1;
564  } else {
565  int mismatchCount = 0;
566  #pragma omp parallel for reduction(+:mismatchCount)
567  for (int i = 0; i < bufSize; i++) {
568  #if 1
569  if (buf[i] != golden[i]) {
570  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
571  pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
572  );
573  mismatchCount++;
574  }
575  #else
576  if (start < 0) {
577  if (buf[i] != golden[i]) {
578  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
579  pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
580  );
581  start = i;
582  mismatchFound = 1;
583  }
584  } else {
585  if (buf[i] == golden[i] || i == bufSize - 1) {
586  if (i < bufSize - 1 && buf[i+1] != golden[i+1]) { // NOTE: Ignore single byte matches found within consecutive mismatches
587  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
588  pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
589  );
590  } else {
591  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf mismatch range %d -> %d (%s) !!!\n",
592  pe, timestep, isRemote, phase, start, ((buf[i] != golden[i] && i == bufSize - 1) ? (i+1) : (i)), str
593  );
594  start = -1;
595  }
596  } else if (buf[i] != golden[i] && i - start < 16) {
597  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
598  pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
599  );
600  }
601  }
602  #endif
603  }
604  if (mismatchCount > 0) { return 1; }
605  }
606  return 0;
607 }
608 
609 __declspec(target(mic))
610 void _verify_tables(const int pe, const int timestep, const int isRemote, const int phase) {
611 
612  // Verify that certain tables/buffers haven't been modified
613  if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__exclusion_bits), (char*)(device__exclusion_bits_copy), device__exclusion_bits_size * sizeof(unsigned int), "exclusion_bits")) { return; }
614  if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__constants), (char*)(device__constants_copy), sizeof(mic_constants), "constants")) { return; }
615  #if MIC_HANDCODE_FORCE_SINGLE != 0
616  if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__table_four_float), (char*)(device__table_four_float_copy), 61 * device__table_four_n_16 * sizeof(float), "table_four(float)")) { return; }
617  if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__lj_table_float), (char*)(device__lj_table_float_copy), device__lj_table_size / sizeof(double) * sizeof(float), "lj_table(float)")) { return; }
618  #else
619  if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__table_four), (char*)(device__table_four_copy), 61 * device__table_four_n_16 * sizeof(double), "table_four(double)")) { return; }
620  if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__lj_table), (char*)(device__lj_table_copy), device__lj_table_size * sizeof(double), "lj_table(double)")) { return; }
621  #endif
622 
623  // Check to see if the constants struct has been placed inside the exclusion_bits buffer
624  // NOTE: This is a specific check related to a specific error that was occuring (leaving in for now)
625  char *p1 = (char*)(device__constants);
626  char *p2 = (char*)(device__exclusion_bits);
627  int pDiff = p1 - p2;
628  if (p1 >= p2 && p1 < p2 + (device__exclusion_bits_size * sizeof(unsigned int))) {
629  printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: !!!! OVERLAP OF EXCLUSIONS AND CONSTANTS DETECTED !!!! - constants:%p(%ld), exclusions:%p(%ld)\n",
630  pe, timestep, isRemote, phase, device__constants, sizeof(device__constants), device__exclusion_bits, device__exclusion_bits_size * sizeof(unsigned int)
631  );
632  }
633 
634  fflush(NULL);
635 }
636 
637 
// Debug-only driver that validates all device-side data structures for one
// kernel invocation: pointer/size sanity, table integrity (_verify_tables),
// each patch pair's pairlists in [check_lo, check_hi) (_verify_pairlist),
// and -- at the end of the timestep (isRemote == 0) -- the force output of
// every force list (_verify_forces). Errors are printed; execution continues.
__declspec(target(mic))
void _verify_data_structures(const int pe, const int timestep, const int isRemote, const int phase,
                             const int check_lo, const int check_hi, const int doSlow
                            ) {

 // Verify that the basic arrays are allocated
 if (device__atoms == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__atoms == NULL !!!\n", pe, timestep, isRemote, phase); return; }
 if (device__atom_params == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__atom_params == NULL !!!\n", pe, timestep, isRemote, phase); return; }
 if (device__forces == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__forces == NULL !!!\n", pe, timestep, isRemote, phase); return; }
 if (device__slow_forces == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__slow_forces == NULL !!!\n", pe, timestep, isRemote, phase); return; }
 if (device__patch_pairs == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs == NULL !!!\n", pe, timestep, isRemote, phase); return; }
 if (device__force_lists == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists == NULL !!!\n", pe, timestep, isRemote, phase); return; }
 // NOTE: device__pairlists is a uintptr_t, so this is effectively a == 0 check
 if (device__pairlists == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__pairlists == NULL !!!\n", pe, timestep, isRemote, phase); return; }

 // Verify sizes (as much as possible)
 if (device__patch_pairs_size <= 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs_size <= 0 !!!\n", pe, timestep, isRemote, phase); return; }
 if (device__force_lists_size <= 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists_size <= 0 !!!\n", pe, timestep, isRemote, phase); return; }

 // Verify the tables
 _verify_tables(pe, timestep, isRemote, phase);

 // For each patch pair
 #if MULTIPLE_THREADS != 0
 #pragma omp parallel for
 #endif
 for (int k = check_lo; k < check_hi; k++) {

 const patch_pair &pp = device__patch_pairs[k];

 // Check indexing
 if (pp.patch1_atom_start < 0 || pp.patch1_atom_start >= device__atoms_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch1_atom_start invalid (%d, size:%d) !!!\n", pe, timestep, isRemote, phase, k, pp.patch1_atom_start, device__atoms_size); }
 if (pp.patch2_atom_start < 0 || pp.patch2_atom_start >= device__atoms_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch2_atom_start invalid (%d, size:%d) !!!\n", pe, timestep, isRemote, phase, k, pp.patch2_atom_start, device__atoms_size); }
 if (pp.patch1_force_start < 0 || pp.patch1_force_start >= device__force_buffers_req_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch1_force_start invalid (%d, size:%lu) !!!\n", pe, timestep, isRemote, phase, k, pp.patch1_force_start, device__force_buffers_req_size); }
 if (pp.patch2_force_start < 0 || pp.patch2_force_start >= device__force_buffers_req_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch2_force_start invalid (%d, size:%lu) !!!\n", pe, timestep, isRemote, phase, k, pp.patch2_force_start, device__force_buffers_req_size); }

 // Verify the pairlists
 // NOTE: pairlists are stored per patch-pair as NUM_PAIRLIST_TYPES
 // consecutive entries (NORM, EXCL, MOD)
 int i_upper = pp.patch1_size;
 int j_upper = pp.patch2_size;
 const int ** const pairlists = (const int ** const)device__pairlists;
 _verify_pairlist(pe, timestep, isRemote, phase, i_upper, j_upper, pairlists[NUM_PAIRLIST_TYPES * k + PL_NORM_INDEX], k, "NORM", pp, PL_NORM_INDEX);
 _verify_pairlist(pe, timestep, isRemote, phase, i_upper, j_upper, pairlists[NUM_PAIRLIST_TYPES * k + PL_EXCL_INDEX], k, "EXCL", pp, PL_EXCL_INDEX);
 _verify_pairlist(pe, timestep, isRemote, phase, i_upper, j_upper, pairlists[NUM_PAIRLIST_TYPES * k + PL_MOD_INDEX], k, "MOD", pp, PL_MOD_INDEX);
 }

 // Verify forces if this is the local kernel (end of timestep)
 if (isRemote == 0) {
 #if MULTIPLE_THREADS != 0
 #pragma omp parallel for
 #endif
 for (int k = 0; k < device__force_lists_size; k++) {
 _verify_forces(pe, timestep, isRemote, phase, k, device__force_buffers, device__forces, "");
 if (doSlow) { _verify_forces(pe, timestep, isRemote, phase, k, device__slow_force_buffers, device__slow_forces, "slow "); }
 }
 }

 fflush(NULL);
}
695 
696 #endif // MIC_DATA_STRUCT_VERIFY
697 
698 
699 
700 // Function called during startup to push the table_four data (lookup table used
701 // during force computation) that has been calculated on the host down to the
702 // given device.
703 // Input:
704 // - deviceNum : the device to push the data to (number within node)
705 // - table_four : pointer to the table data itself
706 // - table_four_n : dimension of the table_four table (used for indexing and
707 // size calculations)
708 // Output: N/A
// Function called during startup to push the table_four data (lookup table used
// during force computation) that has been calculated on the host down to the
// given device.  The host array is transferred once (alloc_if(1) free_if(1))
// and a persistent, precision-appropriate copy is built in device memory.
// Input:
//  - deviceNum : the device to push the data to (number within node)
//  - table_four : pointer to the table data itself
//  - table_four_n : dimension of the table_four table (used for indexing and
//      size calculations)
//  - table_four_n_16 : table_four_n padded up to a multiple of 16 elements
//      (for 64-byte alignment of each sub-table; must be >= table_four_n)
// Output: N/A
void mic_bind_table_four(const int deviceNum,
                         const double *table_four,
                         const int table_four_n,
                         const int table_four_n_16
                        ) {

  // Verify parameters
  __FULL_CHECK(__ASSERT(deviceNum >= 0));
  __FULL_CHECK(__ASSERT(table_four != NULL));
  __FULL_CHECK(__ASSERT(table_four_n > 0));
  __FULL_CHECK(__ASSERT(table_four_n_16 > 0 && table_four_n_16 >= table_four_n));
  __FULL_CHECK(__ASSERT(table_four_n_16 % 16 == 0));  // padding invariant required for aligned sub-tables

  // Copy the table pointer and dimension information into the device variables.
  // Note that there are actually several sub-tables within the table_four
  // table, each a multiple of table_four_n elements in length.
  int numTableFourElems = 61 * table_four_n_16;  // WARNING !!! Must match ComputeNonbondedUtil.C (const 61) !!!

  // Transfer the table_four data and dimension data to the given device.
  // NOTE: the in() clause for table_four is a one-shot transfer (the host copy
  // on the card is freed at the end of the offload region); the persistent
  // device-resident table is allocated explicitly inside the region below.
  #pragma offload target(mic:deviceNum) \
    in(table_four_n_16) nocopy(device__table_four_n_16) \
    in(numTableFourElems) \
    in(table_four[0:numTableFourElems] : alloc_if(1) free_if(1)) \
    nocopy(device__table_four) nocopy(device__table_four_float) \
    nocopy(device__table_four_copy) nocopy(device__table_four_float_copy)
  {
    __MUST_BE_MIC;

    // Copy and verify the table_four_n_16 value
    device__table_four_n_16 = table_four_n_16;
    __FULL_CHECK(__ASSERT(device__table_four_n_16 > 0));

    // Allocate memory on the device for the table_four table.  Depending on the precision being used
    // for the table, either create a 64-bit or 32-bit copy of the data on the device.
    __FULL_CHECK(__ASSERT(device__table_four == NULL && device__table_four_float == NULL));  // Check for double allocation
    #if MIC_HANDCODE_FORCE_SINGLE != 0

      // Allocate and copy data into a 32-bit version of the table
      // (each element is narrowed from the host's double-precision value)
      device__table_four_float = (float*)(_MM_MALLOC_WRAPPER(sizeof(float) * numTableFourElems, 64, "device__table_four_float"));
      __ASSERT(device__table_four_float != NULL);
      for (int i = 0; i < numTableFourElems; i++) {
        device__table_four_float[i] = (float)(table_four[i]);
      }

      // Create a second copy for verification purposes
      #if MIC_DATA_STRUCT_VERIFY != 0
        device__table_four_float_copy = (float*)(_MM_MALLOC_WRAPPER(sizeof(float) * numTableFourElems, 64, "device__table_four_float_copy"));
        __ASSERT(device__table_four_float_copy != NULL);
        memcpy(device__table_four_float_copy, device__table_four_float, numTableFourElems * sizeof(float));
      #endif

    #else  // MIC_HANDCODE_FORCE_SINGLE != 0

      // Allocate and copy data into a 64-bit version of the table
      device__table_four = (double*)(_MM_MALLOC_WRAPPER(sizeof(double) * numTableFourElems, 64, "device__table_four"));
      __ASSERT(device__table_four != NULL);
      for (int i = 0; i < numTableFourElems; i++) {
        device__table_four[i] = table_four[i];
      }
      // Create a second copy for verification purposes
      #if MIC_DATA_STRUCT_VERIFY != 0
        device__table_four_copy = (double*)(_MM_MALLOC_WRAPPER(sizeof(double) * numTableFourElems, 64, "device__table_four_copy"));
        __ASSERT(device__table_four_copy != NULL);
        memcpy(device__table_four_copy, device__table_four, numTableFourElems * sizeof(double));
      #endif

    #endif  // MIC_HANDCODE_FORCE_SINGLE != 0

  }  // end pragma offload
}
778 
779 
780 // Function called during startup to push the lj_table data (lookup table used
781 // during force computation) that has been calculated on the host down to the
782 // given device.
783 // Input:
784 // - deviceNum : the device to push the data to (number within node)
785 // - lj_table : pointer to the table data itself (64-bit version on the host)
786 // - lj_table_dim : dimension of the lj_table (used for indexing)
787 // - lj_table_size : size of the lj_table
788 // Output: N/A
789 void mic_bind_lj_table(const int deviceNum,
790  const char * lj_table,
791  const int lj_table_dim,
792  const int lj_table_size
793  ) {
794 
795  // Verify Parameters
796  __FULL_CHECK(__ASSERT(deviceNum >= 0));
797  __FULL_CHECK(__ASSERT(lj_table != NULL));
798  __FULL_CHECK(__ASSERT(lj_table_dim > 0));
799  __FULL_CHECK(__ASSERT(lj_table_size > 0));
800 
801  // Transfer the table data, dimension, and size information to the given device.
802  #pragma offload target(mic:deviceNum) \
803  in(lj_table_dim) nocopy(device__lj_table_dim) \
804  in(lj_table_size) nocopy(device__lj_table_size) \
805  in(lj_table[0:lj_table_size] : alloc_if(1) free_if(1)) \
806  nocopy(device__lj_table) nocopy(device__lj_table_float) \
807  nocopy(device__lj_table_copy) nocopy(device__lj_table_float_copy) \
808  nocopy(device__pe)
809  {
810  __MUST_BE_MIC;
811 
812  // Copy and verify the LJ table size and dimension info
813  device__lj_table_dim = lj_table_dim;
814  device__lj_table_size = lj_table_size;
815  __FULL_CHECK(__ASSERT(device__lj_table_dim > 0 && device__lj_table_size > 0));
816 
817  // Allocate memory on the device for the LJ table. Depending on the precision being used
818  // for the table, either create a 64-bit or 32-bit copy of the data on the device.
819  __FULL_CHECK(__ASSERT(device__lj_table == NULL && device__lj_table_float == NULL)); // Check for double allocation
820  int numElements = device__lj_table_size * sizeof(char) / sizeof(double);
821  #if MIC_HANDCODE_FORCE_SINGLE != 0
822 
823  // Allocate and copy the data into a 32-bit version of the table
824  device__lj_table_float = (float*)(_MM_MALLOC_WRAPPER(sizeof(float) * numElements, 64, "device__lj_table_float"));
825  __ASSERT(device__lj_table_float != NULL);
826  for (int i = 0; i < numElements; i++) {
827  device__lj_table_float[i] = (float)(((double*)lj_table)[i]);
828  }
829 
830  // Create a copy of the table for verification purposes later
831  #if MIC_DATA_STRUCT_VERIFY != 0
832  device__lj_table_float_copy = (float*)(_MM_MALLOC_WRAPPER(sizeof(float) * numElements, 64, "device__lj_table_float_copy"));
833  __ASSERT(device__lj_table_float_copy != NULL);
834  memcpy(device__lj_table_float_copy, device__lj_table_float, sizeof(float) * numElements);
835  #endif
836 
837  #else // MIC_HANDCODE_FORCE_SINGLE != 0
838 
839  // Allocate and copy the data into a 64-bit version of the table
840  device__lj_table = (double*)(_MM_MALLOC_WRAPPER(sizeof(double) * numElements, 64, "device__lj_table"));
841  __ASSERT(device__lj_table_float != NULL);
842  for (int i = 0; i < numElements; i++) {
843  device__lj_table[i] = ((double*)lj_table)[i];
844  }
845 
846  // Create a copy of the table for verificiation purposes later
847  #if MIC_DATA_STRUCT_VERIFY != 0
848  device__lj_table_copy = (double*)(_MM_MALLOC_WRAPPER(sizeof(double) * numElements, 64, "device__lj_table_copy"));
849  __ASSERT(device__lj_table_copy != NULL);
850  memcpy(device__lj_table_copy, device__lj_table, sizeof(double) * numElements);
851  #endif
852 
853  #endif // MIC_HANDCODE_FORCE_SINGLE != 0
854 
855  } // end pragma offload
856 }
857 
858 
859 // Function called during startup to push the exclusion data (lookup table used
860 // during pairlist generation) that has been calculated on the host down to the
861 // given device.
862 // Input:
863 // - deviceNum : the device to push the data to (number within node)
864 // - exclusion_bits : pointer to the exclusion data itself
// - exclusion_bits_size : size of the exclusion data
866 // Output: N/A
867 void mic_bind_exclusions(const int deviceNum,
868  unsigned int *exclusion_bits,
869  const long int exclusion_bits_size
870  ) {
871 
872  // Verify Parameters
873  __FULL_CHECK(__ASSERT(deviceNum >= 0));
874  __FULL_CHECK(__ASSERT(exclusion_bits != NULL));
875  __FULL_CHECK(__ASSERT(exclusion_bits_size > 0));
876 
877  // Transfer the exclusion data and size information down to the given device.
878  #pragma offload target(mic:deviceNum) \
879  in(exclusion_bits_size) nocopy(device__exclusion_bits_size) \
880  in(exclusion_bits[0:exclusion_bits_size] : alloc_if(1) free_if(1)) \
881  nocopy(device__exclusion_bits) nocopy(device__exclusion_bits_copy)
882  {
883  __MUST_BE_MIC;
884 
885  // Copy and verify the table size info
886  device__exclusion_bits_size = exclusion_bits_size;
887  __FULL_CHECK(__ASSERT(device__exclusion_bits_size > 0));
888 
889  // Create a copy of the exclusion bits buffer on the device
890  __ASSERT(device__exclusion_bits == NULL); // Check for double allocate
891  device__exclusion_bits = (unsigned int *)_MM_MALLOC_WRAPPER(sizeof(unsigned int) * device__exclusion_bits_size, 64, "device__exclusion_bits");
892  __ASSERT(device__exclusion_bits != NULL);
893  memcpy(device__exclusion_bits, exclusion_bits, sizeof(unsigned int) * device__exclusion_bits_size);
894 
895  // Create a copy of the table for verification purposes later
896  #if MIC_DATA_STRUCT_VERIFY != 0
897  device__exclusion_bits_copy = (unsigned int*)(_MM_MALLOC_WRAPPER(device__exclusion_bits_size * sizeof(unsigned int), 64, " device__exclusion_bits_copy"));
898  __ASSERT(device__exclusion_bits_copy != NULL);
899  memcpy(device__exclusion_bits_copy, device__exclusion_bits, sizeof(unsigned int) * device__exclusion_bits_size);
900  #else
901  device__exclusion_bits_copy = NULL;
902  #endif
903 
904  } // end pragma offload
905 }
906 
907 
908 // Function called during startup to push (and calculate) several variables that
909 // are constant and used throughout the simulation.
910 // Input:
911 // - deviceNum : the device to push the data to (number within node)
912 // ... : remaining parameters are simply simulation constants
913 // Output: N/A
// Function called during startup to push (and calculate) several variables that
// are constant and used throughout the simulation.  The constants are staged
// in a local struct on the host, transferred by value, and then copied into a
// persistent, 64-byte-aligned allocation on the device.
// Input:
//  - deviceNum : the device to push the data to (number within node)
//  ... : remaining parameters are simply simulation constants
// Output: N/A
void mic_bind_constants(const int deviceNum,
                        const double cutoff2,
                        const double dielectric_1,
                        const double scaling,
                        const double scale14,
                        const double r2_delta,
                        const int r2_delta_exp,
                        const int commOnly
                       ) {

  // Verify Parameters
  __FULL_CHECK(__ASSERT(deviceNum >= 0));

  // Create a mic_constants data structure and copy (or calculate) the
  // various constants that are to be pushed to the device in to this
  // data structure.
  mic_constants constants;
  constants.cutoff2 = cutoff2;
  constants.dielectric_1 = dielectric_1;
  constants.scaling = scaling;
  constants.scale14 = scale14;
  constants.modf_mod = 1.0 - scale14;  // modified-1-4 correction factor derived from scale14
  constants.r2_delta = r2_delta;
  constants.r2_delta_exp = r2_delta_exp;
  constants.r2_delta_expc = 64 * (r2_delta_exp - 1023);  // 1023 = IEEE-754 double-precision exponent bias
  constants.commOnly = commOnly;
  constants.singleKernelFlag = singleKernelFlag;  // file-scope flag: single combined kernel vs. remote/local split

  // Transfer the constants to the given device
  // NOTE(review): several nocopy clauses below (e.g. device__pe, the *_copy
  // table pointers) are not referenced in this region's body; presumably they
  // establish device-variable associations for later offload regions — confirm
  // before removing any of them.
  #pragma offload target(mic:deviceNum) \
    in(constants) nocopy(device__constants) \
    nocopy(device__pe) nocopy(device__exclusion_bits) \
    nocopy(device__exclusion_bits_copy) nocopy(device__exclusion_bits_size) \
    nocopy(device__table_four_copy) nocopy(device__table_four_float_copy) \
    nocopy(device__lj_table_copy) nocopy(device__lj_table_float_copy) \
    nocopy(device__constants_copy)
  {
    __MUST_BE_MIC;

    // Create a copy of the constant info on the device
    __ASSERT(device__constants == NULL);  // Check for double allocation
    device__constants = (mic_constants*)_MM_MALLOC_WRAPPER(sizeof(mic_constants), 64, "device__constants");
    __ASSERT(device__constants != NULL);
    memcpy(device__constants, &constants, sizeof(mic_constants));

    // Correct r2_delta_expc for use with single precision if mixed precision is
    // being used on the device (127 = IEEE-754 single-precision exponent bias,
    // replacing the double-precision bias of 1023 used on the host)
    #if MIC_HANDCODE_FORCE_SINGLE != 0
      device__constants->r2_delta_expc = 64 * (device__constants->r2_delta_exp - 127);
    #endif

    // Copy the data for verification purposes later
    #if MIC_DATA_STRUCT_VERIFY != 0
      device__constants_copy = (mic_constants*)_MM_MALLOC_WRAPPER(sizeof(mic_constants), 64, "device__constants_copy");
      __ASSERT(device__constants_copy != NULL);
      memcpy(device__constants_copy, device__constants, sizeof(mic_constants));
    #endif

  }  // end pragma offload
}
974 
975 
976 // Function called after patch pairs have been modified on the host to push those
977 // modifications to the MIC device.
978 // Input:
979 // - deviceNum : the device to push the data to (number within node)
980 // - patch_pairs : array containing individual patch_pair records (data to copy)
981 // - patch_pairs_size : the length (in elements) of the patch_pairs array (valid/used elements)
982 // - patch_pairs_bufSize : the actual length (in elements) of the patch_pairs array (allocated)
983 // Output: N/A
// Function called after patch pairs have been modified on the host to push those
// modifications to the MIC device.
// Input:
//  - deviceNum : the device to push the data to (number within node)
//  - patch_pairs : array containing individual patch_pair records (data to copy)
//  - patch_pairs_size : the length (in elements) of the patch_pairs array (valid/used elements)
//  - patch_pairs_bufSize : the actual length (in elements) of the patch_pairs array (allocated)
// Output: N/A
void mic_bind_patch_pairs_only(const int deviceNum,
                               const patch_pair *patch_pairs,
                               const int patch_pairs_size,
                               const int patch_pairs_bufSize
                              ) {

  // NOTE: This function does not actually transfer the patch pairs, it only (re)allocates
  //   the device buffer(s) associated with them.  See mic_nonbonded_compute() for transfer.

  // Verify parameters
  __FULL_CHECK(__ASSERT(deviceNum >= 0));
  __FULL_CHECK(__ASSERT(patch_pairs != NULL));
  __FULL_CHECK(__ASSERT(patch_pairs_size > 0));
  __FULL_CHECK(__ASSERT(patch_pairs_bufSize > 0));
  __FULL_CHECK(__ASSERT(patch_pairs_bufSize >= patch_pairs_size));

  // Check if the buffer currently allocated on the device is too small.  If so,
  //   reallocate the buffer to be larger.
  if (patch_pairs_bufSize > host__patch_pairs_bufSize) {  // If buffer is too small, reallocate

    // Setup clause required to free device_times_computes buffer on the device
    // (clause expands to nothing when detailed device tracing is disabled)
    #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
      double * device_times_computes = host__device_times_computes;
      #define TIMES_COMPUTES_FREE_CLAUSE nocopy(device_times_computes : alloc_if(0) free_if(1))
    #else
      #define TIMES_COMPUTES_FREE_CLAUSE
    #endif

    // Free old buffers
    // NOTE: the device association is keyed on the old host pointer, so the
    // free must use host__patch_pairs (the previous pointer), not patch_pairs.
    if (host__patch_pairs != NULL) {

      const patch_pair * _patch_pairs = host__patch_pairs;
      #pragma offload target(mic:deviceNum) \
        nocopy(_patch_pairs : alloc_if(0) free_if(1)) \
        TIMES_COMPUTES_FREE_CLAUSE
      { __MUST_BE_MIC; }
    }

    // (Re)allocate memory for device_times_computes buffer on the host, and setup clause for device allocation
    // (2 doubles per patch pair: start/stop timestamps)
    #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
      if (host__device_times_computes != NULL) { _MM_FREE_WRAPPER(host__device_times_computes); }
      host__device_times_computes = (double*)_MM_MALLOC_WRAPPER(2 * patch_pairs_bufSize * sizeof(double), 64, "host__device_times_computes");
      __ASSERT(host__device_times_computes != NULL);
      device_times_computes = host__device_times_computes;
      #define TIMES_COMPUTES_ALLOC_CLAUSE nocopy(device_times_computes[0:2*patch_pairs_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN))
    #else
      #define TIMES_COMPUTES_ALLOC_CLAUSE
    #endif

    // Record the new host-side pointer and buffer size
    host__patch_pairs = (patch_pair*)patch_pairs;
    host__patch_pairs_bufSize = patch_pairs_bufSize;

    // Allocate new memory (persistent device buffer; data transferred later)
    #pragma offload target(mic:deviceNum) \
      in(patch_pairs_size) nocopy(device__patch_pairs_size) \
      in(patch_pairs_bufSize) \
      nocopy(patch_pairs[0:patch_pairs_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
      nocopy(device__pairlists) nocopy(device__pairlists_alloc_size) \
      nocopy(device__numOMPThreads) nocopy(device__pe) \
      TIMES_COMPUTES_ALLOC_CLAUSE
    {
      __MUST_BE_MIC;

      // Copy and verify the number of patch pairs
      device__patch_pairs_size = patch_pairs_size;
      __ASSERT(device__patch_pairs_size > 0);

      // Make sure there are enough pairlists pointers (one per pairlist type per patch_pair).
      // The pointer array only grows; existing pairlist pointers are preserved
      // and only the newly added tail entries are NULL-initialized.
      const int numPairlists = NUM_PAIRLIST_TYPES * device__patch_pairs_size;
      if (numPairlists > device__pairlists_alloc_size) {
        int **new_pairlists = (int**)(_MM_MALLOC_WRAPPER(numPairlists * sizeof(int*), 64, "device__pairlists"));
        int initStart = 0;
        if (device__pairlists != 0) {  // device__pairlists is stored as an integer (uintptr_t)
          int **old_pairlists = (int**)(device__pairlists);
          memcpy((void*)new_pairlists, (void*)old_pairlists, sizeof(int*) * device__pairlists_alloc_size);
          _MM_FREE_WRAPPER(old_pairlists);
          initStart = device__pairlists_alloc_size;
        }
        for (int i = initStart; i < numPairlists; i++) { new_pairlists[i] = NULL; }
        device__pairlists = (uintptr_t)new_pairlists;
        device__pairlists_alloc_size = numPairlists;
      }

    }  // end pragma offload

    #undef TIMES_COMPUTES_FREE_CLAUSE
    #undef TIMES_COMPUTES_ALLOC_CLAUSE
  }

  // Record the current 'used' length of the array and flag the array for data transfer
  host__patch_pairs_size = patch_pairs_size;
  patch_pairs_copySize = patch_pairs_size;
}
1078 
1079 
1080 // Function called after force lists have been modified on the host to push those
1081 // modifications to the MIC device.
1082 // Input:
1083 // - deviceNum : the device to push the data to (number within node)
1084 // - force_lists : array containing individual force_list records (data to copy)
1085 // - force_lists_size : the length (in elements) of the force_lists array (valid/used elements)
1086 // - force_lists_bufSize : the actual length (in elements) of the force_lists array (allocated)
1087 // Output: N/A
// Function called after force lists have been modified on the host to push those
// modifications to the MIC device.
// Input:
//  - deviceNum : the device to push the data to (number within node)
//  - force_lists : array containing individual force_list records (data to copy)
//  - force_lists_size : the length (in elements) of the force_lists array (valid/used elements)
//  - force_lists_bufSize : the actual length (in elements) of the force_lists array (allocated)
// Output: N/A
void mic_bind_force_lists_only(const int deviceNum,
                               force_list *force_lists,
                               const int force_lists_size,
                               const int force_lists_bufSize
                              ) {

  // NOTE: This function does not actually transfer the force lists, it only (re)allocates
  //   the device buffer(s) associated with them.  See mic_nonbonded_compute() for transfer.

  // Verify parameters
  __FULL_CHECK(__ASSERT(deviceNum >= 0));
  __FULL_CHECK(__ASSERT(force_lists != NULL));
  __FULL_CHECK(__ASSERT(force_lists_size > 0));
  __FULL_CHECK(__ASSERT(force_lists_bufSize > 0));
  __FULL_CHECK(__ASSERT(force_lists_bufSize >= force_lists_size));

  // Check if the buffer currently allocated on the device is too small.  If so, reallocate the buffer.
  if (force_lists_bufSize > host__force_lists_bufSize) {

    // Setup clause needed to free the device_times_patches buffer on the device
    // (clause expands to nothing when detailed device tracing is disabled)
    #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
      double * device_times_patches = host__device_times_patches;
      #define TIMES_PATCHES_FREE_CLAUSE nocopy(device_times_patches : alloc_if(0) free_if(1))
    #else
      #define TIMES_PATCHES_FREE_CLAUSE
    #endif

    // Free the old buffer
    // NOTE: the device association is keyed on the old host pointer, so the
    // free must use host__force_lists (the previous pointer), not force_lists.
    if (host__force_lists != NULL) {
      const force_list * _force_lists = host__force_lists;
      #pragma offload target(mic:deviceNum) \
        nocopy(_force_lists : alloc_if(0) free_if(1)) \
        TIMES_PATCHES_FREE_CLAUSE
      { __MUST_BE_MIC; }
    }

    // (Re)allocate the device_times_patches buffer on the host and setup the clause required to allocate on the device
    // (2 doubles per force list: start/stop timestamps)
    #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
      if (host__device_times_patches != NULL) { _MM_FREE_WRAPPER(host__device_times_patches); }
      host__device_times_patches = (double*)_MM_MALLOC_WRAPPER(2 * force_lists_bufSize * sizeof(double), 64, "host__device_times_patches");
      __ASSERT(host__device_times_patches != NULL);
      device_times_patches = host__device_times_patches;
      #define TIMES_PATCHES_ALLOC_CLAUSE nocopy(device_times_patches[0:2*force_lists_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN))
    #else
      #define TIMES_PATCHES_ALLOC_CLAUSE
    #endif

    // Record the new host-side pointer and buffer size
    host__force_lists = force_lists;
    host__force_lists_bufSize = force_lists_bufSize;

    // Allocate the new memory (persistent device buffer; data transferred later)
    #pragma offload target(mic:deviceNum) \
      in(force_lists_size) nocopy(device__force_lists_size) \
      nocopy(force_lists[0:force_lists_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
      TIMES_PATCHES_ALLOC_CLAUSE
    {
      __MUST_BE_MIC;

      // Copy and verify the number of force lists
      device__force_lists_size = force_lists_size;
      __ASSERT(device__force_lists_size > 0);
    }  // end pragma offload

    #undef TIMES_PATCHES_FREE_CLAUSE
    #undef TIMES_PATCHES_ALLOC_CLAUSE

  }  // end if (reallocation needed)

  // Record the current 'used' length of the array and flag the array for data transfer
  host__force_lists_size = force_lists_size;
  force_lists_copySize = force_lists_size;
}
1161 
1162 
1163 // Function called after the atom list has been modified on the host to push those
1164 // modifications to the MIC device.
1165 // Input:
1166 // - deviceNum : the device to push the data to (number within node)
1167 // - atoms : array containing individual atom records (data to copy)
1168 // - atom_params : array containing individual atom_param records (data to copy)
1169 // - forces : array that will contain output data from the device (data to copy back)
1170 // - slow_forces : array that will contain output data from the device (data to copy back)
1171 // - atoms_size : the length (in elements) of the specified arrays (valid/used elements)
1172 // - atoms_bufSize : the actual length (in elements) of the specified arrays (allocated)
1173 // Output: N/A
// Function called after the atom list has been modified on the host to push those
// modifications to the MIC device.
// Input:
//  - deviceNum : the device to push the data to (number within node)
//  - atoms : array containing individual atom records (data to copy)
//  - atom_params : array containing individual atom_param records (data to copy)
//  - forces : array that will contain output data from the device (data to copy back)
//  - slow_forces : array that will contain output data from the device (data to copy back)
//  - atoms_size : the length (in elements) of the specified arrays (valid/used elements)
//  - atoms_bufSize : the actual length (in elements) of the specified arrays (allocated)
// Output: N/A
void mic_bind_atoms_only(const int deviceNum,
                         atom *atoms,
                         atom_param *atom_params,
                         double4 *forces,
                         double4 *slow_forces,
                         const int atoms_size,
                         const int atoms_bufSize
                        ) {

  // NOTE: This function does not actually transfer the atom/force data, it only (re)allocates
  //   the device buffer(s) associated with them.  See mic_nonbonded_compute() for transfer.

  // Verify parameters
  __FULL_CHECK(__ASSERT(deviceNum >= 0));
  __FULL_CHECK(__ASSERT(atoms != NULL));
  __FULL_CHECK(__ASSERT(atom_params != NULL));
  __FULL_CHECK(__ASSERT(forces != NULL));
  __FULL_CHECK(__ASSERT(slow_forces != NULL));
  __FULL_CHECK(__ASSERT(atoms_size > 0));
  __FULL_CHECK(__ASSERT(atoms_bufSize > 0));
  __FULL_CHECK(__ASSERT(atoms_bufSize >= atoms_size));

  // Check if the buffers are large enough (reallocate if not)
  if (atoms_bufSize > host__atoms_bufSize) {

    // Free the old buffers
    // NOTE: device associations are keyed on the previous host pointers, so the
    // frees must use the host__* saved pointers, not the new parameters.
    if (host__atoms != NULL) {
      const atom * _atoms = host__atoms;
      const atom_param * _atom_params = host__atom_params;
      const double4 * _forces = host__forces;
      const double4 * _slow_forces = host__slow_forces;

      #pragma offload target(mic:deviceNum) \
        nocopy(_atoms : alloc_if(0) free_if(1)) \
        nocopy(_atom_params : alloc_if(0) free_if(1)) \
        nocopy(_forces : alloc_if(0) free_if(1)) \
        nocopy(_slow_forces : alloc_if(0) free_if(1))
      { __MUST_BE_MIC; }
    }

    // Copy the new buffer and size info
    host__atoms = atoms;
    host__atom_params = atom_params;
    host__forces = forces;
    host__slow_forces = slow_forces;
    host__atoms_size = atoms_size;
    host__atoms_bufSize = atoms_bufSize;

    // Allocate the new buffers (persistent on device; contents transferred later)
    #pragma offload target(mic:deviceNum) \
      in(atoms_size : into(device__atoms_size)) \
      nocopy(atoms[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
      nocopy(atom_params[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
      nocopy(forces[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
      nocopy(slow_forces[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN))
    { __MUST_BE_MIC; }
  }

  // Signal the copy (record the 'used' length and flag atom_params for transfer)
  host__atoms_size = atoms_size;
  atom_params_copySize = atoms_size;
}
1236 
1237 
1238 // Function called to allocate memory on the MIC device related to the individual and private
1239 // force buffers for each of the compute objects. Note that this function simply allocates
1240 // the memory on the device, as there is no host-side equivalent buffer (i.e. device only).
1241 // Input:
1242 // - deviceNum : the device to push the data to (number within node)
1243 // - force_buffers_size : the amount of memory to be allocated on the device for the buffer
1244 void mic_bind_force_buffers_only(const int deviceNum,
1245  const size_t force_buffers_size
1246  ) {
1247 
1248  // Check if the requested size is larger than the already allocated size. If so, flag the buffer
1249  // as needing to be reallocated by increasing the request size passed to the device (occurs later).
1250  if (host__force_buffers_req_size < force_buffers_size) {
1251  host__force_buffers_req_size = (int)(force_buffers_size * 1.1f);
1252  host__force_buffers_req_size = (host__force_buffers_req_size + 4095) & (~4095); // Round up to 4K
1253  }
1254 }
1255 
1256 
1257 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1258 
// Transfers one patch's data buffer to the device, (re)allocating the device-side
// buffer first if the host buffer moved or grew.  Called once per patch per
// timestep by one of the host threads.
// Input:
//  - deviceNum : the device to push the data to (number within node)
//  - hostPtr : host buffer to transfer
//  - hostPtr_prev : (in/out) host buffer transferred last time (used to detect reallocation)
//  - transferBytes : number of bytes to actually transfer this timestep
//  - allocBytes_host : current allocated size of the host buffer
//  - allocBytes_device : (in/out) allocated size of the device-side copy
//  - devicePtr : (in/out) device-side address of the buffer (0 if not yet allocated)
//  - signal : (out) signal handle the caller must wait on, or NULL if the
//      transfer completed synchronously
void mic_submit_patch_data(const int deviceNum,
                           void * const hostPtr,
                           void* &hostPtr_prev,
                           const int transferBytes,
                           const int allocBytes_host,
                           int &allocBytes_device,
                           uint64_t &devicePtr,
                           void* &signal
                          ) {

  // NOTE: This function is called once per patch per timestep by one of the host threads
  // TODO : Since signals are thread specific, cannot use them here since the master does
  //   the checking for signal completions.  Figure out a way to make this asynchronous,
  //   but just do synchronous transfer for now to see what happens.

  // Check to see if the device's copy of the buffer needs to be reallocated
  // NOTE: If we are in a position where the buffer might grow in size, the contents
  //   need to be rewritten in full (i.e. atom migration has occurred), so don't worry
  //   about the previous contents of the buffer
  int allocFlag = 0;
  if (hostPtr != hostPtr_prev || allocBytes_host > allocBytes_device) {

    // Free the old device buffer if it has been allocated previously
    // (zero-length out() clause frees the association without transferring data)
    if (devicePtr != 0) {
      char* __hostPtr_prev = (char*)hostPtr_prev;
      #pragma offload target(mic:deviceNum) \
        out(__hostPtr_prev[0:0] : alloc_if(0) free_if(1))
      { }
    }

    // Flag that allocation is needed
    allocFlag = 1;
  }

  // Transfer the data (allocating if needed)
  char* __hostPtr = (char*)hostPtr;
  if (allocFlag != 0) {

    // Synchronous allocate+transfer of the full host buffer; the offload body
    // captures the device-side address of the buffer for use by later kernels.
    uint64_t __devicePtr = 0;
    allocBytes_device = allocBytes_host;
    #pragma offload target(mic:deviceNum) \
      in(__hostPtr[0:allocBytes_host] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
      out(__devicePtr)
    { __devicePtr = (uint64_t)__hostPtr; }

    // Record the pointer for use later
    devicePtr = __devicePtr;
    signal = NULL;  // Synchronous, so no need for the host to wait

  } else {

    // Asynchronous transfer of only the bytes that changed this timestep,
    // tagged with hostPtr as the signal handle
    #pragma offload_transfer target(mic:deviceNum) \
      in(__hostPtr[0:transferBytes] : alloc_if(0) free_if(0) align(MIC_ALIGN)) \
      signal(hostPtr)
    { }

    signal = hostPtr;  // Asynchronous, so force the host to wait on the signal later
  }

  // Take note of the host buffer that was transferred
  hostPtr_prev = hostPtr;
}
1321 
1322 #endif // MIC_SUBMIT_ATOMS_ON_ARRIVAL
1323 
1324 
// Kernel bodies : Include ComputeNonbondedMICKernelBase.h to declare each
1326 // version of the force computation functions (calc_pair, calc_self, etc.).
1327 // Each time the header file is included, different macros are set/unset
1328 // to control the exact contents of 'that version of the kernel.'
1329 
1330 #define NBPAIR 1
1331 #define NBSELF 2
1332 
1333 #define NBTYPE NBPAIR
1335  #define CALCENERGY
1337  #undef CALCENERGY
1338  #define FULLELECT
1340  #define CALCENERGY
1342  #undef CALCENERGY
1343  #undef FULLELECT
1344 #undef NBTYPE
1345 
1346 #define NBTYPE NBSELF
1348  #define CALCENERGY
1350  #undef CALCENERGY
1351  #define FULLELECT
1353  #define CALCENERGY
1355  #undef CALCENERGY
1356  #undef FULLELECT
1357 #undef NBTYPE
1358 
1359 
1360 // This function is the main "computation" function, called each timestep to
1361 // initiate computation on the MIC device. A signal is setup as part of
1362 // the offload pragma, which is periodically checked by the
1363 // mic_check_remote_kernel_complete and mic_check_local_kernel_complete
1364 // functions (which are called periodically by the Charm++ runtime system),
1365 // so progress can continue once the device has finished its computation.
1366 // Input:
1367 // - deviceNum : the device to push the data to (number within node)
1368 // - isRemote : flag indicating a 'remote' (1) or 'local' (0) kernel invocation (currently, must be 'local')
1369 // - lata : lattice information ('a' along with latb and latc)
1370 // - latb : lattice information ('b' along with lata and latc)
1371 // - latc : lattice information ('c' along with lata and latb)
1372 // - doSlow : flag indicating if slow forces should be calculated in this kernel invocation
1373 // - doEnergy : flag indiciating if energy information should be calculated in this kernel invocation
1374 // - usePairlists : flag indicating if pairlists should be used within this kernel invocation (currently, must be)
1375 // - savePairlists : flag indicating if pairlists should be saved across kernel invocations (currently, must be)
1376 // - atomsChanged : flag indicating whether or not the atoms (and related) buffers have changed (length, not content)
1377 // Output: N/A
1378 void mic_nonbonded_forces(const int deviceNum,
1379  const int isRemote,
1380  const int numLocalAtoms,
1381  const int numLocalComputes,
1382  const int numLocalPatches,
1383  const mic_position3_t lata,
1384  const mic_position3_t latb,
1385  const mic_position3_t latc,
1386  const int doSlow,
1387  const int doEnergy,
1388  const int usePairlists,
1389  const int savePairlists,
1390  const int atomsChanged
1391  ) {
1392 
1393  const int numAtoms = host__atoms_size;
1394  const int numPatchPairs = host__patch_pairs_size;
1395  const int numForceLists = host__force_lists_size;
1396 
1397  // Get and set the tag to use
1398  int *tag_kernel = ((isRemote) ? (&tag_remote_kernel) : (&tag_local_kernel));
1399  *tag_kernel = 1;
1400 
1401  // Setup the kernel data structures
1402  mic_kernel_data * remote_kernel_data = host__kernel_data + 1; //host__remote_kernel_data;
1403  mic_kernel_data * local_kernel_data = host__kernel_data; //host__local_kernel_data;
1404  mic_kernel_data * kernel_data = ((isRemote) ? (remote_kernel_data) : (local_kernel_data));
1405  mic_kernel_data * _kernel_data = host__kernel_data;
1406  #if MIC_SYNC_INPUT != 0
1407  // NOTE: With MIC_SYNC_INPUT, all input is pushed before either of the "local" or "remote" kernels
1408  // are executed, so setup both kernel_data structs as the "remote" kernel is being issued
1409  if (singleKernelFlag != 0 || isRemote) {
1410  remote_kernel_data->isRemote = 1;
1411  local_kernel_data->isRemote = 0;
1412  remote_kernel_data->numLocalAtoms = local_kernel_data->numLocalAtoms = numLocalAtoms;
1413  remote_kernel_data->numLocalComputes = local_kernel_data->numLocalComputes = numLocalComputes;
1414  remote_kernel_data->numLocalPatches = local_kernel_data->numLocalPatches = numLocalPatches;
1415  remote_kernel_data->doSlow = local_kernel_data->doSlow = doSlow;
1416  remote_kernel_data->doEnergy = local_kernel_data->doEnergy = doEnergy;
1417  remote_kernel_data->usePairlists = local_kernel_data->usePairlists = usePairlists;
1418  remote_kernel_data->savePairlists = local_kernel_data->savePairlists = savePairlists;
1419  remote_kernel_data->lata.x = local_kernel_data->lata.x = lata.x;
1420  remote_kernel_data->lata.y = local_kernel_data->lata.y = lata.y;
1421  remote_kernel_data->lata.z = local_kernel_data->lata.z = lata.z;
1422  remote_kernel_data->latb.x = local_kernel_data->latb.x = latb.x;
1423  remote_kernel_data->latb.y = local_kernel_data->latb.y = latb.y;
1424  remote_kernel_data->latb.z = local_kernel_data->latb.z = latb.z;
1425  remote_kernel_data->latc.x = local_kernel_data->latc.x = latc.x;
1426  remote_kernel_data->latc.y = local_kernel_data->latc.y = latc.y;
1427  remote_kernel_data->latc.z = local_kernel_data->latc.z = latc.z;
1428  remote_kernel_data->numAtoms = local_kernel_data->numAtoms = numAtoms;
1429  remote_kernel_data->numPatchPairs = local_kernel_data->numPatchPairs = numPatchPairs;
1430  remote_kernel_data->numForceLists = local_kernel_data->numForceLists = numForceLists;
1431  remote_kernel_data->forceBuffersReqSize = local_kernel_data->forceBuffersReqSize = host__force_buffers_req_size;
1432  }
1433  #else
1434  kernel_data->isRemote = isRemote;
1435  kernel_data->numLocalAtoms = numLocalAtoms;
1436  kernel_data->numLocalComputes = numLocalComputes;
1437  kernel_data->numLocalPatches = numLocalPatches;
1438  kernel_data->doSlow = doSlow;
1439  kernel_data->doEnergy = doEnergy;
1440  kernel_data->usePairlists = usePairlists;
1441  kernel_data->savePairlists = savePairlists;
1442  kernel_data->lata.x = lata.x;
1443  kernel_data->lata.y = lata.y;
1444  kernel_data->lata.z = lata.z;
1445  kernel_data->latb.x = latb.x;
1446  kernel_data->latb.y = latb.y;
1447  kernel_data->latb.z = latb.z;
1448  kernel_data->latc.x = latc.x;
1449  kernel_data->latc.y = latc.y;
1450  kernel_data->latc.z = latc.z;
1451  kernel_data->numAtoms = numAtoms;
1452  kernel_data->numPatchPairs = numPatchPairs;
1453  kernel_data->numForceLists = numForceLists;
1454  kernel_data->forceBuffersReqSize = host__force_buffers_req_size;
1455  #endif
1456 
1457  // For buffers that are only periodically copied/updated, get the
1458  // flags that indicate whether or not those buffers should be
 1459  // transferred during this kernel invocation, and then reset them
1460  const int slowForcesNumAtoms = ((doSlow) ? (numAtoms) : (0));
1461  const int _patch_pairs_copySize = patch_pairs_copySize; // from __thread variable to stack variable for use within offload pragma
1462  const int _force_lists_copySize = force_lists_copySize; // from __thread variable to stack variable for use within offload pragma
1463  const int _atom_params_copySize = atom_params_copySize; // from __thread variable to stack variable for use within offload pragma
1464  patch_pairs_copySize = 0;
1465  force_lists_copySize = 0;
1466  atom_params_copySize = 0;
1467 
1468  // Calculate the start/size of any data buffers that need to be copied to the device this timestep
1469  // NOTE: Some "remote" computes still need "local patch" data, so copy all patch data before/during "remote."
1470  int toCopySize_atoms = (singleKernelFlag != 0 || isRemote) ? (numAtoms) : (0); // The remote kernel will copy in all atoms
1471  int toCopySize_atom_params = (singleKernelFlag != 0 || isRemote) ? (_atom_params_copySize) : (0); // If there are atom_params to copy, the remote kernel will copy in all atom_params
1472  int toCopySize_patch_pairs = (singleKernelFlag != 0 || isRemote) ? (_patch_pairs_copySize) : (0); // This could potentially be split between kernels (but is relatively rare, so don't worry about it for now)
1473  int toCopySize_force_lists = (singleKernelFlag != 0 || isRemote) ? (_force_lists_copySize) : (0); // This could potentially be split between kernels (but is relatively rare, so don't worry about it for now)
1474  int toCopyStart_forces = (singleKernelFlag != 0) ? (0) : ((isRemote) ? (numLocalAtoms) : (0));
1475  int toCopySize_forces = (singleKernelFlag != 0) ? (numAtoms) : ((isRemote) ? (numAtoms - numLocalAtoms) : (numLocalAtoms));
1476  int toCopyStart_slow_forces = (doSlow) ? (toCopyStart_forces) : (0);
1477  int toCopySize_slow_forces = (doSlow) ? (toCopySize_forces) : (0);
1478 
1479  // If tracing data is to be recorded, setup local variables to the host buffers that will
1480  // receive the data and setup the clauses that will be used to transfer the data
1481  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
1482  double * device_times_start = host__device_times_start;
1483  int toCopySize_device_times_start = ((isRemote) ? (0) : (10));
1484  #if MIC_DEVICE_TRACING_DETAILED != 0
1485  double * device_times_computes = host__device_times_computes;
1486  double * device_times_patches = host__device_times_patches;
1487  int toCopySize_device_times_computes = ((isRemote) ? (0) : (2 * numPatchPairs));
1488  int toCopySize_device_times_patches = ((isRemote) ? (0) : (2 * numForceLists));
1489  #if MIC_SYNC_OUTPUT != 0
1490  #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
1491  in(device_times_computes[0:0] : alloc_if(0) free_if(0)) \
1492  in(device_times_patches[0:0] : alloc_if(0) free_if(0)) \
1493  nocopy(device__device_times_computes) nocopy(device__device_times_patches)
1494  #else
1495  #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
1496  out(device_times_computes[0:toCopySize_device_times_computes] : alloc_if(0) free_if(0)) \
1497  out(device_times_patches[0:toCopySize_device_times_patches] : alloc_if(0) free_if(0)) \
1498  nocopy(device__device_times_computes) nocopy(device__device_times_patches)
1499  #endif
1500  #else
1501  #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
1502  #endif
1503  #if MIC_SYNC_OUTPUT != 0
1504  #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
1505  MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
1506  in(device_times_start[0:0] : alloc_if(0) free_if(0)) \
1507  nocopy(device__device_times_start)
1508  #else
1509  #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
1510  MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
1511  out(device_times_start[0:toCopySize_device_times_start] : alloc_if(0) free_if(0)) \
1512  nocopy(device__device_times_start)
1513  #endif
1514  #else
1515  #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE
1516  #endif
1517 
1518  // Setup local pointers and offload clauses for transfering atom input data
1519  atom * atoms = host__atoms;
1520  atom_param * atom_params = host__atom_params;
1521  double4 * forces = host__forces;
1522  double4 * slow_forces = host__slow_forces;
1523  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1524  #define MIC_DEVICE_ATOMS_CLAUSE
1525  #else
1526  #define MIC_DEVICE_ATOMS_CLAUSE \
1527  in(atoms[0:toCopySize_atoms] : alloc_if(0) free_if(0)) \
1528  in(atom_params[0:toCopySize_atom_params] : alloc_if(0) free_if(0))
1529  #endif
1530 
1531  // If kernel data transfer stats are being collected, calculate and print the amount of data
1532  // being transfered, separated by type : in, out, or inout.
1533  // NOTE: Only include data required for production runs (e.g. not tracing data).
1534  #if MIC_KERNEL_DATA_TRANSFER_STATS != 0
1535  int transfer_in = sizeof(isRemote)
1536  + (3 * sizeof(lata)) // for lata, latb, and latc
1537  + sizeof(size_t) // for device__force_buffers_req_size
1538  + (sizeof(atom) * toCopySize_atoms)
1539  + (sizeof(atom_param) * toCopySize_atom_params)
1540  + (sizeof(patch_pair) * toCopySize_patch_pairs)
1541  + (sizeof(force_list) * toCopySize_force_lists);
1542  int transfer_inout = sizeof(mic_kernel_data);
1543  int transfer_out = sizeof(double4) * (toCopySize_forces + toCopySize_slow_forces)
1544  + sizeof(int); // for device__timestep
1545  #if MIC_DEBUG > 0
1546  MICP("timestep:%06d - transfering %d bytes (isRemote:%d, in:%d, inout:%d, out:%d)\n", \
1547  host__timestep, transfer_in + transfer_inout + transfer_out, \
1548  isRemote, transfer_in, transfer_inout, transfer_out \
1549  ); MICPF;
1550  #else
1551  printf("[MIC-DEBUG] :: PE %04d-%05d-%1d :: transfering %d bytes (in:%d, inout:%d, out:%d)\n",
1552  host__pe, host__timestep, isRemote,
1553  transfer_in + transfer_inout + transfer_out, transfer_in, transfer_inout, transfer_out
1554  );
1555  #endif
1556  #endif
1557 
1558  // Setup input clauses required for this kernel.
1559  // NOTE: If MIC_SYNC_INPUT is being used, use an offload pragma to first transfer the data to
1560  // the device and use nocopy clauses for the actual kernel offload pragmas.
1561  patch_pair * patch_pairs = host__patch_pairs;
1562  force_list * force_lists = host__force_lists;
1563  #if MIC_SYNC_INPUT != 0
1564 
1565  // Push the data to the device
1566  if (singleKernelFlag != 0 || isRemote != 0) {
1567  MICP("<< pushing input data >>\n"); MICPF;
1568  #if MIC_TRACING != 0
1569  double input_start = CmiWallTimer();
1570  #endif
1571  #pragma offload_transfer target(mic:deviceNum) \
1572  in(patch_pairs[0:toCopySize_patch_pairs] : alloc_if(0) free_if(0)) \
1573  in(force_lists[0:toCopySize_force_lists] : alloc_if(0) free_if(0)) \
1574  in(_kernel_data[0:2] : alloc_if(0) free_if(0)) \
1575  MIC_DEVICE_ATOMS_CLAUSE
1576  { }
1577  #if MIC_TRACING != 0
1578  double input_end = CmiWallTimer();
1579  MIC_TRACING_RECORD(MIC_EVENT_SYNC_INPUT_PRAGMA, input_start, input_end);
1580  #endif
1581  }
1582 
 1583  // Setup the clauses for the actual kernel offload pragma (so that data is not transferred)
1584  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1585  #define TMP_SYNC_ATOMS_CLAUSE
1586  #else
1587  #define TMP_SYNC_ATOMS_CLAUSE \
1588  in(atoms[0:0] : alloc_if(0) free_if(0)) \
1589  in(atom_params[0:0] : alloc_if(0) free_if(0))
1590  #endif
1591 
1592  #if MIC_SYNC_OUTPUT != 0
1593  #define TMP_SYNC_KERNEL_DATA_CLAUSE \
1594  in(_kernel_data[0:0] : alloc_if(0) free_if(0))
1595  #else
1596  #define TMP_SYNC_KERNEL_DATA_CLAUSE \
1597  out(_kernel_data[isRemote:1] : alloc_if(0) free_if(0))
1598  #endif
1599 
1600  #define MIC_DEVICE_SYNC_INPUT_CLAUSE \
1601  nocopy(device__lata) nocopy(device__latb) nocopy(device__latc) \
1602  nocopy(device__atoms_size) \
1603  in(patch_pairs[0:0] : alloc_if(0) free_if(0)) nocopy(device__patch_pairs_size) \
1604  in(force_lists[0:0] : alloc_if(0) free_if(0)) nocopy(device__force_lists_size) \
1605  TMP_SYNC_KERNEL_DATA_CLAUSE \
1606  TMP_SYNC_ATOMS_CLAUSE
1607 
1608  #else
1609 
1610  #if MIC_SYNC_OUTPUT != 0
1611  #define TMP_SYNC_KERNEL_DATA_CLAUSE \
1612  in(_kernel_data[isRemote:1] : alloc_if(0) free_if(0))
1613  #else
1614  #define TMP_SYNC_KERNEL_DATA_CLAUSE \
1615  inout(_kernel_data[isRemote:1] : alloc_if(0) free_if(0))
1616  #endif
1617 
1618  // Setup the clauses for the actual kernel offload pragma (so that data is transfered)
1619  #define MIC_DEVICE_SYNC_INPUT_CLAUSE \
1620  nocopy(device__lata) nocopy(device__latb) nocopy(device__latc) \
1621  in(patch_pairs[0:toCopySize_patch_pairs] : alloc_if(0) free_if(0)) \
1622  in(force_lists[0:toCopySize_force_lists] : alloc_if(0) free_if(0)) \
1623  nocopy(device__atoms_size) nocopy(device__patch_pairs_size) nocopy(device__force_lists_size) \
1624  TMP_SYNC_KERNEL_DATA_CLAUSE \
1625  MIC_DEVICE_ATOMS_CLAUSE
1626 
1627  #endif
1628 
1629  MICP("<< issuing %s kernel >>\n", (isRemote) ? ("remote") : ("local")); MICPF;
1630 
1631  #if MIC_SYNC_OUTPUT != 0
1632  #define MIC_DEVICE_SYNC_OUTPUT_CLAUSE \
1633  in(forces[0:0] : alloc_if(0) free_if(0)) \
1634  in(slow_forces[0:0] : alloc_if(0) free_if(0)) \
1635  nocopy(device__timestep)
1636  #else
1637  #define MIC_DEVICE_SYNC_OUTPUT_CLAUSE \
1638  out(forces[toCopyStart_forces:toCopySize_forces] : alloc_if(0) free_if(0)) \
1639  out(slow_forces[toCopyStart_slow_forces:toCopySize_slow_forces] : alloc_if(0) free_if(0)) \
1640  out(device__timestep : into(host__timestep))
1641  #endif
1642 
1643  #if MIC_DATA_STRUCT_VERIFY != 0
1644  #define MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE \
1645  nocopy(device__exclusion_bits_copy) nocopy(device__constants_copy) \
1646  nocopy(device__table_four_copy) nocopy(device__table_four_float_copy) \
1647  nocopy(device__lj_table_copy) nocopy(device__lj_table_float_copy) \
1648  nocopy(device__patch_pairs_copy) nocopy(device__force_lists_copy) \
1649  nocopy(device__atoms_copy) nocopy(device__atom_params_copy) \
1650  nocopy(device__patch_pairs_copy_size) nocopy(device__force_lists_copy_size) \
1651  nocopy(device__atoms_copy_size) nocopy(device__atom_params_copy_size)
1652  #else
1653  #define MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE
1654  #endif
1655 
1656  #if REFINE_PAIRLISTS != 0
1657  #define MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE \
1658  nocopy(device__r2_array) \
1659  nocopy(device__pl_array) \
1660  nocopy(device__pl_size)
1661  #else
1662  #define MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE
1663  #endif
1664 
1665  #if MIC_TRACING != 0
1666  double pragma_start = CmiWallTimer();
1667  #endif
1668 
1669  // Trigger computation on the MIC device (asynchronous offload pragma that does the kernel work)
1670  #pragma offload target(mic:deviceNum) \
1671  in(isRemote) \
1672  MIC_DEVICE_SYNC_INPUT_CLAUSE \
1673  MIC_DEVICE_SYNC_OUTPUT_CLAUSE \
1674  MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
1675  nocopy(device__force_buffers) nocopy(device__slow_force_buffers) \
1676  nocopy(device__force_buffers_alloc_size) nocopy(device__force_buffers_req_size) \
1677  nocopy(device__pairlists) nocopy(device__pairlists_alloc_size) \
1678  nocopy(device__table_four) nocopy(device__table_four_float) nocopy(device__table_four_n_16) \
1679  nocopy(device__lj_table) nocopy(device__lj_table_float) nocopy(device__lj_table_dim) \
1680  nocopy(device__exclusion_bits) \
1681  nocopy(device__constants) \
1682  nocopy(device__atoms) nocopy(device__atom_params) \
1683  nocopy(device__forces) nocopy(device__slow_forces) \
1684  nocopy(device__patch_pairs) nocopy(device__force_lists) \
1685  nocopy(device__numOMPThreads) \
1686  MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE \
1687  MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE \
1688  nocopy(device__pe) nocopy(device__node) \
1689  DEVICE_FPRINTF_CLAUSE
1690  {
1691 // DEVICE_FPRINTF_CLAUSE \
1692 // signal(tag_kernel)
1693  __MUST_BE_MIC;
1694  __FULL_CHECK(__ASSERT(isRemote == 0 || isRemote == 1));
1695  mic_kernel_data * kernel_data = _kernel_data + isRemote;
1696  __FULL_CHECK(__ASSERT(kernel_data != NULL));
1697  __FULL_CHECK(__ASSERT(device__numOMPThreads > 0));
1698 
1699  #if (MIC_DEVICE_FPRINTF != 0) && (MIC_DEVICE_FPRINTF_REOPEN_FREQ > 0)
1700  if ((device__timestep != 0 && device__timestep % MIC_DEVICE_FPRINTF_REOPEN_FREQ == 0) && (kernel_data->isRemote == 0)) {
1701  char filename[128] = { 0 };
1702  sprintf(filename, "/tmp/namd_deviceDebugInfo.%d", device__pe);
1703  if (device__node <= 0) {
1704  printf("[MIC-DEBUG] :: Reopening debug output files on MIC devices (timestep: %d).\n", device__timestep);
1705  fflush(NULL);
1706  }
1707  fclose(device__fout);
1708  device__fout = fopen(filename, "w");
1709  DEVICE_FPRINTF("Device file on PE %d (node: %d) - reopen (timestep: %d)...\n", device__pe, device__node, device__timestep);
1710  }
1711  #endif
1712 
1713  DEVICE_FPRINTF("%d %s ", device__timestep, (kernel_data->isRemote != 0 ? "R" : "L"));
1714 
1715  // DMK - TODO | FIXME - For now, copy values over into device__xxx variables
1716  // so the older version of the code will still work (remove them eventually)
1717  device__patch_pairs = patch_pairs;
1718  device__force_lists = force_lists;
1719  device__atoms = atoms;
1720  device__atom_params = atom_params;
1721  device__forces = forces;
1722  device__slow_forces = slow_forces;
1723  __ASSERT(device__patch_pairs != NULL);
1724  __ASSERT(device__force_lists != NULL);
1725  __ASSERT(device__atoms != NULL);
1726  __ASSERT(device__atom_params != NULL);
1727  __ASSERT(device__forces != NULL);
1728  __ASSERT(device__slow_forces != NULL);
1729  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
1730  device__device_times_start = device_times_start;
1731  __ASSERT(device__device_times_start);
1732  #if MIC_DEVICE_TRACING_DETAILED != 0
1733  device__device_times_computes = device_times_computes;
1734  device__device_times_patches = device_times_patches;
1735  __ASSERT(device__device_times_computes);
1736  __ASSERT(device__device_times_patches);
1737  #endif
1738  #endif
1739  device__lata.x = kernel_data->lata.x;
1740  device__lata.y = kernel_data->lata.y;
1741  device__lata.z = kernel_data->lata.z;
1742  device__latb.x = kernel_data->latb.x;
1743  device__latb.y = kernel_data->latb.y;
1744  device__latb.z = kernel_data->latb.z;
1745  device__latc.x = kernel_data->latc.x;
1746  device__latc.y = kernel_data->latc.y;
1747  device__latc.z = kernel_data->latc.z;
1748  device__atoms_size = kernel_data->numAtoms;
1749  device__patch_pairs_size = kernel_data->numPatchPairs;
1750  device__force_lists_size = kernel_data->numForceLists;
1751  device__force_buffers_req_size = kernel_data->forceBuffersReqSize;
1752 
1753  // TRACING - Record the start time of the kernel
1754  #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
1755  __ASSERT(device__device_times_start != NULL);
1756  device__device_times_start[((isRemote)?(0):(1))] = getCurrentTime();
1757  #endif
1758 
1759  // If data structure verification is being performed, create copies of the buffers that can
1760  // change throughout the simulation at the start of the timestep on the device, and verify
1761  // the lookup tables before any work is performed.
1762  #if MIC_DATA_STRUCT_VERIFY != 0
1763 
1764  _verify_tables(device__pe, device__timestep, isRemote, -1);
1765 
1766  _verify_remalloc<patch_pair>(device__patch_pairs_copy_size, device__patch_pairs_size, device__patch_pairs_copy);
1767  _verify_remalloc<force_list>(device__force_lists_copy_size, device__force_lists_size, device__force_lists_copy);
1768  _verify_remalloc<atom>(device__atoms_copy_size, device__atoms_size, device__atoms_copy);
1769  _verify_remalloc<atom_param>(device__atom_params_copy_size, device__atoms_size, device__atom_params_copy);
1770  __ASSERT(device__patch_pairs_copy != NULL && device__force_lists_copy != NULL);
1771  __ASSERT(device__atoms_copy != NULL && device__atom_params_copy != NULL);
1772 
1773  _verify_parallel_memcpy(device__patch_pairs_copy, device__patch_pairs, sizeof(patch_pair) * device__patch_pairs_size);
1774  _verify_parallel_memcpy(device__force_lists_copy, device__force_lists, sizeof(force_list) * device__force_lists_size);
1775  _verify_parallel_memcpy(device__atoms_copy, device__atoms, sizeof(atom) * device__atoms_size);
1776  _verify_parallel_memcpy(device__atom_params_copy, device__atom_params, sizeof(atom_param) * device__atoms_size);
1777  #endif
1778 
1779  // Make sure there is enough memory allocated for the force buffers (reallocate if not)
1780  if (device__force_buffers_req_size > device__force_buffers_alloc_size) {
1781  if (device__force_buffers != NULL) { _MM_FREE_WRAPPER(device__force_buffers); }
1782  if (device__slow_force_buffers != NULL) { _MM_FREE_WRAPPER(device__slow_force_buffers); }
1783  device__force_buffers_alloc_size = device__force_buffers_req_size;
1784  size_t reqSize = sizeof(double) * 4 * device__force_buffers_alloc_size;
1785  device__force_buffers = (double4*)(_MM_MALLOC_WRAPPER(reqSize, 64, "device__force_buffers"));
1786  device__slow_force_buffers = (double4*)(_MM_MALLOC_WRAPPER(reqSize, 64, "device__slow_force_buffers"));
1787  __ASSERT(device__force_buffers != NULL && device__slow_force_buffers != NULL);
1788  }
1789 
1790  // Declare and initialize the overall virial summation/accumulation variables that will
1791  // eventually be passed back to the host
1792  double virial_xx = 0.0;
1793  double virial_xy = 0.0;
1794  double virial_xz = 0.0;
1795  double virial_yy = 0.0;
1796  double virial_yz = 0.0;
1797  double virial_zz = 0.0;
1798  double fullElectVirial_xx = 0.0;
1799  double fullElectVirial_xy = 0.0;
1800  double fullElectVirial_xz = 0.0;
1801  double fullElectVirial_yy = 0.0;
1802  double fullElectVirial_yz = 0.0;
1803  double fullElectVirial_zz = 0.0;
1804  double vdwEnergy = 0.0;
1805  double electEnergy = 0.0;
1806  double fullElectEnergy = 0.0;
1807  #if MIC_EXCL_CHECKSUM_FULL != 0
1808  int exclusionSum = 0;
1809  #endif
1810 
1811  // Calculate the lower and upper bounds for the patch pairs that will be computed within this kernel
1812  const int ppI_start = (isRemote) ? (kernel_data->numLocalComputes) : (0);
1813  const int ppI_end = (device__constants->singleKernelFlag != 0 || isRemote) ? (device__patch_pairs_size) : (kernel_data->numLocalComputes);
1814  __FULL_CHECK(__ASSERT(ppI_start >= 0 && ppI_start <= device__patch_pairs_size));
1815  __FULL_CHECK(__ASSERT(ppI_end >= 0 && ppI_end <= device__patch_pairs_size));
1816  __FULL_CHECK(__ASSERT(ppI_start <= ppI_end));
1817 
1818  // TRACING - Record the start time for the patch pairs (computes) parallel region
1819  #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
1820  device__device_times_start[((isRemote)?(2):(3))] = getCurrentTime();
1821  #endif
1822 
1823  DEVICE_FPRINTF(" C(%d,%d)", ppI_start, ppI_end);
1824 
1825  // Process each patch pair (compute)
1826  #pragma novector
1827  #if MULTIPLE_THREADS != 0
1828  #if MIC_EXCL_CHECKSUM_FULL != 0
1829  #define EXCL_CHECKSUM_CLAUSE reduction(+ : exclusionSum)
1830  #else
1831  #define EXCL_CHECKSUM_CLAUSE
1832  #endif
1833  #pragma omp parallel for schedule(dynamic, 1) \
1834  reduction(+ : vdwEnergy, electEnergy, fullElectEnergy) \
1835  reduction(+ : virial_xx, virial_xy, virial_xz, virial_yy, virial_yz, virial_zz) \
1836  reduction(+ : fullElectVirial_xx, fullElectVirial_xy, fullElectVirial_xz, fullElectVirial_yy, fullElectVirial_yz, fullElectVirial_zz ) \
1837  EXCL_CHECKSUM_CLAUSE
1838  #undef EXCL_CHECKSUM_CLAUSE
1839  #endif
1840  for (int ppI = ppI_start; ppI < ppI_end; ppI++) {
1841 
1842  DEVICE_FPRINTF(" C");
1843 
1844  // TRACING - Record the start of each patch pair (compute)
1845  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
1846  __FULL_CHECK(__ASSERT(device__device_times_computes != NULL));
1847  device__device_times_computes[ppI * 2] = getCurrentTime();
1848  #endif
1849 
1850  // Create and populate the mic_params data structure (passed into the kernel function)
1851  mic_params params;
1852 
1853  // Initialize the pointer to the patch_pairs structure for this compute
1854  params.pp = (patch_pair*)(device__patch_pairs + ppI);
1855  __FULL_CHECK(__ASSERT(params.pp != NULL));
1856 
1857  #if MIC_SORT_LISTS != 0
1858  const int abSwapFlag = ((params.pp->patch1_size < params.pp->patch2_size) ? (1) : (0));
1859  #define ABSWAP(t, f) ((abSwapFlag)?(t):(f))
1860  #else
1861  #define ABSWAP(t, f) (f)
1862  #endif
1863 
1864  // DMK - DEBUG - Set a few values used for debugging
1865  params.ppI = ppI;
1866  params.p1 = ABSWAP(params.pp->p2, params.pp->p1);
1867  params.p2 = ABSWAP(params.pp->p1, params.pp->p2);
1868  params.pe = device__pe;
1869  params.timestep = device__timestep;
1870 
1871  // If pairlist refinement is enabled, initialize the related scratch buffer pointers
1872  // In some cases, buffers used by the compute kernel functions (calc_self, etc.) only
1873  // require scratch buffers (i.e. not used for input or output). As such, we can
1874  // allocate these buffers on a one-per-thread basis rather than a one-per-compute
1875  // basis to save some memory (and get memory location reuse for any given thread).
1876  // These fields relate to the pointer and sizes for those buffers.
1877  #if REFINE_PAIRLISTS != 0
1878  const int threadIndex = omp_get_thread_num();
1879  params.plArray_ptr = device__pl_array + threadIndex;
1880  params.plSize_ptr = device__pl_size + threadIndex;
1881  params.r2Array_ptr = device__r2_array + threadIndex;
1882  #endif
1883 
1884  // Initialize pointers to the various lookup tables, constants, etc. that are used
1885  // from within the compute functions
1886  #if MIC_HANDCODE_FORCE_SINGLE != 0
1887  params.table_four_base_ptr = (void*)device__table_four_float;
1888  params.lj_table_base_ptr = (void*)device__lj_table_float;
1889  #else
1890  params.table_four_base_ptr = (void*)device__table_four;
1891  params.lj_table_base_ptr = (void*)device__lj_table;
1892  #endif
1893  params.table_four_n_16 = device__table_four_n_16;
1894  params.lj_table_dim = device__lj_table_dim;
1895  params.exclusion_bits = device__exclusion_bits;
1896  params.constants = device__constants;
1897  __FULL_CHECK(__ASSERT(params.table_four_base_ptr != NULL));
1898  __FULL_CHECK(__ASSERT(params.lj_table_base_ptr != NULL));
1899  __FULL_CHECK(__ASSERT(params.table_four_n_16 % 16 == 0));
1900  __FULL_CHECK(__ASSERT(params.exclusion_bits != NULL));
1901  __FULL_CHECK(__ASSERT(params.constants != NULL));
1902 
1903  // Setup pairlist flags and pairlist buffer (an array of pointers, one for each pairlist,
1904  // for each compute)
1905  params.usePairlists = kernel_data->usePairlists;
1906  params.savePairlists = kernel_data->savePairlists;
1907  params.pairlists_ptr = &(((int**)device__pairlists)[NUM_PAIRLIST_TYPES * ppI]);
1908  __FULL_CHECK(__ASSERT(device__pairlists != NULL));
1909  __FULL_CHECK(__ASSERT(params.pairlists_ptr != NULL));
1910 
1911  // Setup the sizes of the atom lists and force lists
1912  int n0 = ABSWAP(params.pp->patch2_size, params.pp->patch1_size);
1913  int n1 = ABSWAP(params.pp->patch1_size, params.pp->patch2_size);
1914  int n0_16 = (n0 + 15) & (~15); // Round up to nearest multiple of 16
1915  int n1_16 = (n1 + 15) & (~15);
1916  params.numAtoms[0] = n0;
1917  params.numAtoms[1] = n1;
1918  params.numAtoms_16[0] = n0_16;
1919  params.numAtoms_16[1] = n1_16;
1920  __FULL_CHECK(__ASSERT(params.numAtoms[0] >= 0));
1921  __FULL_CHECK(__ASSERT(params.numAtoms[1] >= 0));
1922 
1923  // Setup the pointers to the input particle data for this compute
1924  // NOTE: All of the particle data is sent in two chunks (atoms and atom_params, where
1925  // atoms changes every timestep but atom_params only changes periodically). Each
1926  // of these buffers contains several sub-arrays (one for each field) in an
1927  // structure-of-arrays (SOA) format. This code, along with the force code
1928  // below it, is simply creating the array pointers for each "field's array."
1929  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1930  mic_position_t* pBase0 = (mic_position_t*)(params.pp->patch1_atomDataPtr);
1931  mic_position_t* pBase1 = (mic_position_t*)(params.pp->patch2_atomDataPtr);
1932  int *pExtBase0 = (int*)(pBase0 + n0_16);
1933  int *pExtBase1 = (int*)(pBase1 + n1_16);
1934  #else
1935  __FULL_CHECK(__ASSERT(device__atoms != NULL));
1936  __FULL_CHECK(__ASSERT(device__atom_params != NULL));
1937 
1938  __FULL_CHECK(__ASSERT(params.pp->patch1_atom_start >= 0));
1939  __FULL_CHECK(__ASSERT((params.pp->patch1_atom_start < device__atoms_size) || (params.pp->patch1_atom_start == device__atoms_size && n0 == 0)));
1940  __FULL_CHECK(__ASSERT(params.pp->patch2_atom_start >= 0));
1941  __FULL_CHECK(__ASSERT((params.pp->patch2_atom_start < device__atoms_size) || (params.pp->patch2_atom_start == device__atoms_size && n1 == 0)));
1942 
1943  mic_position_t* pBase0 = (mic_position_t*)(device__atoms + ABSWAP(params.pp->patch2_atom_start, params.pp->patch1_atom_start));
1944  mic_position_t* pBase1 = (mic_position_t*)(device__atoms + ABSWAP(params.pp->patch1_atom_start, params.pp->patch2_atom_start));
1945  int* pExtBase0 = (int*)(device__atom_params + ABSWAP(params.pp->patch2_atom_start, params.pp->patch1_atom_start));
1946  int* pExtBase1 = (int*)(device__atom_params + ABSWAP(params.pp->patch1_atom_start, params.pp->patch2_atom_start));
1947  #endif
1948  __FULL_CHECK(__ASSERT(pBase0 != NULL));
1949  __FULL_CHECK(__ASSERT(pBase1 != NULL));
1950  __FULL_CHECK(__ASSERT(pExtBase0 != NULL));
1951  __FULL_CHECK(__ASSERT(pExtBase1 != NULL));
1952  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1953  params.p[0] = (atom*)pBase0;
1954  params.p[1] = (atom*)pBase1;
1955  params.pExt[0] = (atom_param*)pExtBase0;
1956  params.pExt[1] = (atom_param*)pExtBase1;
1957  #else
1958  params.p_x[0] = pBase0 + (0 * n0_16); params.p_x[1] = pBase1 + (0 * n1_16);
1959  params.p_y[0] = pBase0 + (1 * n0_16); params.p_y[1] = pBase1 + (1 * n1_16);
1960  params.p_z[0] = pBase0 + (2 * n0_16); params.p_z[1] = pBase1 + (2 * n1_16);
1961  params.p_q[0] = pBase0 + (3 * n0_16); params.p_q[1] = pBase1 + (3 * n1_16);
1962  params.pExt_vdwType[0] = pExtBase0 + (0 * n0_16);
1963  params.pExt_index[0] = pExtBase0 + (1 * n0_16);
1964  params.pExt_exclIndex[0] = pExtBase0 + (2 * n0_16);
1965  params.pExt_exclMaxDiff[0] = pExtBase0 + (3 * n0_16);
1966  params.pExt_vdwType[1] = pExtBase1 + (0 * n1_16);
1967  params.pExt_index[1] = pExtBase1 + (1 * n1_16);
1968  params.pExt_exclIndex[1] = pExtBase1 + (2 * n1_16);
1969  params.pExt_exclMaxDiff[1] = pExtBase1 + (3 * n1_16);
1970  #endif
1971 
1972  // Setup the pointers to the output force data for this compute
1973  // NOTE: Forces are output every timestep, but slow forces (fullf) are output only
1974  // during some timesteps.
1975  __FULL_CHECK(__ASSERT(device__force_lists != NULL));
1976  __FULL_CHECK(__ASSERT(device__force_buffers != NULL));
1977  __FULL_CHECK(__ASSERT(device__slow_force_buffers != NULL));
1978  __FULL_CHECK(__ASSERT(params.pp->patch1_force_list_index < device__force_lists_size));
1979  __FULL_CHECK(__ASSERT(params.pp->patch2_force_list_index < device__force_lists_size));
1980  __FULL_CHECK(__ASSERT(params.pp->patch1_force_start < device__force_buffers_req_size));
1981  __FULL_CHECK(__ASSERT(params.pp->patch2_force_start < device__force_buffers_req_size));
1982  params.doSlow = kernel_data->doSlow; // Flag indicating if slow forces should be calculate this timestep or not
1983  params.fl[0] = (force_list*)(device__force_lists + ABSWAP(params.pp->patch2_force_list_index, params.pp->patch1_force_list_index));
1984  params.fl[1] = (force_list*)(device__force_lists + ABSWAP(params.pp->patch1_force_list_index, params.pp->patch2_force_list_index));
1985  double *ffBase0 = (double*)(device__force_buffers + ABSWAP(params.pp->patch2_force_start, params.pp->patch1_force_start));
1986  double *ffBase1 = (double*)(device__force_buffers + ABSWAP(params.pp->patch1_force_start, params.pp->patch2_force_start));
1987  double *fullfBase0 = (double*)(device__slow_force_buffers + ABSWAP(params.pp->patch2_force_start, params.pp->patch1_force_start));
1988  double *fullfBase1 = (double*)(device__slow_force_buffers + ABSWAP(params.pp->patch1_force_start, params.pp->patch2_force_start));
1989  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1990  params.ff[0] = (double4*)ffBase0;
1991  params.ff[1] = (double4*)ffBase1;
1992  params.fullf[0] = (double4*)fullfBase0;
1993  params.fullf[1] = (double4*)fullfBase1;
1994  __ASSERT(params.ff[0] != NULL);
1995  __ASSERT(params.ff[1] != NULL);
1996  __ASSERT(params.fullf[0] != NULL);
1997  __ASSERT(params.fullf[1] != NULL);
1998  #else
1999  params.ff_x[0] = ffBase0 + (0 * n0_16); params.ff_x[1] = ffBase1 + (0 * n1_16);
2000  params.ff_y[0] = ffBase0 + (1 * n0_16); params.ff_y[1] = ffBase1 + (1 * n1_16);
2001  params.ff_z[0] = ffBase0 + (2 * n0_16); params.ff_z[1] = ffBase1 + (2 * n1_16);
2002  params.ff_w[0] = ffBase0 + (3 * n0_16); params.ff_w[1] = ffBase1 + (3 * n1_16);
2003  params.fullf_x[0] = fullfBase0 + (0 * n0_16); params.fullf_x[1] = fullfBase1 + (0 * n1_16);
2004  params.fullf_y[0] = fullfBase0 + (1 * n0_16); params.fullf_y[1] = fullfBase1 + (1 * n1_16);
2005  params.fullf_z[0] = fullfBase0 + (2 * n0_16); params.fullf_z[1] = fullfBase1 + (2 * n1_16);
2006  params.fullf_w[0] = fullfBase0 + (3 * n0_16); params.fullf_w[1] = fullfBase1 + (3 * n1_16);
2007  __ASSERT(params.ff_x[0] != NULL); __ASSERT(params.ff_x[1] != NULL);
2008  __ASSERT(params.ff_y[0] != NULL); __ASSERT(params.ff_y[1] != NULL);
2009  __ASSERT(params.ff_z[0] != NULL); __ASSERT(params.ff_z[1] != NULL);
2010  __ASSERT(params.ff_w[0] != NULL); __ASSERT(params.ff_w[1] != NULL);
2011  __ASSERT(params.fullf_x[0] != NULL); __ASSERT(params.fullf_x[1] != NULL);
2012  __ASSERT(params.fullf_y[0] != NULL); __ASSERT(params.fullf_y[1] != NULL);
2013  __ASSERT(params.fullf_z[0] != NULL); __ASSERT(params.fullf_z[1] != NULL);
2014  __ASSERT(params.fullf_w[0] != NULL); __ASSERT(params.fullf_w[1] != NULL);
2015  #endif
2016 
2017  // Create the offsets for the first list of particles.
2018  // NOTE: In this version of the nonbonded code, the positions of the atoms are stored
2019  // as offsets within the patch in which they are located. These offsets represent
2020  // the offsets between the two patches being interacted (including periodic boundaries).
2021  params.offset.x = params.pp->offset.x * device__lata.x
2022  + params.pp->offset.y * device__latb.x
2023  + params.pp->offset.z * device__latc.x;
2024  params.offset.y = params.pp->offset.x * device__lata.y
2025  + params.pp->offset.y * device__latb.y
2026  + params.pp->offset.z * device__latc.y;
2027  params.offset.z = params.pp->offset.x * device__lata.z
2028  + params.pp->offset.y * device__latb.z
2029  + params.pp->offset.z * device__latc.z;
2030  #if MIC_SORT_LISTS != 0
2031  params.offset.x *= ABSWAP(-1.0f, 1.0f);
2032  params.offset.y *= ABSWAP(-1.0f, 1.0f);
2033  params.offset.z *= ABSWAP(-1.0f, 1.0f);
2034  #endif
2035 
2036  // DMK - DEBUG - Record the center point for each of the input patches
2037  float patch1_center_x = params.pp->patch1_center_x * device__lata.x
2038  + params.pp->patch1_center_y * device__latb.x
2039  + params.pp->patch1_center_z * device__latc.x;
2040  float patch1_center_y = params.pp->patch1_center_x * device__lata.y
2041  + params.pp->patch1_center_y * device__latb.y
2042  + params.pp->patch1_center_z * device__latc.y;
2043  float patch1_center_z = params.pp->patch1_center_x * device__lata.z
2044  + params.pp->patch1_center_y * device__latb.z
2045  + params.pp->patch1_center_z * device__latc.z;
2046  float patch2_center_x = params.pp->patch2_center_x * device__lata.x
2047  + params.pp->patch2_center_y * device__latb.x
2048  + params.pp->patch2_center_z * device__latc.x;
2049  float patch2_center_y = params.pp->patch2_center_x * device__lata.y
2050  + params.pp->patch2_center_y * device__latb.y
2051  + params.pp->patch2_center_z * device__latc.y;
2052  float patch2_center_z = params.pp->patch2_center_x * device__lata.z
2053  + params.pp->patch2_center_y * device__latb.z
2054  + params.pp->patch2_center_z * device__latc.z;
2055  params.patch1_center_x = ABSWAP(patch2_center_x, patch1_center_x);
2056  params.patch2_center_x = ABSWAP(patch1_center_x, patch2_center_x);
2057  params.patch1_center_y = ABSWAP(patch2_center_y, patch1_center_y);
2058  params.patch2_center_y = ABSWAP(patch1_center_y, patch2_center_y);
2059  params.patch1_center_z = ABSWAP(patch2_center_z, patch1_center_z);
2060  params.patch2_center_z = ABSWAP(patch1_center_z, patch2_center_z);
2061 
2062  // Initialize the virial accumulators for this compute (zero them out)
2063  params.virial_xx = 0;
2064  params.virial_xy = 0;
2065  params.virial_xz = 0;
2066  params.virial_yy = 0;
2067  params.virial_yz = 0;
2068  params.virial_zz = 0;
2069  params.fullElectVirial_xx = 0;
2070  params.fullElectVirial_xy = 0;
2071  params.fullElectVirial_xz = 0;
2072  params.fullElectVirial_yy = 0;
2073  params.fullElectVirial_yz = 0;
2074  params.fullElectVirial_zz = 0;
2075  params.vdwEnergy = 0.0;
2076  params.electEnergy = 0.0;
2077  params.fullElectEnergy = 0.0;
2078  #if MIC_EXCL_CHECKSUM_FULL != 0
2079  params.exclusionSum = 0;
2080  #endif
2081 
2082  // Select the version of the kernel to call based on the timestep's requirements
2083  // and what type of compute this compute is
2084  int isSelf = (params.pp->patch1_force_start == params.pp->patch2_force_start);
2085  // NOTE: Many ways to check this (arbitrary test used here)
2086  int selfBit = ((isSelf) ? (0x01) : (0x00));
2087  int doSlowBit = ((kernel_data->doSlow) ? (0x02) : (0x00));
2088  int doEnergyBit = ((kernel_data->doEnergy) ? (0x04) : (0x00));
2089  int kernelSelect = selfBit | doSlowBit | doEnergyBit;
2090  DEVICE_FPRINTF("%d", kernelSelect);
2091  switch (kernelSelect) {
2092  case 0x00: calc_pair(params); break;
2093  case 0x01: calc_self(params); break;
2094  case 0x02: calc_pair_fullelect(params); break;
2095  case 0x03: calc_self_fullelect(params); break;
2096  case 0x04: calc_pair_energy(params); break;
2097  case 0x05: calc_self_energy(params); break;
2098  case 0x06: calc_pair_energy_fullelect(params); break;
2099  case 0x07: calc_self_energy_fullelect(params); break;
2100  default:
2101  mic_dev_die("!!! INVALID KERNEL SELECTION ON MIC DEVICE !!!\n");
2102  break;
2103  } // end switch (kernelSelect)
2104 
2105  // Contribute this compute's virial summations into the overall virial summation
2106  // that will be passed back to the host
2107  #if MIC_SORT_LISTS != 0
2108  if (abSwapFlag) {
2109  virial_xx -= params.virial_xx;
2110  virial_xy -= params.virial_xy;
2111  virial_xz -= params.virial_xz;
2112  virial_yy -= params.virial_yy;
2113  virial_yz -= params.virial_yz;
2114  virial_zz -= params.virial_zz;
2115  fullElectVirial_xx -= params.fullElectVirial_xx;
2116  fullElectVirial_xy -= params.fullElectVirial_xy;
2117  fullElectVirial_xz -= params.fullElectVirial_xz;
2118  fullElectVirial_yy -= params.fullElectVirial_yy;
2119  fullElectVirial_yz -= params.fullElectVirial_yz;
2120  fullElectVirial_zz -= params.fullElectVirial_zz;
2121  vdwEnergy -= params.vdwEnergy;
2122  electEnergy -= params.electEnergy;
2123  fullElectEnergy -= params.fullElectEnergy;
2124  } else {
2125  #endif
2126  virial_xx += params.virial_xx;
2127  virial_xy += params.virial_xy;
2128  virial_xz += params.virial_xz;
2129  virial_yy += params.virial_yy;
2130  virial_yz += params.virial_yz;
2131  virial_zz += params.virial_zz;
2132  fullElectVirial_xx += params.fullElectVirial_xx;
2133  fullElectVirial_xy += params.fullElectVirial_xy;
2134  fullElectVirial_xz += params.fullElectVirial_xz;
2135  fullElectVirial_yy += params.fullElectVirial_yy;
2136  fullElectVirial_yz += params.fullElectVirial_yz;
2137  fullElectVirial_zz += params.fullElectVirial_zz;
2138  vdwEnergy += params.vdwEnergy;
2139  electEnergy += params.electEnergy;
2140  fullElectEnergy += params.fullElectEnergy;
2141  #if MIC_SORT_LISTS != 0
2142  }
2143  #endif
2144 
2145  #if MIC_EXCL_CHECKSUM_FULL != 0
2146  exclusionSum += params.exclusionSum;
2147  #endif
2148 
2149  #undef ABSWAP
2150 
2151  // TRACING - Record the end time for the compute
2152  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
2153  device__device_times_computes[ppI * 2 + 1] = getCurrentTime();
2154  #endif
2155 
2156  } // end parallel for (ppI < device__patch_pairs_size)
2157 
2158  // TRACING - Record the end time for all the computes
2159  #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
2160  device__device_times_start[((isRemote)?(4):(5))] = getCurrentTime();
2161  #endif
2162 
2163  // Store the reduced virial values into the kernel_data structure to be passed
2164  // back to the host core
2165  kernel_data->virial_xx = virial_xx;
2166  kernel_data->virial_xy = virial_xy;
2167  kernel_data->virial_xz = virial_xz;
2168  kernel_data->virial_yy = virial_yy;
2169  kernel_data->virial_yz = virial_yz;
2170  kernel_data->virial_zz = virial_zz;
2171  kernel_data->fullElectVirial_xx = fullElectVirial_xx;
2172  kernel_data->fullElectVirial_xy = fullElectVirial_xy;
2173  kernel_data->fullElectVirial_xz = fullElectVirial_xz;
2174  kernel_data->fullElectVirial_yy = fullElectVirial_yy;
2175  kernel_data->fullElectVirial_yz = fullElectVirial_yz;
2176  kernel_data->fullElectVirial_zz = fullElectVirial_zz;
2177  kernel_data->vdwEnergy = vdwEnergy;
2178  kernel_data->electEnergy = electEnergy;
2179  kernel_data->fullElectEnergy = fullElectEnergy;
2180  #if MIC_EXCL_CHECKSUM_FULL != 0
2181  kernel_data->exclusionSum = exclusionSum;
2182  #endif
2183 
2184  // Calculate the upper and lower bounds for the force lists to be processed by this kernel
2185  int flI_start = (isRemote) ? (kernel_data->numLocalPatches) : (0);
2186  int flI_end = (device__constants->singleKernelFlag != 0 || isRemote) ? (device__force_lists_size) : (kernel_data->numLocalPatches);
2187  __FULL_CHECK(__ASSERT(flI_start >= 0 && flI_start <= device__force_lists_size));
2188  __FULL_CHECK(__ASSERT(flI_end >= 0 && flI_start <= device__force_lists_size));
2189  __FULL_CHECK(__ASSERT(flI_start <= flI_end));
2190 
2191  // Because there are fewer patches than computes, create more parallelism by sub-dividing the
2192  // the force lists (patches), causing each patch to be processed by multiple threads
2193  const int numThreads = device__numOMPThreads; //omp_get_max_threads();
2194  const int numForceLoopIters = flI_end - flI_start;
2195  int numForceLoopSplits = 1;
2196  if ((numForceLoopIters > 0) && ((2 * numThreads) > numForceLoopIters)) {
2197  numForceLoopSplits = (2 * numThreads) / numForceLoopIters; // NOTE: 2 * numThreads to break it up more (smaller chunks of work)
2198  if (numForceLoopSplits < 2) { numForceLoopSplits = 2; } // At least split in half
2199  if (numForceLoopSplits > 16) { numForceLoopSplits = 16; } // Don't split any single patch too fine (threading overhead costs)
2200  }
2201  DEVICE_FPRINTF(" F(%d,%d:%d)", flI_start, flI_end, numForceLoopSplits);
2202  flI_start *= numForceLoopSplits;
2203  flI_end *= numForceLoopSplits;
2204 
2205  // TRACING - Record the start time for the processing of all force lists (patches)
2206  #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
2207  device__device_times_start[((isRemote)?(6):(7))] = getCurrentTime();
2208  #endif
2209 
2210  // Process each of the force lists (patches)
2211  #pragma novector
2212  #if MULTIPLE_THREADS != 0
2213  #pragma omp parallel for schedule(dynamic)
2214  #endif
2215  for (int _flI = flI_start; _flI < flI_end; _flI++) {
2216 
2217  // From _flI, calculate the flI (patch index) and flIPart (thread per patch) values
2218  int flI = _flI / numForceLoopSplits;
2219  __FULL_CHECK(__ASSERT(flI >= 0 && flI < device__force_lists_size));
2220  int flIPart = _flI % numForceLoopSplits;
2221 
2222  // TRACING - Record the start of each force reduction
2223  // DMK - FIXME | NOTE : THIS ONLY RECORDS ONE TASK/ITERATION OF EACH SPLIT SET, SO
2224  // PROJECTIONS WILL ONLY SHOW SOME OF THE FORCE "TASKS" !!! (the problem is
2225  // numForceLoopSplits can differ between the "remote" and "local" kernels).
2226  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
2227  // DMK - NOTE : Don't select the same part each time (e.g. only flIPart == 0)
2228  if (flIPart == flI % numForceLoopSplits) {
2229  device__device_times_patches[flI * 2] = getCurrentTime();
2230  }
2231  #endif
2232 
2233  // Get the output force list pointer and force array length
2234  const force_list &fl = device__force_lists[flI];
2235  const int f_len = fl.patch_stride * 4; // NOTE : number of individual doubles
2236  __ASSUME(f_len % 16 == 0);
2237 
2238  // Calculate this "task's" portion of the patch object's work (flIPartILo -> flIPartIHi)
2239  int flIPartJLo = (int)(((float)(f_len)) * (((float)(flIPart )) / ((float)(numForceLoopSplits))));
2240  int flIPartJHi = (int)(((float)(f_len)) * (((float)(flIPart + 1)) / ((float)(numForceLoopSplits))));
2241  // NOTE: Force flIPartJLo and flIPartJHi to cacheline boundaries to avoid false sharing
2242  flIPartJLo = (flIPartJLo + 7) & (~7);
2243  flIPartJHi = (flIPartJHi + 7) & (~7);
2244  if (flIPartJHi > f_len) { flIPartJHi = f_len; }
2245  __ASSUME(flIPartJLo % 8 == 0);
2246  __ASSUME(flIPartJHi % 8 == 0); // NOTE: true after capping at f_len since f_len % 16 == 0
2247 
2248  // Sum the force output for each compute contributing to this force list (patch)
2249  {
2250  // Setup the pointer to the final force array that will be passed back up to the host
2251  __FULL_CHECK(__ASSERT(device__forces != NULL));
2252  __FULL_CHECK(__ASSERT(fl.force_output_start >= 0));
2253  __FULL_CHECK(__ASSERT((fl.force_output_start < device__atoms_size) || (fl.force_output_start == device__atoms_size && f_len == 0)));
2254  double * RESTRICT fDst = (double*)(device__forces + fl.force_output_start);
2255  __FULL_CHECK(__ASSERT(fDst != NULL));
2256  __ASSUME_ALIGNED(fDst);
2257 
2258  // Setup the pointer to the list of arrays, each with output from one of
2259  // the compute objects that contributed to this patch's force data
2260  __FULL_CHECK(__ASSERT(device__force_buffers != NULL));
2261  __FULL_CHECK(__ASSERT(fl.force_list_start >= 0));
2262  __FULL_CHECK(__ASSERT((fl.force_list_start < device__force_buffers_req_size) || (fl.force_list_start == device__force_buffers_req_size && f_len == 0)));
2263  double *fSrcBase = (double*)(device__force_buffers + fl.force_list_start);
2264  __FULL_CHECK(__ASSERT(fSrcBase != NULL));
2265  __ASSUME_ALIGNED(fSrcBase);
2266 
2267  // Accumulate the forces from the various computes contributing to this patch
2268  memset(fDst + flIPartJLo, 0, sizeof(double) * (flIPartJHi - flIPartJLo));
2269  for (int i = 0; i < fl.force_list_size; i++) {
2270  #pragma omp simd
2271  for (int j = flIPartJLo; j < flIPartJHi; j++) {
2272  fDst[j] += fSrcBase[i * f_len + j];
2273  }
2274  }
2275  }
2276 
2277  // Sum the output for each contributing compute
2278  if (kernel_data->doSlow) {
2279 
2280  // Setup the pointer to the final force array that will be passed back up to the host
2281  __FULL_CHECK(__ASSERT(device__slow_forces != NULL));
2282  double * RESTRICT fDst = (double*)(device__slow_forces + fl.force_output_start);
2283  __FULL_CHECK(__ASSERT(fDst != NULL));
2284  __ASSUME_ALIGNED(fDst);
2285 
2286  // Setup the pointer to the list of arrays, each with output from one of
2287  // the compute objects that contributed to this patch's force data
2288  __FULL_CHECK(__ASSERT(device__slow_force_buffers != NULL));
2289  double *fSrcBase = (double*)(device__slow_force_buffers + fl.force_list_start);
2290  __FULL_CHECK(__ASSERT(fSrcBase != NULL));
2291  __ASSUME_ALIGNED(fSrcBase);
2292 
2293  // Accumulate the forces from the various computes contributing to this patch
2294  memset(fDst + flIPartJLo, 0, sizeof(double) * (flIPartJHi - flIPartJLo));
2295  for (int i = 0; i < fl.force_list_size; i++) {
2296  #pragma omp simd
2297  for (int j = flIPartJLo; j < flIPartJHi; j++) {
2298  fDst[j] += fSrcBase[i * f_len + j];
2299  }
2300  }
2301  }
2302 
2303  // TRACING - Record the end time for each force list (patch)
2304  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
2305  device__device_times_patches[flI * 2 + 1] = getCurrentTime();
2306  #endif
2307 
2308  } // end parallel for (flI < device__force_lists_size)
2309 
2310  // TRACING - Record the end time for all the force lists (patches)
2311  #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
2312  device__device_times_start[((isRemote)?(8):(9))] = getCurrentTime();
2313  #endif
2314 
2315  // DMK - DEBUG
2316  #if MIC_DATA_STRUCT_VERIFY != 0
2317  _verify_data_structures(device__pe, device__timestep, isRemote, 2, ppI_start, ppI_end, kernel_data->doSlow);
2318  _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (char*)device__patch_pairs, (char*)device__patch_pairs_copy, sizeof(patch_pairs) * device__patch_pairs_size, "device__patch_pairs_copy");
2319  _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (char*)device__force_lists, (char*)device__force_lists_copy, sizeof(force_lists) * device__force_lists_size, "device__force_lists_copy");
2320  _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (char*)device__atoms, (char*)device__atoms_copy, sizeof(atom) * device__atoms_size, "device__atoms_copy");
2321  _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (char*)device__atom_params, (char*)device__atom_params_copy, sizeof(atom_param) * device__atoms_size, "device__atom_params_copy");
2322  #endif
2323 
2324  // DMK - DEBUG
2325  #if MIC_TRACK_DEVICE_MEM_USAGE != 0
2326  if (device__timestep % 100 == 0 && isRemote == 0) {
2327 
2328  uint64_t plTotalMem = sizeof(void*) * device__pairlists_alloc_size;
2329  uint64_t plUsedMem = plTotalMem;
2330  uint64_t plPeakUsedMem = plTotalMem;
2331  if (device__pairlists != NULL) {
2332  for (int k = 0; k < device__pairlists_alloc_size; k++) {
2333  int * plPtr = ((int**)device__pairlists)[k];
2334  if (plPtr != NULL) {
2335  plTotalMem += plPtr[0] * sizeof(int) + 64;
2336  plUsedMem += plPtr[1] * sizeof(int);
2337  plPeakUsedMem += plPtr[-1] * sizeof(int);
2338  }
2339  }
2340  }
2341 
2342  uint64_t fbsTotalMem = device__force_buffers_alloc_size * 4 * sizeof(double);
2343  MemInfo mi;
2344  readMemInfo(&mi);
2345 
2346  printf("[MIC-MEMINFO] :: PE %d :: MIC mem usage (MB) -- ts: %d - "
2347  "f: %.3lf, t: %.3lf, c: %.3lf, a: %.3lf, i: %.3lf - "
2348  "vS: %.3lf, vP: %.3lf - "
2349  "plTM: %.3lf, plUM: %.3lf, plUPM: %.3lf, fbsTM(x2): %.3lf\n",
2350  ((int)(device__pe)),
2351  ((int)(device__timestep)),
2352  ((double)(mi.memFree)) * 1.0e-6,
2353  ((double)(mi.memTotal)) * 1.0e-6,
2354  ((double)(mi.cached)) * 1.0e-6,
2355  ((double)(mi.active)) * 1.0e-6,
2356  ((double)(mi.inactive)) * 1.0e-6,
2357  ((double)(mi.vmSize)) * 1.0e-6,
2358  ((double)(mi.vmPeak)) * 1.0e-6,
2359  ((double)(plTotalMem)) * 1.0e-6,
2360  ((double)(plUsedMem)) * 1.0e-6,
2361  ((double)(plPeakUsedMem)) * 1.0e-6,
2362  ((double)(fbsTotalMem)) * 1.0e-6
2363  ); fflush(NULL);
2364  }
2365  #endif
2366 
2367  // DMK - DEBUG - Increment the timestep counter on the device
2368  if (kernel_data->isRemote == 0) { device__timestep++; }
2369 
2370  DEVICE_FPRINTF(" |\n");
2371 
2372  } // end pragma offload
2373 
2374  #if MIC_TRACING != 0
2375  double pragma_end = CmiWallTimer();
2376  MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_PRAGMA, pragma_start, pragma_end);
2377  #endif
2378 
2379  #if MIC_SYNC_INPUT != 0
2380  #undef TMP_SYNC_ATOMS_CLAUSE
2381  #endif
2382  #undef TMP_SYNC_KERNEL_DATA_CLAUSE
2383  #undef MIC_DEVICE_SYNC_INPUT_CLAUSE
2384 
2385  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
2386  #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
2387  #endif
2388  #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE
2389 
2390  #undef MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE
2391  #undef MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE
2392 }
2393 
// Transfer one timestep's output data from the MIC device back to the host:
// the nonbonded forces, the slow (full electrostatics) forces when doSlow is
// set, the per-kernel results in kernel_data, and, when device tracing is
// compiled in, the device-side timing buffers.
//   deviceNum     - MIC device to transfer from
//   isRemote      - non-zero for the "remote" kernel's output, zero for "local"
//   numLocalAtoms - number of atoms in local patches; splits the force arrays
//                   into a local region [0, numLocalAtoms) and a remote region
//                   [numLocalAtoms, numAtoms)
//   doSlow        - non-zero when slow forces were computed this timestep
void mic_transfer_output(const int deviceNum,
                         const int isRemote,
                         const int numLocalAtoms,
                         const int doSlow
                        ) {

  const int numAtoms = host__atoms_size;

  mic_kernel_data * kernel_data = host__kernel_data;

  // Select the portion of the force arrays produced by this kernel invocation.
  // With a single combined kernel everything is copied at once; otherwise the
  // "remote" kernel owns [numLocalAtoms, numAtoms) and the "local" kernel owns
  // [0, numLocalAtoms).
  int toCopyStart_forces = (singleKernelFlag != 0) ? (0) : ((isRemote) ? (numLocalAtoms) : (0));
  int toCopySize_forces = (singleKernelFlag != 0) ? (numAtoms) : ((isRemote) ? (numAtoms - numLocalAtoms) : (numLocalAtoms));
  // Slow forces cover the same region, but only when they were actually
  // calculated this timestep (otherwise transfer nothing: size 0).
  int toCopyStart_slow_forces = (doSlow) ? (toCopyStart_forces) : (0);
  int toCopySize_slow_forces = (doSlow) ? (toCopySize_forces) : (0);

  double4 * forces = host__forces;
  double4 * slow_forces = host__slow_forces;

  // TRACING - Build the optional out(...) clauses for the device timing
  // buffers. NOTE: the copy sizes are zero for the "remote" transfer, so the
  // timing data is only pulled back once per step, by the "local" transfer.
  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
    const int numPatchPairs = host__patch_pairs_size;
    const int numForceLists = host__force_lists_size;
    double * device_times_start = host__device_times_start;
    int toCopySize_device_times_start = ((isRemote) ? (0) : (10));
    #if MIC_DEVICE_TRACING_DETAILED != 0
      double * device_times_computes = host__device_times_computes;
      double * device_times_patches = host__device_times_patches;
      int toCopySize_device_times_computes = ((isRemote) ? (0) : (2 * numPatchPairs));
      int toCopySize_device_times_patches = ((isRemote) ? (0) : (2 * numForceLists));
      #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
        out(device_times_computes[0:toCopySize_device_times_computes] : alloc_if(0) free_if(0)) \
        out(device_times_patches[0:toCopySize_device_times_patches] : alloc_if(0) free_if(0))
    #else
      #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
    #endif
    #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
      MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
      out(device_times_start[0:toCopySize_device_times_start] : alloc_if(0) free_if(0))
  #else
    #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE
  #endif

  // Pull the output buffers back from the device. The device-side buffers
  // persist across timesteps, so nothing is allocated or freed here
  // (alloc_if(0) free_if(0)). NOTE: kernel_data[isRemote:1] copies only the
  // single element (remote or local) written by this kernel invocation.
  #pragma offload_transfer target(mic:deviceNum) \
    out(forces[toCopyStart_forces:toCopySize_forces] : alloc_if(0) free_if(0)) \
    out(slow_forces[toCopyStart_slow_forces:toCopySize_slow_forces] : alloc_if(0) free_if(0)) \
    out(kernel_data[isRemote:1] : alloc_if(0) free_if(0)) \
    MIC_DEVICE_TIMING_PRAGMA_CLAUSE
  { }
  // out(device__timestep : into(host__timestep))

  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
    #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
  #endif
  #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE
}
2448 
2449 int mic_check_remote_kernel_complete(const int deviceNum) {
2450  // DMK - NOTE : Disable these warnings for now since the device code is only called once,
2451  // but the state variable goes from 0 -> 1 -> 2 -> 0, which checks at both 1 and 2
2452  //if (!tag_remote_kernel) { printf("WARNING :: mic_check_remote_kernel_complete :: called when kernel not active.\n"); }
2453  // if (_Offload_signaled(deviceNum, &tag_remote_kernel)) {
2454  tag_remote_kernel = 0;
2455  return 1;
2456  // }
2457  // return 0;
2458 }
2459 
2460 int mic_check_local_kernel_complete(const int deviceNum) {
2461  // DMK - NOTE : Disable these warnings for now since the device code is only called once,
2462  // but the state variable goes from 0 -> 1 -> 2 -> 0, which checks at both 1 and 2
2463  //if (!tag_local_kernel) { printf("WARNING :: mic_check_local_kernel_complete :: called when kernel not active.\n"); }
2464  // if (_Offload_signaled(deviceNum, &tag_local_kernel)) {
2465  tag_local_kernel = 0;
2466  return 1;
2467  // }
2468  // return 0;
2469 }
2470 
2471 
// Free all device-resident data for the given MIC device, plus the host-side
// kernel_data buffer. Called once at shutdown. Buffers created via offload
// pragmas are released with free_if(1) clauses on the pragma below; buffers
// that were _mm_malloc'ed by device code are freed explicitly inside the
// offload region.
void mic_free_device(const int deviceNum) {

  mic_kernel_data * kernel_data = host__kernel_data;

  // Cleanup kernel data (for buffers allocated via offload pragmas, use "free_if(1)")
  #pragma offload target(mic:deviceNum) \
    nocopy(kernel_data : alloc_if(0) free_if(1)) \
    nocopy(device__table_four) nocopy(device__table_four_float) \
    nocopy(device__lj_table) nocopy(device__lj_table_float) \
    nocopy(device__exclusion_bits) nocopy(device__exclusion_bits_copy) \
    nocopy(device__constants) \
    nocopy(device__force_buffers) nocopy(device__slow_force_buffers) \
    nocopy(device__pairlists) nocopy(device__pairlists_alloc_size) \
    nocopy(device__pl_array) nocopy(device__pl_size) nocopy(device__r2_array) \
    nocopy(device__patch_pairs : alloc_if(0) free_if(1)) \
    nocopy(device__force_lists : alloc_if(0) free_if(1)) \
    nocopy(device__numOMPThreads) \
    DEVICE_FPRINTF_CLAUSE
  {
    // Cleanup pairlist memory: each table entry is an individually allocated
    // pairlist buffer; free the entries first, then the table itself.
    if (device__pairlists != NULL) {
      int **pairlists = (int**)(device__pairlists);
      for (int i = 0; i < device__pairlists_alloc_size; i++) {
        if (pairlists[i] != NULL) { _MM_FREE_WRAPPER(pairlists[i]); }
      }
      _MM_FREE_WRAPPER(pairlists);
      device__pairlists = 0;
      device__pairlists_alloc_size = 0;
    }

    // Cleanup refined pairlist memory (one scratch buffer per OpenMP thread)
    #if REFINE_PAIRLISTS != 0
      for (int i = 0; i < device__numOMPThreads; i++) {
        if (device__pl_array[i] != NULL) { _MM_FREE_WRAPPER(device__pl_array[i]); }
        if (device__r2_array[i] != NULL) { _MM_FREE_WRAPPER(device__r2_array[i]); }
      }
      _MM_FREE_WRAPPER(device__pl_array); device__pl_array = NULL;
      _MM_FREE_WRAPPER(device__r2_array); device__r2_array = NULL;
      _MM_FREE_WRAPPER(device__pl_size); device__pl_size = NULL;
    #endif

    // Clean up tables
    // NOTE(review): the four table buffers are only freed when
    // MIC_HANDCODE_FORCE_SINGLE != 0 -- presumably they are allocated under
    // the same condition; verify against the allocation path.
    #if MIC_HANDCODE_FORCE_SINGLE != 0
      if (device__table_four != NULL) { _MM_FREE_WRAPPER(device__table_four); device__table_four = NULL; }
      if (device__table_four_float != NULL) { _MM_FREE_WRAPPER(device__table_four_float); device__table_four_float = NULL; }
      if (device__lj_table != NULL) { _MM_FREE_WRAPPER(device__lj_table); device__lj_table = NULL; }
      if (device__lj_table_float != NULL) { _MM_FREE_WRAPPER(device__lj_table_float); device__lj_table_float = NULL; }
    #endif
    if (device__exclusion_bits != NULL) { _MM_FREE_WRAPPER(device__exclusion_bits); device__exclusion_bits = NULL; }
    if (device__exclusion_bits_copy != NULL) { _MM_FREE_WRAPPER(device__exclusion_bits_copy); device__exclusion_bits_copy = NULL; }
    if (device__constants != NULL) { _MM_FREE_WRAPPER(device__constants); device__constants = NULL; }

    // Clean up intermediate force buffers
    if (device__force_buffers != NULL) { _MM_FREE_WRAPPER(device__force_buffers); device__force_buffers = NULL; }
    if (device__slow_force_buffers != NULL) { _MM_FREE_WRAPPER(device__slow_force_buffers); device__slow_force_buffers = NULL; }

    // Close the device-side debug output stream, if one was opened
    #if MIC_DEVICE_FPRINTF != 0
      fclose(device__fout);
    #endif
  }

  // Release the host-side copy of the kernel data
  if (host__kernel_data != NULL) { _MM_FREE_WRAPPER(host__kernel_data); host__kernel_data = NULL; }
}
2535 
2536 
2537 // DMK - DEBUG
2538 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
2539 
2540 __declspec(target(mic))
2541 void printMemInfo(int device__pe, int device__timestep, MemInfo * mi) {
2542  printf("[MIC-MEMINFO] :: PE %d :: MIC mem usage (MB) -- ts: %d - "
2543  "f: %.3lf, t: %.3lf, c: %.3lf, a: %.3lf, i: %.3lf - "
2544  "vS: %.3lf, vP: %.3lf\n",
2545  ((int)(device__pe)),
2546  ((int)(device__timestep)),
2547  ((double)(mi->memFree)) * 1.0e-6,
2548  ((double)(mi->memTotal)) * 1.0e-6,
2549  ((double)(mi->cached)) * 1.0e-6,
2550  ((double)(mi->active)) * 1.0e-6,
2551  ((double)(mi->inactive)) * 1.0e-6,
2552  ((double)(mi->vmSize)) * 1.0e-6,
2553  ((double)(mi->vmPeak)) * 1.0e-6
2554  ); fflush(NULL);
2555 }
2556 
2557 __declspec(target(mic))
2558 void readMemInfo_processLine(MemInfo * memInfo, char * n, char * v, char * u) {
2559  assert(n != NULL && v != NULL);
2560 
2561  size_t * loc = NULL;
2562  if (strcmp(n, "MemTotal:") == 0) { loc = &(memInfo->memTotal); }
2563  if (strcmp(n, "MemFree:") == 0) { loc = &(memInfo->memFree); }
2564  if (strcmp(n, "Cached:") == 0) { loc = &(memInfo->cached); }
2565  if (strcmp(n, "Active:") == 0) { loc = &(memInfo->active); }
2566  if (strcmp(n, "Inactive:") == 0) { loc = &(memInfo->inactive); }
2567  if (strcmp(n, "VmSize:") == 0) { loc = &(memInfo->vmSize); }
2568  if (strcmp(n, "VmPeak:") == 0) { loc = &(memInfo->vmPeak); }
2569  if (loc == NULL) { return; }
2570 
2571  *loc = (size_t)(atoi(v));
2572  if (u != NULL) {
2573  if (strcmp(u, "kB") == 0) { (*loc) *= 1024; }
2574  }
2575 }
2576 
2577 __declspec(target(mic))
2578 bool readMemInfo(MemInfo * memInfo) {
2579  if (memInfo == NULL) { return false; }
2580  memset(memInfo, 0, sizeof(MemInfo));
2581 
2582  FILE * procMem = fopen("/proc/meminfo", "r");
2583  if (procMem == NULL) {
2584  printf("[WARNING] :: Unable to read /proc/meminfo\n");
2585  return false;
2586  }
2587  char line[256], n[128], v[128], u[128];
2588  while (!feof(procMem)) {
2589  int numFilled = fscanf(procMem, "%[^\n]\n", line);
2590  if (numFilled == 0) { printf("empty line\n"); break; }
2591  numFilled = sscanf(line, "%s%s%s", n, v, u);
2592  if (numFilled == 3) {
2593  readMemInfo_processLine(memInfo, n, v, u);
2594  } else if (numFilled == 2) { // No unit or prev line had name only
2595  int vLen = strlen(v);
2596  if (v[vLen-1] == ':') { // Prev was name only (correct)
2597  memcpy(n, v, vLen + 1);
2598  }
2599  readMemInfo_processLine(memInfo, n, v, NULL);
2600  }
2601  }
2602  fclose(procMem);
2603 
2604  pid_t pid = getpid();
2605  char filename[256];
2606  sprintf(filename, "/proc/%d/status", (int)pid);
2607  procMem = fopen(filename, "r");
2608  if (procMem == NULL) {
2609  printf("[WARNING] :: Unable to read %s\n", filename);
2610  return false;
2611  }
2612  while (!feof(procMem)) {
2613  int numFilled = fscanf(procMem, "%[^\n]\n", line);
2614  if (numFilled == 0) { printf("empty line\n"); break; }
2615  numFilled = sscanf(line, "%s%s%s", n, v, u);
2616  if (numFilled == 3) {
2617  readMemInfo_processLine(memInfo, n, v, u);
2618  } else if (numFilled == 2) { // No unit
2619  readMemInfo_processLine(memInfo, n, v, NULL);
2620  }
2621  }
2622  fclose(procMem);
2623 
2624  return true;
2625 }
2626 
2627 #endif // MIC_TRACK_DEVICE_MEM_USAGE != 0
2628 
2629 
2630 __declspec(target(mic))
2631 void* _mm_malloc_withPrint(size_t s, int a, char * l) {
2632  void * ptr = _mm_malloc(s, a);
2633  printf("[MIC-MALLOC] :: _mm_malloc - l: \"%s\", s: %lu (%lu), ptr: %p\n", l, s, sizeof(s), ptr);
2634  return ptr;
2635 }
2636 
// Debug wrapper around _mm_free() that logs each deallocation; counterpart to
// _mm_malloc_withPrint().
//   ptr - pointer previously returned by _mm_malloc()/_mm_malloc_withPrint()
__declspec(target(mic))
void _mm_free_withPrint(void * ptr) {
  // Log before freeing so the pointer value in the trace is still live
  printf("[MIC-MALLOC] :: _mm_free - ptr: %p\n", ptr);
  _mm_free(ptr);
}
2642 
2643 
2644 #else // NAMD_MIC
2645 
2646 #include "ComputeNonbondedMICKernel.h"
2648 
2649 #endif // NAMD_MIC
register BigReal virial_xy
register BigReal virial_xz
register BigReal virial_yz
static __global__ void(const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const int *vdw_types, unsigned int *plist, float4 *tmpforces, float4 *slow_tmpforces, float4 *forces, float4 *slow_forces, float *tmpvirials, float *slow_tmpvirials, float *virials, float *slow_virials, unsigned int *global_counters, int *force_ready_queue, const unsigned int *overflow_exclusions, const int npatches, const int block_begin, const int total_block_count, int *block_order, exclmask *exclmasks, const int lj_table_size, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const float plcutoff2, const int doSlow)
static __thread int patch_pairs_size
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
static __thread int lj_table_size
register BigReal electEnergy
static __thread atom * atoms
static __thread float4 * forces
static __thread float4 * slow_forces
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
register BigReal virial_yy
static __thread patch_pair * patch_pairs
texture< float2, 1, cudaReadModeElementType > lj_table
register BigReal virial_zz
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
register BigReal virial_xx
static __thread atom_param * atom_params
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
static __thread int atoms_size