00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00022 #include <stdlib.h>
00023 #include "CUDAParPrefixOps.h"
00024
00025 #if defined(VMDUSECUB)
00026 #include <cub/cub.cuh>
00027 #else
00028 #include <thrust/scan.h>
00029 #include <thrust/execution_policy.h>
00030 #endif
00031
00032
00033 #if 0
00034 #define CUERR { cudaError_t err; \
00035 cudaDeviceSynchronize(); \
00036 if ((err = cudaGetLastError()) != cudaSuccess) { \
00037 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00038 }}
00039 #else
00040 #define CUERR
00041 #endif
00042
00043 #if defined(VMDUSECUB)
00044
00045
00046 struct VMDScanSum {
00047 template <typename T>
00048
00049 __device__ __forceinline__
00050 T operator()(const T &a, const T &b) const {
00051 return a + b;
00052 }
00053 };
00054
00055
00056
00057
00058 template <typename T>
00059 long dev_excl_scan_sum_tmpsz(T *in_d, long nitems, T *out_d, T ival) {
00060 size_t tsz = 0;
00061 VMDScanSum sum_op;
00062 cub::DeviceScan::ExclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL,
00063 sum_op, ival, nitems);
00064 return (long) tsz;
00065 }
00066
00067 template <typename T>
00068 void dev_excl_scan_sum(T *in_d, long nitems, T *out_d,
00069 void *scanwork_d, long tlsz, T ival) {
00070 VMDScanSum sum_op;
00071 int autoallocate=0;
00072 size_t tsz = tlsz;
00073
00074 if (scanwork_d == NULL) {
00075 autoallocate=1;
00076 cub::DeviceScan::ExclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL,
00077 sum_op, ival, nitems);
00078
00079 cudaMalloc(&scanwork_d, tsz);
00080 }
00081
00082 cub::DeviceScan::ExclusiveScan(scanwork_d, tsz, in_d, out_d,
00083 sum_op, ival, nitems);
00084 if (autoallocate)
00085 cudaFree(scanwork_d);
00086 }
00087
00088
00089
00090
00091
00092 template <typename T>
00093 long dev_incl_scan_sum_tmpsz(T *in_d, long nitems, T *out_d) {
00094 size_t tsz = 0;
00095 VMDScanSum sum_op;
00096 cub::DeviceScan::InclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL,
00097 sum_op, nitems);
00098 return (long) tsz;
00099 }
00100
00101
00102 template <typename T>
00103 void dev_incl_scan_sum(T *in_d, long nitems, T *out_d,
00104 void *scanwork_d, long tlsz) {
00105 VMDScanSum sum_op;
00106 int autoallocate=0;
00107 size_t tsz = tlsz;
00108
00109 if (scanwork_d == NULL) {
00110 autoallocate=1;
00111 cub::DeviceScan::InclusiveScan((T*) NULL, tsz, (T*) NULL, (T*) NULL,
00112 sum_op, nitems);
00113
00114 cudaMalloc(&scanwork_d, tsz);
00115 }
00116
00117 cub::DeviceScan::InclusiveScan(scanwork_d, tsz, in_d, out_d,
00118 sum_op, nitems);
00119 if (autoallocate)
00120 cudaFree(scanwork_d);
00121 }
00122
00123 #else
00124
00125
00126
00127
00128
00129 template <typename T>
00130 long dev_excl_scan_sum_tmpsz(T *in_d, long nGroups, T *out_d, T ival) {
00131 return 0;
00132 }
00133
00134
00135 template <typename T>
00136 void dev_excl_scan_sum(T *in_d, long nGroups, T *out_d,
00137 void *scanwork_d, long tsz, T ival) {
00138
00139
00140
00141
00142
00143
00144
00145
00146 thrust::exclusive_scan(thrust::cuda::par, in_d, in_d + nGroups, out_d);
00147 }
00148
00149
00150
00151
00152
00153 template <typename T>
00154 long dev_incl_scan_sum_tmpsz(T *in_d, long nGroups, T *out_d) {
00155 return 0;
00156 }
00157
00158 template <typename T>
00159 void dev_incl_scan_sum(T *in_d, long nGroups, T *out_d,
00160 void *scanwork_d, long tsz) {
00161
00162
00163
00164
00165
00166
00167
00168
00169 thrust::inclusive_scan(thrust::cuda::par, in_d, in_d + nGroups, out_d);
00170 }
00171
00172
00173 #endif
00174
00175
00176
00177
00178
00179
00180 #define INST_DEV_EXCL_SCAN_SUM_TMPSZ(T) template long dev_excl_scan_sum_tmpsz<T>(T*, long, T*, T);
00181 #define INST_DEV_EXCL_SCAN_SUM(T) template void dev_excl_scan_sum<T>(T*, long, T*, void*, long, T);
00182
00183 INST_DEV_EXCL_SCAN_SUM_TMPSZ(long)
00184 INST_DEV_EXCL_SCAN_SUM_TMPSZ(int)
00185 INST_DEV_EXCL_SCAN_SUM_TMPSZ(short)
00186 INST_DEV_EXCL_SCAN_SUM_TMPSZ(unsigned long)
00187 INST_DEV_EXCL_SCAN_SUM_TMPSZ(unsigned int)
00188 INST_DEV_EXCL_SCAN_SUM_TMPSZ(unsigned short)
00189
00190 INST_DEV_EXCL_SCAN_SUM(long)
00191 INST_DEV_EXCL_SCAN_SUM(int)
00192 INST_DEV_EXCL_SCAN_SUM(short)
00193 INST_DEV_EXCL_SCAN_SUM(unsigned long)
00194 INST_DEV_EXCL_SCAN_SUM(unsigned int)
00195 INST_DEV_EXCL_SCAN_SUM(unsigned short)
00196
00197
00198 #if 0
00199 inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) {
00200 return make_uint2(a.x + b.x, a.y + b.y);
00201 }
00202
00203 template long dev_excl_scan_sum_tmpsz<uint2>(uint2*, long, uint2*, uint2);
00204 template void dev_excl_scan_sum<uint2>(uint2*, long, uint2*, void*, long, uint2);
00205 #endif
00206
00207 template void dev_incl_scan_sum<float>(float*, long, float*, void*, long);
00208
00209