00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00021 #include <stdlib.h>
00022 #include "CUDASort.h"
00023
00024 #if defined(VMDUSECUB)
00025 #include <cub/cub.cuh>
00026 #else
00027 #include <thrust/sort.h>
00028 #include <thrust/device_ptr.h>
00029 #include <thrust/execution_policy.h>
00030 #endif
00031
00032
00033 #if 0
00034 #define CUERR { cudaError_t err; \
00035 cudaDeviceSynchronize(); \
00036 if ((err = cudaGetLastError()) != cudaSuccess) { \
00037 printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
00038 }}
00039 #else
00040 #define CUERR
00041 #endif
00042
00043 #if defined(VMDUSECUB)
00044
00045
00046
00047
00048 template <typename KeyT, typename ValT>
00049 long dev_radix_sort_by_key_tmpsz(KeyT *keys_d, ValT *vals_d, long nitems) {
00050 size_t tsz = 0;
00051 cub::DeviceRadixSort::SortPairs(NULL, tsz,
00052 (const KeyT *) NULL, (KeyT *) NULL,
00053 (const ValT *) NULL, (ValT *) NULL,
00054 nitems, 0, sizeof(KeyT) * 8, 0, false);
00055 return (long) tsz;
00056 }
00057
00058
00059 template <typename KeyT, typename ValT>
00060 int dev_radix_sort_by_key(KeyT *keys_d, ValT *vals_d, long nitems,
00061 KeyT *keyswork_d, ValT *valswork_d,
00062 void *sortwork_d, long tlsz,
00063 KeyT min_key, KeyT max_key) {
00064 int tmp_autoallocate=0;
00065 int keys_autoallocate=0;
00066 int vals_autoallocate=0;
00067 size_t tsz = tlsz;
00068
00069 if (keyswork_d == NULL) {
00070 keys_autoallocate=1;
00071
00072 cudaMalloc(&keyswork_d, nitems * sizeof(KeyT));
00073 }
00074
00075 if (valswork_d == NULL) {
00076 vals_autoallocate=1;
00077
00078 cudaMalloc(&valswork_d, nitems * sizeof(ValT));
00079 }
00080
00081 if (sortwork_d == NULL) {
00082 tmp_autoallocate=1;
00083 cub::DeviceRadixSort::SortPairs(NULL, tsz,
00084 (const KeyT *) NULL, (KeyT *) NULL,
00085 (const ValT *) NULL, (ValT *) NULL,
00086 nitems, 0, sizeof(KeyT) * 8, 0, false);
00087
00088 cudaMalloc(&sortwork_d, tsz);
00089 }
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106 int begin_bit = 0;
00107 int end_bit = sizeof(KeyT) * 8;
00108
00109
00110
00111 if (max_key != ((KeyT) 0)) {
00112
00113 KeyT less = min_key - 1;
00114 KeyT lsb_val = (less | min_key) ^ less;
00115 int lsb = int(log2((double) lsb_val));
00116
00117 int msb = int(log2((double) max_key)+0.5);
00118
00119 begin_bit = lsb;
00120 end_bit = msb + 1;
00121 }
00122
00123
00124
00125 cub::DeviceRadixSort::SortPairs(NULL, tsz,
00126 keys_d, keyswork_d, vals_d, valswork_d,
00127 nitems, begin_bit, end_bit, 0, false);
00128
00129
00130 cudaMemcpyAsync(keys_d, keyswork_d, nitems * sizeof(KeyT), cudaMemcpyDeviceToDevice, 0);
00131 cudaMemcpyAsync(vals_d, valswork_d, nitems * sizeof(ValT), cudaMemcpyDeviceToDevice, 0);
00132
00133 if (tmp_autoallocate) {
00134 cudaFree(sortwork_d);
00135 }
00136
00137 cudaStreamSynchronize(0);
00138
00139 if (keys_autoallocate) {
00140 cudaFree(keyswork_d);
00141 }
00142
00143 if (vals_autoallocate) {
00144 cudaFree(valswork_d);
00145 }
00146
00147 return 0;
00148 }
00149
00150
00151 #else
00152
00153
00154
00155
00156 template <typename KeyT, typename ValT>
00157 long dev_radix_sort_by_key_tmpsz(KeyT *keys_d, ValT *vals_d, long nitems) {
00158 return 0;
00159 }
00160
00161
00162 template <typename KeyT, typename ValT>
00163 int dev_radix_sort_by_key(KeyT *keys_d, ValT *vals_d, long nitems,
00164 KeyT *keyswork_d, ValT *valswork_d,
00165 void *sortwork_d, long tsz,
00166 KeyT min_key, KeyT max_key) {
00167
00168 try {
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181 thrust::sort_by_key(thrust::device_ptr<unsigned int>(keys_d),
00182 thrust::device_ptr<unsigned int>(keys_d + nitems),
00183 thrust::device_ptr<unsigned int>(vals_d));
00184 }
00185 catch (std::bad_alloc) {
00186 printf("CUDA Thrust memory allocation failed: %s line %d\n", __FILE__, __LINE__);
00187 return -1;
00188 }
00189 catch (thrust::system::system_error) {
00190 printf("CUDA Thrust sort_by_key() failed: %s line %d\n", __FILE__, __LINE__);
00191 return -1;
00192 }
00193
00194 return 0;
00195 }
00196
00197 #endif
00198
00199
00200
00201
00202
00203
00204 #define INST_DEV_RADIX_SORT_BY_KEY_TMPSZ(KT, VT) template long dev_radix_sort_by_key_tmpsz<KT, VT>(KT*, VT*, long);
00205 #define INST_DEV_RADIX_SORT_BY_KEY(KT, VT) template int dev_radix_sort_by_key<KT, VT>(KT*, VT*, long, KT*, VT*, void *, long, KT, KT);
00206
00207 INST_DEV_RADIX_SORT_BY_KEY_TMPSZ(unsigned int, unsigned int);
00208 INST_DEV_RADIX_SORT_BY_KEY(unsigned int, unsigned int);
00209
00210