diff --git a/Common/CUDA/GD_AwTV.cu b/Common/CUDA/GD_AwTV.cu index d98c13c1..e899b196 100644 --- a/Common/CUDA/GD_AwTV.cu +++ b/Common/CUDA/GD_AwTV.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for Steepest descend in POCS-type algorithms. @@ -61,11 +62,11 @@ #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("CBCT:CUDA:GD_TV",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("CBCT:CUDA:GD_TV",hipGetErrorString(__err));\ } \ } while (0) @@ -378,16 +379,16 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // allocate memory in each GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); - cudaMemset( d_image[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); - cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); - cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); - cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); - cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); + hipMalloc((void**)&d_image[dev] , mem_img_each_GPU); + hipMemset( d_image[dev],0 , mem_img_each_GPU); + hipMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); + hipMemset( d_dimgTV[dev],0 , mem_img_each_GPU); + hipMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); + hipMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); + hipMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); + hipMemset( d_norm2aux[dev],0 , mem_auxiliary); cudaCheckErrors("Malloc error"); @@ -397,7 +398,7 @@ void aw_pocs_tv(float* img,float* 
dst,float alpha,const long* image_size, int ma if(splits>1){ mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + hipHostMalloc((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); } @@ -406,12 +407,12 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -420,12 +421,12 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // Create streams int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; 
++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } cudaCheckErrors("Stream creation fail"); @@ -437,7 +438,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma double totalsum; float sum_curr_spl; float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + hipHostMalloc((void**)&sumnorm2,deviceCount*sizeof(float)); unsigned int curr_slices; unsigned long long curr_pixels; @@ -476,28 +477,28 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma if(i==0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // if we need to split and its not the first iteration, then we need to copy from Host memory the previosu result. 
if (splits>1 & i>0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } cudaCheckErrors("Memcpy failure on multi split"); @@ -509,7 +510,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (curr_slices+buffer_length*2+blockGrad.z-1)/blockGrad.z); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); + hipStreamSynchronize(stream[dev*nStream_device+1]); + reduceNorm2 <<>>(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + reduceSum <<<1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device]>>>(d_norm2aux[dev], d_norm2[dev], dimgridRed); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } else { 
- cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Reduction error"); @@ -586,7 +587,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Scalar operations error"); //SUBSTRACT GRADIENT ////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), hipMemcpyHostToDevice); } } }else{ // We need to take it out :( for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); + hipHostUnregister(img); + hipHostUnregister(dst); } for (int i = 
0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; cudaCheckErrors("Memory free"); -// cudaDeviceReset(); +// hipDeviceReset(); } void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ @@ -697,8 +698,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ size_t memtotal; const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree1){ mexWarnMsgIdAndTxt("minimizeTV:GD_TV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + hipHostMalloc((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); } @@ -390,12 +391,12 @@ do { \ // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -404,12 +405,12 @@ do { \ // Create streams int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } cudaCheckErrors("Stream creation fail"); @@ -421,7 +422,7 @@ do { \ double totalsum; float sum_curr_spl; float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + hipHostMalloc((void**)&sumnorm2,deviceCount*sizeof(float)); unsigned int curr_slices; unsigned long long curr_pixels; @@ -460,28 +461,28 @@ do { \ if(i==0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], 
img+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // if we need to split and its not the first iteration, then we need to copy from Host memory the previosu result. if (splits>1 & i>0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } cudaCheckErrors("Memcpy failure on multi split"); @@ -493,7 +494,7 @@ do { \ dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (curr_slices+buffer_length*2+blockGrad.z-1)/blockGrad.z); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); + hipStreamSynchronize(stream[dev*nStream_device+1]); + reduceNorm2 <<>>(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), 
cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + reduceSum <<<1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device]>>>(d_norm2aux[dev], d_norm2[dev], dimgridRed); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } else { - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Reduction error"); @@ -570,7 +571,7 @@ do { \ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Scalar operations error"); //SUBSTRACT GRADIENT ////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost); + 
hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), hipMemcpyHostToDevice); } } }else{ // We need to take it out :( for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); + hipHostUnregister(img); + hipHostUnregister(dst); } for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Memory free"); - cudaDeviceReset(); + hipDeviceReset(); } void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ @@ -686,8 +687,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ size_t memtotal; int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree #include -#include +#include GpuIds::~GpuIds() { free(m_piDeviceIds); m_piDeviceIds = nullptr; @@ -52,12 +52,12 @@ void GpuIds::SetAllGpus(int iTotalDeviceCount) { bool GpuIds::AreEqualDevices() const { int deviceCount = this->GetLength(); - const int devicenamelength = 256; // The length 256 is fixed by spec of cudaDeviceProp::name + const int devicenamelength = 256; // The length 256 is fixed by spec of hipDeviceProp_t::name char devicename[devicenamelength]; - cudaDeviceProp deviceProp; + hipDeviceProp_t deviceProp; for (int dev = 0; dev < deviceCount; dev++) { - // cudaSetDevice(m_piDeviceIds[dev]); - cudaGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); + // hipSetDevice(m_piDeviceIds[dev]); + hipGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); if (dev>0) { if (strcmp(devicename, deviceProp.name) != 0) { return false; diff --git a/Common/CUDA/PICCS.cu b/Common/CUDA/PICCS.cu index 481ede08..e447b375 100644 --- 
a/Common/CUDA/PICCS.cu +++ b/Common/CUDA/PICCS.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for Steepest descend in POCS-type algorithms. @@ -60,10 +61,10 @@ Codes : https://github.com/CERN/TIGRE #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("ERROR in: %s \n",msg);\ - mexErrMsgIdAndTxt("err",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("err",hipGetErrorString(__err));\ } \ } while (0) @@ -263,9 +264,9 @@ do { \ bool isnan_cuda(float* vec, size_t size){ bool*d_nan; bool h_nan; - cudaMalloc((void **)&d_nan, sizeof (bool)); + hipMalloc((void **)&d_nan, sizeof (bool)); isnan_device<<<60,MAXTHREADS>>>(vec,size,d_nan); - cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost); + hipMemcpy(&h_nan, d_nan, sizeof(bool), hipMemcpyDeviceToHost); return h_nan; } @@ -281,24 +282,24 @@ bool isnan_cuda(float* vec, size_t size){ float *d_image,*d_prior,*d_dpiccsTV, *d_dimgTV,*d_aux_small,*d_aux_image, *d_norm2; // memory for image - cudaMalloc(&d_image, mem_size); - cudaMalloc(&d_prior, mem_size); + hipMalloc(&d_image, mem_size); + hipMalloc(&d_prior, mem_size); cudaCheckErrors("Malloc Image error"); - cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice); - cudaMemcpy(d_prior, prior, mem_size, cudaMemcpyHostToDevice); + hipMemcpy(d_image, img, mem_size, hipMemcpyHostToDevice); + hipMemcpy(d_prior, prior, mem_size, hipMemcpyHostToDevice); cudaCheckErrors("Memory Malloc and Memset: SRC"); // memory for df - cudaMalloc(&d_dimgTV, mem_size); - cudaMalloc(&d_dpiccsTV, mem_size); + hipMalloc(&d_dimgTV, mem_size); + hipMalloc(&d_dpiccsTV, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); - cudaMalloc(&d_norm2, mem_size); + hipMalloc(&d_norm2, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); - 
cudaMalloc(&d_aux_image, mem_size); + hipMalloc(&d_aux_image, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); // memory for L2norm auxiliar - cudaMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); + hipMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); cudaCheckErrors("Memory Malloc and Memset: NORMAux"); @@ -315,64 +316,64 @@ bool isnan_cuda(float* vec, size_t size){ for(unsigned int i=0;i>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Gradient"); // mexPrintf("Gradient is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,(1-ratio), total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); substractArrays<<<60,MAXTHREADS>>>(d_aux_image,d_prior, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Substraction error"); gradientTV<<>>(d_aux_image,d_dpiccsTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Gradient"); // mexPrintf("Gradient piccs is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dpiccsTV,ratio, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); // mexPrintf("Multiplication is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); addArrays<<<60,MAXTHREADS>>>(d_dimgTV,d_dpiccsTV,total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); //NOMRALIZE via reduction //mexPrintf("Pre-norm2 is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? 
"true" : "false"); - cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice); + hipMemcpy(d_norm2, d_dimgTV, mem_size, hipMemcpyDeviceToDevice); cudaCheckErrors("Copy from gradient call error"); reduceNorm2 << > >(d_norm2, d_aux_small, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("reduce1"); if (dimgridRed > 1) { reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_aux_small, d_norm2, dimgridRed); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("reduce2"); - cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); + hipMemcpy(&sumnorm2, d_norm2, sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy"); } else { - cudaMemcpy(&sumnorm2, d_aux_small, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); + hipMemcpy(&sumnorm2, d_aux_small, sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy"); } // mexPrintf("alpha/sqrt(sumnorm2): %f\n",alpha/sqrt(sumnorm2)); //MULTIPLY HYPERPARAMETER sqrt(sumnorm2) multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha/sqrt(sumnorm2), total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); //SUBSTRACT GRADIENT substractArrays <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Substraction error"); // mexPrintf("Final update is nan: %s\n",isnan_cuda(d_image,total_pixels) ? 
"true" : "false"); // mexPrintf("\n"); @@ -381,18 +382,18 @@ bool isnan_cuda(float* vec, size_t size){ cudaCheckErrors("TV minimization"); - cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost); + hipMemcpy(dst, d_image, mem_size, hipMemcpyDeviceToHost); cudaCheckErrors("Copy result back"); - cudaFree(d_image); - cudaFree(d_dpiccsTV); - cudaFree(d_aux_image); - cudaFree(d_aux_small); - cudaFree(d_prior); - cudaFree(d_norm2); + hipFree(d_image); + hipFree(d_dpiccsTV); + hipFree(d_aux_image); + hipFree(d_aux_small); + hipFree(d_prior); + hipFree(d_norm2); cudaCheckErrors("Memory free"); - cudaDeviceReset(); + hipDeviceReset(); } diff --git a/Common/CUDA/RandomNumberGenerator.cu b/Common/CUDA/RandomNumberGenerator.cu index d7d1224a..e4e7c283 100644 --- a/Common/CUDA/RandomNumberGenerator.cu +++ b/Common/CUDA/RandomNumberGenerator.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for random number generator @@ -45,40 +46,41 @@ #include #include -#include -#include -#include +#include +#include +#include +#include #include "gpuUtils.hpp" #include "RandomNumberGenerator.hpp" #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("RandomNumberGenerator:",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("RandomNumberGenerator:",hipGetErrorString(__err));\ } \ } while (0) -__global__ void setup_kernel(curandState *state) { +__global__ void setup_kernel(hiprandState *state) { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Each thread gets same seed, a different sequence number, no offset */ - curand_init(1234, idx, 0, &state[idx]); + hiprand_init(1234, idx, 0, &state[idx]); } -__global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t 
uiLen, float* pfOut) { +__global__ void GeneratePoisson(hiprandState *state, const float* pfIn, size_t uiLen, float* pfOut) { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; + hiprandState localState = state[idx]; int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); for (int iI = 0; iI < iIter; ++iI) { size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; if (uiPos < uiLen) { /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]); pfOut[uiPos] = (float)uiPoisson; } } @@ -86,7 +88,7 @@ __global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t ui state[idx] = localState; } -__global__ void GeneratePoissonAddGaussian(curandState *state, +__global__ void GeneratePoissonAddGaussian(hiprandState *state, const float* pfIn, size_t uiLen, float fGaussMu, @@ -95,15 +97,15 @@ __global__ void GeneratePoissonAddGaussian(curandState *state, { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; + hiprandState localState = state[idx]; int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); for (int iI = 0; iI < iIter; ++iI) { size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; if (uiPos < uiLen) { /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]); /* Gaussian */ - float fNormal = curand_normal(&localState) * fGaussSigma + fGaussMu; + float fNormal = hiprand_normal(&localState) * fGaussSigma + fGaussMu; pfOut[uiPos] = fNormal + (float)uiPoisson; } } @@ -127,31 +129,31 @@ void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpu // printf("poisson_1d(pfIn = %p, uiLen = %zd, pfOut = %p)\n", pfIn, uiLen, pfOut); float* d_pfIn = nullptr; float* 
d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - cudaCheckErrors("poisson_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_1d fail cudaMemcpy 1"); + hipMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail hipMalloc 1"); + hipMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail hipMalloc 2"); + hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice); + cudaCheckErrors("poisson_1d fail hipMemcpy 1"); // float fMin, fMax; // GetMinMax(pfIn, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; + hiprandState *curandStates = nullptr; const int kiBlockDim = 1024; // Threads per Block const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; - cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); - cudaCheckErrors("poisson_1d fail cudaMalloc 3"); + hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState)); + cudaCheckErrors("poisson_1d fail hipMalloc 3"); setup_kernel<<>>(curandStates); GeneratePoisson<<>>(curandStates, d_pfIn, uiLen, d_pfOut); - cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_1d fail cudaMemcpy 2"); + hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("poisson_1d fail hipMemcpy 2"); // GetMinMax(pfOut, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; + hipFree(d_pfIn); d_pfIn = nullptr; + hipFree(d_pfOut); d_pfOut = nullptr; + hipFree(curandStates); curandStates = nullptr; } void poisson_gaussian_1d(const float* pfIn, @@ -164,30 +166,30 
@@ void poisson_gaussian_1d(const float* pfIn, // printf("poisson_gaussian_1d(pfIn = %p, uiLen = %zd, fGaussMu = %+f, fGaussSigma = %f, pfOut = %p)\n", pfIn, uiLen, fGaussMu, fGaussSigma, pfOut); float* d_pfIn = nullptr; float* d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 1"); + hipMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 1"); + hipMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 2"); + hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice); + cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 1"); // float fMin, fMax; // GetMinMax(pfIn, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; + hiprandState *curandStates = nullptr; const int kiBlockDim = 64; // Threads per Block const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; - cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 3"); + hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 3"); setup_kernel<<<kiGridDim, kiBlockDim>>>(curandStates); GeneratePoissonAddGaussian<<<kiGridDim, kiBlockDim>>>(curandStates, d_pfIn, uiLen, fGaussMu, fGaussSigma, d_pfOut); - cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 2"); + hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 2"); // GetMinMax(pfOut, uiLen, fMin, fMax); // printf("fMin,
fMax = %f, %f\n", fMin, fMax); - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; + hipFree(d_pfIn); d_pfIn = nullptr; + hipFree(d_pfOut); d_pfOut = nullptr; + hipFree(curandStates); curandStates = nullptr; } diff --git a/Common/CUDA/Siddon_projection.cu b/Common/CUDA/Siddon_projection.cu index 2a025f8c..94b9eb1d 100644 --- a/Common/CUDA/Siddon_projection.cu +++ b/Common/CUDA/Siddon_projection.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for ray-voxel intersection based projection @@ -48,18 +49,18 @@ */ #include -#include -#include +#include +#include #include "Siddon_projection.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("Ax:Siddon_projection",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("Ax:Siddon_projection",hipGetErrorString(__err));\ } \ } while (0) @@ -94,7 +95,7 @@ do { \ * **/ - void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc); + void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool alloc); __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device @@ -111,7 +112,7 @@ __global__ void kernelPixelDetector( Geometry geo, float* detector, const int currProjSetNumber, const int totalNoOfProjections, - cudaTextureObject_t tex){ + hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; @@ -229,16 +230,16 @@ __global__ void kernelPixelDetector( Geometry geo, float ac=am; //eq (28), unit anlges float axu,ayu,azu; - 
axu=__frcp_rd(fabsf(ray.x)); - ayu=__frcp_rd(fabsf(ray.y)); - azu=__frcp_rd(fabsf(ray.z)); + axu=__frcp_rn(fabsf(ray.x)); + ayu=__frcp_rn(fabsf(ray.y)); + azu=__frcp_rn(fabsf(ray.z)); // eq(29), direction of update float iu,ju,ku; iu=(source.x< pixel1D.x)? 1.0f : -1.0f; ju=(source.y< pixel1D.y)? 1.0f : -1.0f; ku=(source.z< pixel1D.z)? 1.0f : -1.0f; - float maxlength=__fsqrt_rd(ray.x*ray.x*geo.dVoxelX*geo.dVoxelX+ray.y*ray.y*geo.dVoxelY*geo.dVoxelY+ray.z*ray.z*geo.dVoxelZ*geo.dVoxelZ); + float maxlength=__fsqrt_rn(ray.x*ray.x*geo.dVoxelX*geo.dVoxelX+ray.y*ray.y*geo.dVoxelY*geo.dVoxelY+ray.z*ray.z*geo.dVoxelZ*geo.dVoxelZ); float sum=0.0f; unsigned long Np=(imax-imin+1)+(jmax-jmin+1)+(kmax-kmin+1); // Number of intersections // Go iterating over the line, intersection by intersection. If double point, no worries, 0 will be computed @@ -311,10 +312,10 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * if (!fits_in_memory){ dProjection_accum=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); - cudaMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); + hipMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); + hipMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); cudaCheckErrors("cudaMallocauxiliarty projections fail"); } } @@ -323,12 +324,12 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // This is happening regarthless if the image fits on memory float** dProjection=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); - cudaMemset(dProjection[dev*2+i] ,0,num_bytes_proj); - cudaCheckErrors("cudaMalloc projections fail"); + 
hipMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); + hipMemset(dProjection[dev*2+i] ,0,num_bytes_proj); + cudaCheckErrors("hipMalloc projections fail"); } } @@ -338,13 +339,13 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } #endif cudaCheckErrors("Error pinning memory"); @@ -354,18 +355,18 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // auxiliary variables Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); cudaCheckErrors("Error allocating auxiliary constant memory"); // Create Streams for overlapping memcopy and compute int nStreams=deviceCount*2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - 
cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i+dev*2]); + hipStreamCreate(&stream[i+dev*2]); } } @@ -376,8 +377,8 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * unsigned int noOfKernelCalls = (nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. int projection_this_block; - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + hipTextureObject_t *texImg = new hipTextureObject_t[deviceCount]; + hipArray **d_cuArrTex = new hipArray*[deviceCount]; for (unsigned int sp=0;sp>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); } @@ -450,7 +451,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // 1) grab previous results and put them in the auxiliary variable dProjection_accum for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -463,12 +464,12 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else projection_this_block=PROJ_PER_BLOCK; - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + hipMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyHostToDevice,stream[dev*2+1]); } // 2) take the results from current compute call and add it to the code in execution. 
for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -481,7 +482,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else projection_this_block=PROJ_PER_BLOCK; - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + hipStreamSynchronize(stream[dev*2+1]); // wait until copy is finished vecAddInPlace<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); } } // end accumulation case, where the image needs to be split @@ -490,7 +491,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * if (i>0){ for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on previous set on this GPU proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; if (dev+1==deviceCount) { //is it the last device? @@ -510,13 +511,13 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else { projection_this_block=PROJ_PER_BLOCK; } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } } // Make sure Computation on kernels has finished before we launch the next batch. 
for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*2]); } } @@ -524,7 +525,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // We still have the last set of projections to get out of GPUs for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -533,106 +534,106 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. + hipDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } // Make sure everyone has done their bussiness before the next image split: - cudaDeviceSynchronize(); + hipDeviceSynchronize(); } // End image split loop. 
cudaCheckErrors("Main loop fail"); /////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texImg[dev]); + hipFreeArray(d_cuArrTex[dev]); } delete[] texImg; texImg = 0; delete[] d_cuArrTex; d_cuArrTex = 0; // Freeing Stage for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection[dev*2]); + hipFree(dProjection[dev*2+1]); } free(dProjection); if(!fits_in_memory){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection_accum[dev*2]); + hipFree(dProjection_accum[dev*2+1]); } free(dProjection_accum); } freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); + hipHostFree(projParamsArrayHost); for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostUnregister(img); + hipHostUnregister(img); } - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); #endif - //cudaDeviceReset(); + //hipDeviceReset(); return 0; } -void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc) +void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool alloc) { //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = 
make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); const unsigned int num_devices = gpuids.GetLength(); if(alloc){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params); } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = 
cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModePoint; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Texture object creation fail"); } @@ -842,8 +843,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree -#include -#include +#include +#include #include "Siddon_projection_parallel.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",hipGetErrorString(__err));\ } \ } while (0) // Declare the texture reference. 
-void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); +void CreateTextureParallel(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream); #define MAXTREADS 1024 @@ -105,7 +106,7 @@ __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is o __global__ void kernelPixelDetector_parallel( Geometry geo, - float* detector, const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex){ + float* detector, const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; @@ -282,23 +283,23 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa size_t num_bytes = (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)PROJ_PER_BLOCK* (size_t)sizeof(float); float** dProjection=(float **)malloc(2*sizeof(float *)); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[i], num_bytes); + cudaCheckErrors("hipMalloc projections fail"); } int nStreams=2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + hipTextureObject_t *texImg = 0; + hipArray **d_cuArrTex = 0; + texImg =(hipTextureObject_t*)malloc(1*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(1*sizeof(hipArray*)); CreateTextureParallel(img,geo,&d_cuArrTex[0], &texImg 
[0],stream); cudaCheckErrors("Texture allocation fail"); @@ -310,7 +311,7 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); // 16x16 gave the best performance empirically // Funnily that makes it compatible with most GPUs..... @@ -349,36 +350,36 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDev), projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelDetector_parallel<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); // copy result to host if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, hipMemcpyDeviceToHost,stream[1]); } - cudaDeviceSynchronize(); + hipDeviceSynchronize(); int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[1]); - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); + hipDestroyTextureObject(texImg[0]); + hipFreeArray(d_cuArrTex[0]); free(texImg); texImg = 0; free(d_cuArrTex); d_cuArrTex = 0; cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); + 
hipFree(dProjection[0]); + hipFree(dProjection[1]); free(dProjection); - cudaFreeHost(projParamsArrayHost); - cudaCheckErrors("cudaFree d_imagedata fail"); + hipHostFree(projParamsArrayHost); + cudaCheckErrors("hipFree d_imagedata fail"); for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); } -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } @@ -482,41 +483,41 @@ void computeDeltas_Siddon_parallel(Geometry geo, float angles,int i, Point3D* uv *source=S2; } -void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; +void CreateTextureParallel(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[1]); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); 
- texRes.resType = cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModePoint; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); } diff --git a/Common/CUDA/errors.hpp b/Common/CUDA/errors.hpp index 05518b20..16bece09 100644 --- a/Common/CUDA/errors.hpp +++ b/Common/CUDA/errors.hpp @@ -1,4 +1,4 @@ -#define CUDA_SUCCESS 0 +#define CUDA_SUCCESS 0 #define ERR_CUDA 1 #define ERR_NO_CAPABLE_DEVICES 2 diff --git a/Common/CUDA/gpuUtils.cu b/Common/CUDA/gpuUtils.cu index 8f2754e4..910b7a58 100644 --- a/Common/CUDA/gpuUtils.cu +++ b/Common/CUDA/gpuUtils.cu @@ -1,7 +1,7 @@ #include "gpuUtils.hpp" -#include -#include +#include +#include #include #include @@ -34,11 +34,11 @@ int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, cha return iCudaDeviceCount; } - cudaError_t err; - cudaDeviceProp propDevice; + hipError_t err; + hipDeviceProp_t propDevice; int nMatch = 0; for (int iId = 0; iId < iCudaDeviceCount; ++iId) { - err = cudaGetDeviceProperties(&propDevice, iId); + err = hipGetDeviceProperties(&propDevice, iId); iMessagePos += sprintf(pcMessage + iMessagePos, "propDevice.name = %s\n",
propDevice.name); if (strcmp(propDevice.name, kacGPUName) == 0) { piDeviceIds[nMatch] = iId; @@ -55,16 +55,16 @@ int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, cha void GetGpuName(int iDeviceId, char* pcName) { memset(pcName, 0, 128); - cudaError_t err; - cudaDeviceProp propDevice; + hipError_t err; + hipDeviceProp_t propDevice; int id = iDeviceId; - err = cudaGetDeviceProperties(&propDevice, id); + err = hipGetDeviceProperties(&propDevice, id); memcpy(pcName, propDevice.name, strlen(propDevice.name)*sizeof(char)); } int GetGpuCount() { int iCudaDeviceCount = 0; - cudaGetDeviceCount(&iCudaDeviceCount); + hipGetDeviceCount(&iCudaDeviceCount); return iCudaDeviceCount; } diff --git a/Common/CUDA/improvedForwardProjections.cu b/Common/CUDA/improvedForwardProjections.cu index 0f32be72..7c5fbddd 100644 --- a/Common/CUDA/improvedForwardProjections.cu +++ b/Common/CUDA/improvedForwardProjections.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * CUDA function for optimized proton CT radiographies * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) @@ -20,19 +21,19 @@ Coded by: Stefanie Kaser, Benjamin Kirchmayer --------------------------------------------------------------------------*/ -#include +#include #include "mex.h" -#include +#include #include "improvedForwardProjections.hpp" #include #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",hipGetErrorString(__err));\ } \ } while (0) @@ -937,43 +938,43 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po } //Allocate Memory on GPU - 
cudaMalloc( (void**) &dPosIn, sizeInputs ); - cudaMalloc( (void**) &dPosOut, sizeInputs ); - cudaMalloc( (void**) &ddirIn, sizeInputs ); - cudaMalloc( (void**) &ddirOut, sizeInputs ); - cudaMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); - cudaMalloc( (void**) &dhist1, detectorMem ); - cudaMalloc( (void**) &dhist2, detectorMem ); - cudaMalloc( (void**) &dnumEntries, sizeof(int)); - cudaMalloc( (void**) &ddetectorX, sizeof(int)); - cudaMalloc( (void**) &ddetectorY, sizeof(int)); - cudaMalloc( (void**) &dpixelSize, 2*sizeof(float)); - cudaMalloc( (void**) &dDetectDistIn, sizeof(float)); - cudaMalloc( (void**) &dDetectDistOut, sizeof(float)); - cudaMalloc( (void**) &dEin, sizeof(float)); - cudaMalloc( (void**) &dReject, sizeof(float)); - cudaMalloc( (void**) &dHull, 5*sizeof(float)); - cudaError_t _err_alloc = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err_alloc)); + hipMalloc( (void**) &dPosIn, sizeInputs ); + hipMalloc( (void**) &dPosOut, sizeInputs ); + hipMalloc( (void**) &ddirIn, sizeInputs ); + hipMalloc( (void**) &ddirOut, sizeInputs ); + hipMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); + hipMalloc( (void**) &dhist1, detectorMem ); + hipMalloc( (void**) &dhist2, detectorMem ); + hipMalloc( (void**) &dnumEntries, sizeof(int)); + hipMalloc( (void**) &ddetectorX, sizeof(int)); + hipMalloc( (void**) &ddetectorY, sizeof(int)); + hipMalloc( (void**) &dpixelSize, 2*sizeof(float)); + hipMalloc( (void**) &dDetectDistIn, sizeof(float)); + hipMalloc( (void**) &dDetectDistOut, sizeof(float)); + hipMalloc( (void**) &dEin, sizeof(float)); + hipMalloc( (void**) &dReject, sizeof(float)); + hipMalloc( (void**) &dHull, 5*sizeof(float)); + hipError_t _err_alloc = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err_alloc)); cudaCheckErrors("GPU Allocation failed!"); //Copy Arrays to GPU - cudaMemcpy(dPosIn, posIn,sizeInputs ,cudaMemcpyHostToDevice); - cudaMemcpy(dPosOut, posOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirIn, 
dirIn,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirOut, dirOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dnumEntries, &numOfEntries,sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorX, &detectSizeX, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorY, &detectSizeY, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(dpixelSize, pixelSize, 2*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dEin, &ein, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dReject, &reject, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dHull, ch_param, 5*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dhist1, hist1, detectorMem, cudaMemcpyHostToDevice); - cudaMemcpy(dhist2, hist2, detectorMem, cudaMemcpyHostToDevice); + hipMemcpy(dPosIn, posIn,sizeInputs ,hipMemcpyHostToDevice); + hipMemcpy(dPosOut, posOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirIn, dirIn,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirOut, dirOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dnumEntries, &numOfEntries,sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorX, &detectSizeX, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorY, &detectSizeY, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(dpixelSize, pixelSize, 2*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dEin, &ein, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dReject, &reject, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dHull, ch_param, 5*sizeof(float), 
hipMemcpyHostToDevice); + hipMemcpy(dhist1, hist1, detectorMem, hipMemcpyHostToDevice); + hipMemcpy(dhist2, hist2, detectorMem, hipMemcpyHostToDevice); cudaCheckErrors("Host to device transport failed!"); @@ -984,8 +985,8 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po ParticleKernel<<>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); + hipError_t _err = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err)); cudaCheckErrors("Kernel fail!"); //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); @@ -993,12 +994,12 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po //sumHist<<>>(dhist1, dhist2); //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + //hipMemcpy(outProjection, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist1, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist2, dhist2,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(&reject, dReject,sizeof(float) ,hipMemcpyDeviceToHost); + //hipError_t _errcp = hipGetLastError(); + //mexPrintf("%s \n", hipGetErrorString(_errcp)); cudaCheckErrors("Device to host transport failed!"); for(int j = 0; j -#include +#include +#include #include #ifndef improvedForwardProjections_H #define improvedForwardProjections_H diff --git a/Common/CUDA/improvedForwardProjections_cone.cu b/Common/CUDA/improvedForwardProjections_cone.cu index 7a4f6b46..d11657a9 100644 --- 
a/Common/CUDA/improvedForwardProjections_cone.cu +++ b/Common/CUDA/improvedForwardProjections_cone.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * CUDA function for optimized proton CT radiographies * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) @@ -21,19 +22,19 @@ --------------------------------------------------------------------------*/ -#include +#include #include "mex.h" -#include +#include #include "improvedForwardProjections.hpp" // #include // #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",hipGetErrorString(__err));\ } \ } while (0) @@ -1133,45 +1134,45 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float } //Allocate Memory on GPU - cudaMalloc( (void**) &dPosIn, sizeInputs ); - cudaMalloc( (void**) &dPosOut, sizeInputs ); - cudaMalloc( (void**) &ddirIn, sizeInputs ); - cudaMalloc( (void**) &ddirOut, sizeInputs ); - cudaMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); - cudaMalloc( (void**) &dhist1, detectorMem ); - cudaMalloc( (void**) &dhist2, detectorMem ); - cudaMalloc( (void**) &dnumEntries, sizeof(int)); - cudaMalloc( (void**) &ddetectorX, sizeof(int)); - cudaMalloc( (void**) &ddetectorY, sizeof(int)); - cudaMalloc( (void**) &dpixelSize, 2*sizeof(float)); - cudaMalloc( (void**) &dDetectDistIn, sizeof(float)); - cudaMalloc( (void**) &dDetectDistOut, sizeof(float)); - cudaMalloc( (void**) &dSourceDist, sizeof(float)); - cudaMalloc( (void**) &dEin, sizeof(float)); - cudaMalloc( (void**) &dReject, sizeof(float)); - cudaMalloc( (void**) &dHull, 5*sizeof(float)); - cudaError_t _err_alloc 
= cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err_alloc)); + hipMalloc( (void**) &dPosIn, sizeInputs ); + hipMalloc( (void**) &dPosOut, sizeInputs ); + hipMalloc( (void**) &ddirIn, sizeInputs ); + hipMalloc( (void**) &ddirOut, sizeInputs ); + hipMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); + hipMalloc( (void**) &dhist1, detectorMem ); + hipMalloc( (void**) &dhist2, detectorMem ); + hipMalloc( (void**) &dnumEntries, sizeof(int)); + hipMalloc( (void**) &ddetectorX, sizeof(int)); + hipMalloc( (void**) &ddetectorY, sizeof(int)); + hipMalloc( (void**) &dpixelSize, 2*sizeof(float)); + hipMalloc( (void**) &dDetectDistIn, sizeof(float)); + hipMalloc( (void**) &dDetectDistOut, sizeof(float)); + hipMalloc( (void**) &dSourceDist, sizeof(float)); + hipMalloc( (void**) &dEin, sizeof(float)); + hipMalloc( (void**) &dReject, sizeof(float)); + hipMalloc( (void**) &dHull, 5*sizeof(float)); + hipError_t _err_alloc = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err_alloc)); cudaCheckErrors("GPU Allocation failed!"); //Copy Arrays to GPU - cudaMemcpy(dPosIn, posIn,sizeInputs ,cudaMemcpyHostToDevice); - cudaMemcpy(dPosOut, posOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirIn, dirIn,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirOut, dirOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dnumEntries, &numOfEntries,sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorX, &detectSizeX, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorY, &detectSizeY, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(dpixelSize, pixelSize, 2*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dSourceDist, &sourcePos, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dEin, &ein, 
sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dReject, &reject, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dHull, ch_param, 5*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dhist1, hist1, detectorMem, cudaMemcpyHostToDevice); - cudaMemcpy(dhist2, hist2, detectorMem, cudaMemcpyHostToDevice); + hipMemcpy(dPosIn, posIn,sizeInputs ,hipMemcpyHostToDevice); + hipMemcpy(dPosOut, posOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirIn, dirIn,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirOut, dirOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dnumEntries, &numOfEntries,sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorX, &detectSizeX, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorY, &detectSizeY, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(dpixelSize, pixelSize, 2*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dSourceDist, &sourcePos, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dEin, &ein, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dReject, &reject, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dHull, ch_param, 5*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dhist1, hist1, detectorMem, hipMemcpyHostToDevice); + hipMemcpy(dhist2, hist2, detectorMem, hipMemcpyHostToDevice); cudaCheckErrors("Host to device transport failed!"); @@ -1182,8 +1183,8 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float ParticleKernelCone<<>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject, dSourceDist); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); + hipError_t _err = hipGetLastError(); 
+ mexPrintf("%s \n", hipGetErrorString(_err)); cudaCheckErrors("Kernel fail!"); //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); @@ -1191,12 +1192,12 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float //sumHist<<>>(dhist1, dhist2); //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + //hipMemcpy(outProjection, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist1, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist2, dhist2,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(&reject, dReject,sizeof(float) ,hipMemcpyDeviceToHost); + //hipError_t _errcp = hipGetLastError(); + //mexPrintf("%s \n", hipGetErrorString(_errcp)); cudaCheckErrors("Device to host transport failed!"); for(int j = 0; j -#include -#include +#include +#include #include "ray_interpolated_projection.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",hipGetErrorString(__err));\ } \ } while (0) @@ -100,7 +101,7 @@ do { \ * * **/ - void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate); + void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool allocate); 
__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device @@ -119,7 +120,7 @@ template float* detector, const int currProjSetNumber, const int totalNoOfProjections, - cudaTextureObject_t tex){ + hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; @@ -161,7 +162,7 @@ template P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); // Length is the ray length in normalized space - float length=__fsqrt_rd((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); + float length=__fsqrt_rn((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); //now legth is an integer of Nsamples that are required on this line length=ceilf(__fdividef(length,geo.accuracy));//Divide the directional vector by an integer vectX=__fdividef(P.x -source.x,length); @@ -255,10 +256,10 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c if (!fits_in_memory){ dProjection_accum=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); - cudaMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); + hipMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); + hipMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); cudaCheckErrors("cudaMallocauxiliarty projections fail"); } } @@ -267,12 +268,12 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c // This is happening regarthless if the image fits on memory float** dProjection=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; 
i < 2; ++i){ - cudaMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); - cudaMemset(dProjection[dev*2+i] ,0,num_bytes_proj); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); + hipMemset(dProjection[dev*2+i] ,0,num_bytes_proj); + cudaCheckErrors("hipMalloc projections fail"); } } @@ -284,34 +285,34 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
#ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & splits>1){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); #endif Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost = 0; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); float* projFloatsArrayHost = 0; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + hipHostMalloc((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); cudaCheckErrors("Error allocating auxiliary constant memory"); // Create Streams for overlapping memcopy and compute int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } @@ -324,8 +325,8 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + hipTextureObject_t *texImg = new hipTextureObject_t[deviceCount]; + hipArray **d_cuArrTex = new hipArray*[deviceCount]; for (unsigned int sp=0;sp=nangles) @@ -419,12 +420,12 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) else 
projection_this_block=PROJ_PER_BLOCK; - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + hipMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyHostToDevice,stream[dev*2+1]); } // 2) take the results from current compute call and add it to the code in execution. for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -436,7 +437,7 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) else projection_this_block=PROJ_PER_BLOCK; - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + hipStreamSynchronize(stream[dev*2+1]); // wait until copy is finished vecAddInPlaceInterp<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); } } // end accumulation case, where the image needs to be split @@ -446,7 +447,7 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c { for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on previous set on this GPU proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; if (dev+1==deviceCount) { //is it the last device? 
@@ -466,21 +467,21 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c else { projection_this_block=PROJ_PER_BLOCK; } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } } // Make sure Computation on kernels has finished before we launch the next batch. for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*2]); } } // End noOfKernelCalls (i) loop. // We still have the last set of projections to get out of GPUs for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -489,15 +490,15 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. + hipDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. 
cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } // Make sure everyone has done their bussiness before the next image split: for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // End image split loop. @@ -505,99 +506,99 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c /////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texImg[dev]); + hipFreeArray(d_cuArrTex[dev]); } delete[] texImg; texImg = 0; delete[] d_cuArrTex; d_cuArrTex = 0; // Freeing Stage for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection[dev*2]); + hipFree(dProjection[dev*2+1]); } free(dProjection); if(!fits_in_memory){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection_accum[dev*2]); + hipFree(dProjection_accum[dev*2+1]); } free(dProjection_accum); } freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); - cudaFreeHost(projFloatsArrayHost); + 
hipHostFree(projParamsArrayHost); + hipHostFree(projFloatsArrayHost); for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & splits>1){ - cudaHostUnregister(img); + hipHostUnregister(img); } #endif - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } -void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate) +void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool allocate) { const unsigned int num_devices = gpuids.GetLength(); //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); if(allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor + //hipArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); cudaCheckErrors("Texture memory allocation fail"); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaMemcpy3DParms copyParams = {0}; - cudaSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = 
extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params); //cudaCheckErrors("Texture memory data copy fail"); //Array creation End } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; if (geo.accuracy>1){ - texDescr.filterMode = cudaFilterModePoint; + texDescr.filterMode = hipFilterModePoint; geo.accuracy=1; } else{ - texDescr.filterMode = cudaFilterModeLinear; + texDescr.filterMode = hipFilterModeLinear; } - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); cudaCheckErrors("Texture object creation fail"); } } @@ -828,8 +829,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ size_t memtotal; int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree -#include 
-#include +#include +#include #include "ray_interpolated_projection_parallel.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",hipGetErrorString(__err));\ } \ } while (0) @@ -96,7 +97,7 @@ do { \ * * **/ -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); +void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream); __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device @@ -104,7 +105,7 @@ __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on __global__ void kernelPixelDetector_parallel_interpolated( Geometry geo, float* detector, - const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) + const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { // Point3D source , // Point3D deltaU, @@ -199,23 +200,23 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul size_t num_bytes = geo.nDetecU*geo.nDetecV *PROJ_PER_BLOCK* sizeof(float); float** dProjection=(float **)malloc(2*sizeof(float *)); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[i], num_bytes); + cudaCheckErrors("hipMalloc projections fail"); } // allocate streams for memory and compute int nStreams=2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + 
hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + hipTextureObject_t *texImg = 0; + hipArray **d_cuArrTex = 0; + texImg =(hipTextureObject_t*)malloc(1*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(1*sizeof(hipArray*)); CreateTextureParallelInterp(img,geo,&d_cuArrTex[0], &texImg[0],stream); cudaCheckErrors("Texture allocation fail"); @@ -226,9 +227,9 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); float* projFloatsArrayHost; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + hipHostMalloc((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); // 16x16 gave the best performance empirically // Funnily that makes it compatible with most GPUs..... 
@@ -266,39 +267,39 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul projFloatsArrayHost[2*j+1]=floor(maxdist); } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDev), projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projFloatsArrayDev), projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelDetector_parallel_interpolated<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); // copy result to host if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, hipMemcpyDeviceToHost,stream[1]); } - cudaDeviceSynchronize(); + hipDeviceSynchronize(); int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[1]); - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); + hipDestroyTextureObject(texImg[0]); + hipFreeArray(d_cuArrTex[0]); free(texImg); texImg = 0; free(d_cuArrTex); d_cuArrTex = 0; cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); + hipFree(dProjection[0]); + hipFree(dProjection[1]); free(dProjection); - cudaFreeHost(projParamsArrayHost); - 
cudaFreeHost(projFloatsArrayHost); + hipHostFree(projParamsArrayHost); + hipHostFree(projFloatsArrayHost); - cudaCheckErrors("cudaFree d_imagedata fail"); + cudaCheckErrors("hipFree d_imagedata fail"); for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); } -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } @@ -410,40 +411,40 @@ void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* u *source=S; } -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; +void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[1]); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = 
cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); - -} \ No newline at end of file + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + +} diff --git a/Common/CUDA/tv_proximal.cu b/Common/CUDA/tv_proximal.cu index 32ae99c2..87d5407f 100644 --- a/Common/CUDA/tv_proximal.cu +++ b/Common/CUDA/tv_proximal.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * MATLAB MEX functions for TV image denoising. 
Check inputs and parses @@ -57,17 +58,17 @@ #include "tv_proximal.hpp" #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - cudaDeviceReset();\ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ + hipDeviceReset();\ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising",hipGetErrorString(__err));\ } \ } while (0) void cpy_from_host(float* device_array,float* host_array, unsigned long long bytes_device,unsigned long long offset_device,unsigned long long offset_host, unsigned long long pixels_per_slice, unsigned int buffer_length, - cudaStream_t stream, bool is_first_chunk, bool is_last_chunk,const long* image_size); + hipStream_t stream, bool is_first_chunk, bool is_last_chunk,const long* image_size); __global__ void multiplyArrayScalar(float* vec,float scalar,const size_t n) @@ -263,11 +264,11 @@ void cpy_from_host(float* device_array,float* host_array, // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif if (isHostRegisterSupported & splits>1){ - cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -282,21 +283,21 @@ void cpy_from_host(float* device_array,float* host_array, if (buffer_length0){ // U - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer_u, d_u[dev-1] +slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer_u, d_u[dev-1] +slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); + hipMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); + 
hipMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); + hipMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+1]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+2]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+3]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+4]); - cudaMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+4]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+1]); + hipMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+1]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+2]); + hipMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+2]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+3]); + hipMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+3]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+4]); + hipMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+4]); } @@ -567,22 +568,22 @@ void cpy_from_host(float* device_array,float* host_array, }else{ // Vopy all the U variable into the host. 
for(dev=0; dev1 && buffer_length1){ - cudaHostUnregister(src); - cudaHostUnregister(dst); + hipHostUnregister(src); + hipHostUnregister(dst); } for(dev=0; dev -#include -#include +#include +#include #include "voxel_backprojection.hpp" #include "TIGRE_common.hpp" #include @@ -55,10 +56,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -91,7 +92,7 @@ do { \ * **/ - void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, int nStreamDevice,bool allocate); + void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, int nStreamDevice,bool allocate); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -134,7 +135,7 @@ __constant__ float projSinCosArrayDev[5*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel //______________________________________________________________________________ -__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { // Old kernel call signature: @@ -246,7 +247,7 @@ __global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,co 
weight=__fdividef(DSO+realy*sinalpha-realx*cosalpha,DSO); - weight=__frcp_rd(weight*weight); + weight=__frcp_rn(weight*weight); // Get Value in the computed (U,V) and multiply by the corresponding weight. // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) @@ -323,16 +324,16 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
#ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } #endif cudaCheckErrors("Error pinning memory"); @@ -348,20 +349,20 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); float** dimage=(float**)malloc(deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); + hipSetDevice(gpuids[dev]); + hipMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("hipMalloc fail"); } //If it is the first time, lets make sure our image is zeroed. 
int nStreamDevice=2; int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); + hipStreamCreate(&stream[i+dev*nStreamDevice]); } } @@ -371,16 +372,16 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // Kernel auxiliary variables Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArrayHost; - cudaMallocHost((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + texProj =(hipTextureObject_t*)malloc(deviceCount*2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(deviceCount*2*sizeof(hipArray*)); // Auxiliary Host page-locked memory for fast and asycnornous memcpy. 
@@ -401,8 +402,8 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa for(unsigned int img_slice=0;img_slice>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); } // END for @@ -551,8 +552,8 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa } // END sub-split of current projection chunk for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // END projection splits @@ -560,15 +561,15 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // Now we need to take the image out of the GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); // We do not need to sycnronize because the array dealocators already do. num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + hipMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, hipMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); cudaCheckErrors("Main loop fail"); } @@ -582,38 +583,38 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa if (!two_buffers_used && i==1) break; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - 
cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texProj[i*deviceCount+dev]); + hipFreeArray(d_cuArrTex[i*deviceCount+dev]); } } cudaCheckErrors("cudadestroy textures result fail"); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); + hipSetDevice(gpuids[dev]); + hipFree(dimage[dev]); } - cudaFreeHost(projSinCosArrayHost); - cudaFreeHost(projParamsArrayHost); + hipHostFree(projSinCosArrayHost); + hipHostFree(projParamsArrayHost); free(partial_projection); free(proj_split_size); freeGeoArray(split_image*deviceCount,geoArray); #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostUnregister(result); + hipHostUnregister(result); } if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } #endif for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); - //cudaDeviceReset(); // For the Nvidia Visual Profiler + //hipDeviceReset(); // For the Nvidia Visual Profiler return 0; } // END voxel_backprojection @@ -664,52 +665,52 @@ void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsign } -void CreateTexture(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ +void CreateTexture(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent 
=make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif const unsigned int num_devices = gpuids.GetLength(); if (allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc<float>(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(&copyParams,stream[dev*nStreamDevice+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(&copyParams,stream[dev*nStreamDevice+1]); } //Array creation End for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - 
texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } } @@ -903,8 +904,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree -#include -#include +#include +#include #include "voxel_backprojection2.hpp" #include "TIGRE_common.hpp" #include @@ -55,10 +56,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -92,7 +93,7 @@ do { \ **/ // this definitionmust go here. 
-void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate); +void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate); __global__ void matrixConstantMultiply(const Geometry geo,float* image,float constant){ size_t idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -139,7 +140,7 @@ __constant__ float projSinCosArray2Dev[5*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel //______________________________________________________________________________ -__global__ void kernelPixelBackprojection(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +__global__ void kernelPixelBackprojection(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; @@ -271,7 +272,7 @@ __global__ void kernelPixelBackprojection(const Geometry geo, float* image,const realD.y=-realDaux.x*sinalpha + realDaux.y*cosalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) float L,lsq; - L = __fsqrt_rd( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. + L = __fsqrt_rn( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. 
lsq = (realS.x-realvoxel.x)*(realS.x-realvoxel.x) + (realS.y-realvoxel.y)*(realS.y-realvoxel.y) + (realS.z-realvoxel.z)*(realS.z-realvoxel.z); @@ -355,9 +356,9 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); float** dimage=(float**)malloc(deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); + hipSetDevice(gpuids[dev]); + hipMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("hipMalloc fail"); } @@ -366,15 +367,15 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
if (isHostRegisterSupported & split_image>1){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -385,27 +386,27 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float //If it is the first time, lets make sure our image is zeroed. int nStreamDevice=2; int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); + hipStreamCreate(&stream[i+dev*nStreamDevice]); } } // Kernel auxiliary variables Point3D* projParamsArray2Host; - cudaMallocHost((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArray2Host; - cudaMallocHost((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + 
texProj =(hipTextureObject_t*)malloc(deviceCount*2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(deviceCount*2*sizeof(hipArray*)); @@ -425,8 +426,8 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float // // Initialize the memory if its the first time. for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemset(dimage[dev],0,num_bytes_img); + hipSetDevice(gpuids[dev]); + hipMemset(dimage[dev],0,num_bytes_img); cudaCheckErrors("memset fail"); } @@ -478,8 +479,8 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float (proj_block_split<2)&!proj&!img_slice);// Only allocate if its the first 2 calls for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*nStreamDevice+1]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*nStreamDevice+1]); } for (dev = 0; dev < deviceCount; dev++){ @@ -489,7 +490,7 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float if(geoArray[img_slice*deviceCount+dev].nVoxelZ==0) break; - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); @@ -566,9 +567,9 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float } // END for (preparing params for kernel call) // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArray2Dev, projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaMemcpyToSymbolAsync(projParamsArray2Dev, projParamsArray2Host, sizeof(Point3D)*7*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projSinCosArray2Dev), projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[dev*nStreamDevice]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArray2Dev), 
projParamsArray2Host, sizeof(Point3D)*7*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[dev*nStreamDevice]); + hipStreamSynchronize(stream[dev*nStreamDevice]); kernelPixelBackprojection<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); } // END for @@ -581,24 +582,24 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float } // END projection splits for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); matrixConstantMultiply<<<60,MAXTREADS,0,stream[dev*nStreamDevice]>>>( geoArray[img_slice*deviceCount+dev],dimage[dev],geo.dVoxelX*geo.dVoxelY*geo.dVoxelZ/(geo.dDetecU*geo.dDetecV)); } // Now we need to take the image out of the GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*nStreamDevice]); num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + hipMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, hipMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); } } // end image splits for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } @@ -607,40 +608,40 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) if (!two_buffers_used && i==1) break; for (dev = 0; dev < 
deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texProj[i*deviceCount+dev]); + hipFreeArray(d_cuArrTex[i*deviceCount+dev]); } } free(d_cuArrTex); free(texProj); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); + hipSetDevice(gpuids[dev]); + hipFree(dimage[dev]); } free(dimage); - cudaFreeHost(projSinCosArray2Host); - cudaFreeHost(projParamsArray2Host); + hipHostFree(projSinCosArray2Host); + hipHostFree(projParamsArray2Host); free(partial_projection); free(proj_split_size); freeGeoArray(split_image*deviceCount,geoArray); #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & split_image>1){ - cudaHostUnregister(result); + hipHostUnregister(result); } if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } #endif for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); -// cudaDeviceReset(); // For the Nvidia Visual Profiler +// hipDeviceReset(); // For the Nvidia Visual Profiler return 0; } // END voxel_backprojection @@ -649,52 +650,52 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float -void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ +void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; int num_devices = gpuids.GetLength(); #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, 
geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif if (allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc<float>(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(&copyParams,stream[dev*nStreamDevice+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(&copyParams,stream[dev*nStreamDevice+1]); } //Array creation End for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); 
texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } } #ifndef BACKPROJECTION_HPP @@ -826,8 +827,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ const int gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree -#include -#include +#include +#include #include "voxel_backprojection.hpp" #include "voxel_backprojection_parallel.hpp" @@ -57,10 +58,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -92,7 +93,7 @@ do { \ * * **/ -void CreateTextureParallel( float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool allocate); +void CreateTextureParallel( float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, bool allocate); 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call @@ -135,7 +136,7 @@ __constant__ float projSinCosArrayDevParallel[3*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel //______________________________________________________________________________ -__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,cudaTextureObject_t tex) +__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,hipTextureObject_t tex) { // Old kernel call signature: @@ -286,9 +287,9 @@ __global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* im int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) { if (gpuids.GetLength() == 0) { - cudaSetDevice(0); + hipSetDevice(0); } else { - cudaSetDevice(gpuids[0]); + hipSetDevice(gpuids[0]); } /* @@ -298,10 +299,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re //If it is the first time, lets make sure our image is zeroed. int nStreamDevice=2; int nStreams=nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } @@ -310,10 +311,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif if (isHostRegisterSupported){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -321,22 +322,22 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // Allocate result image memory size_t num_bytes = geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ * sizeof(float); float* dimage; - cudaMalloc((void**)&dimage, num_bytes); - cudaMemset(dimage,0,num_bytes); - cudaCheckErrors("cudaMalloc fail"); + hipMalloc((void**)&dimage, num_bytes); + hipMemset(dimage,0,num_bytes); + cudaCheckErrors("hipMalloc fail"); Point3D* projParamsArrayHostParallel; - cudaMallocHost((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArrayHostParallel; - cudaMallocHost((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); // Texture buffer objects - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + texProj =(hipTextureObject_t*)malloc(2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(2*sizeof(hipArray*)); @@ -389,7 +390,7 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re (proj_block_split<2));// 
Only allocate if its the first 2 calls - cudaStreamSynchronize(stream[0+1]); + hipStreamSynchronize(stream[0+1]); @@ -464,9 +465,9 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArrayDevParallel, projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projParamsArrayDevParallel, projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projSinCosArrayDevParallel), projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDevParallel), projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelBackprojection_parallel<<>>(geo,dimage,i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)]); } // END for @@ -475,9 +476,9 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // END Main reconstruction loop: go through projections (rotation angles) and backproject ////////////////////////////////////////////////////////////////////////////////////// } - cudaDeviceSynchronize(); - cudaMemcpy(result, dimage, num_bytes, cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy result fail"); + hipDeviceSynchronize(); + hipMemcpy(result, dimage, num_bytes, hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy result fail"); free(partial_projection); free(proj_split_size); @@ -486,23 +487,23 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) if (!two_buffers_used && i==1) break; - cudaDestroyTextureObject(texProj[i]); - 
cudaFreeArray(d_cuArrTex[i]); + hipDestroyTextureObject(texProj[i]); + hipFreeArray(d_cuArrTex[i]); } free(texProj); free(d_cuArrTex); - cudaFreeHost(projSinCosArrayHostParallel); - cudaFreeHost(projParamsArrayHostParallel); + hipHostFree(projSinCosArrayHostParallel); + hipHostFree(projParamsArrayHostParallel); - cudaFree(dimage); + hipFree(dimage); if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } // END voxel_backprojection @@ -583,45 +584,45 @@ void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* } // END computeDeltasCube -void CreateTextureParallel(float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool alloc) +void CreateTextureParallel(float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, bool alloc) { - //cudaArray Descriptor + //hipArray Descriptor #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array if (alloc){ - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); cudaCheckErrors("Texture memory allocation fail"); } - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, 
extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[0+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[0+1]); cudaCheckErrors("Texture memory data copy fail"); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); cudaCheckErrors("Texture object creation fail"); -} \ No newline at end of file +} diff --git a/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip b/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip new file mode 100644 index 00000000..650a9815 --- /dev/null +++ b/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip @@ -0,0 +1,21 @@ +#include +#include +#include +#include + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray 
const *prhs[]) +{ + if (nrhs != 0) { + mexErrMsgIdAndTxt("MATLAB:getGpuCount_mex", "No input requred."); + return; + } + if (nlhs != 1) { + mexErrMsgIdAndTxt("MATLAB:getGpuCount_mex", "Too many output arguments. Returns one integer."); + return; + } + int iCount = GetGpuCount(); + size_t dims[2] = {1,1}; + plhs[0] = mxCreateNumericArray(2, dims, mxUINT32_CLASS, mxREAL); + *((int*)mxGetData(plhs[0])) = iCount; +} diff --git a/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip b/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip new file mode 100644 index 00000000..c56ca29b --- /dev/null +++ b/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip @@ -0,0 +1,29 @@ +#include +#include + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + // Usage: name = getGpuName_mex(int iId) + if (nrhs != 1) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:invalidNumInputs", "One input required."); + return; + } else if(nlhs > 1) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:maxlhs", "Too many output arguments."); + return; + } + + int iId = 0; + if (mxIsDouble(prhs[0])) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:inputNotInt", "Input must be an integer."); + return; + } else { + iId = *((int*)mxGetData(prhs[0])); + } + int iCount = GetGpuCount(); + char* pcName = (char*)mxCalloc(128, sizeof(char)); + if (iId < iCount) { + GetGpuName(iId, pcName); + } + plhs[0] = mxCreateString(pcName); +} diff --git a/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip b/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip new file mode 100644 index 00000000..670c2d3e --- /dev/null +++ b/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip @@ -0,0 +1,28 @@ +#define _CRT_SECURE_NO_WARNINGS + +#include +#include + +// Purpose: To fast read .xim files +// Method: based on ReadXim.m by Fredrik Nordström 2015 +// Date: 2017.07 +// Author: Yi Du, yi.du@hotmail.com + +#ifndef STR_XIM +#define STR_XIM +//struct XimPara +typedef struct XimPara +{ + char FileName[256]; + int ImgWidth; // Image 
Width + int ImgHeight; // Image Height + int PixelNO; + + int BytesPerPixel; // Determine how to read the data + int Compression_Indicator; // Data number in Rec Image Matrix + + double GantryRtn; // Gantry rotation angle + int KVNormChamber; // KV norm chamber reading, date: 2022-05-23 +}XimPara; +#endif + diff --git a/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip b/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip new file mode 100644 index 00000000..453c4278 --- /dev/null +++ b/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip @@ -0,0 +1,357 @@ +#define _CRT_SECURE_NO_WARNINGS + +#include "io64.h" +#include +#include +#include +#include +#include +#include +//**** C data types are defined in tmwtypes.h +#include +#include "mex.h" +#include +#include "matrix.h" +#include "XimPara.hpp" + +#define GET_BIT(x,bit) ((x & (1 << bit)) >>bit) + +// Purpose: To fast read .xim files +// Method: based on ReadXim.m by Fredrik Nordström 2015 +// Date: 2017.07 +// Author: Yi Du, yi.du@hotmail.com + + +int cReadXim(char *XimFullFile, XimPara *XimStr, int *XimImg); + +void mexFunction( + int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + //check input variable + if (mxIsChar(prhs[0]) != 1) + mexErrMsgIdAndTxt( "MATLAB:revord:inputNotString", + "Input must be a string."); + + // .xim filename + char *filename; + filename = mxArrayToString(prhs[0]); + //mexPrintf("%s\n", filename); + + // file open + FILE *fid = fopen(filename, "rb"); + if(fid == NULL) + { + mexErrMsgIdAndTxt("%s fopen failed.\n", filename); + //getchar(); + //exit(1); + } + + // Parameter structure + XimPara *para = new XimPara[1]; + + // file pointer position + //fpos_t position = {0}; + + // Skip useless information + // 8 * sizeof(char) + sizeof(int32_t); + long int position = 8*sizeof(char) + sizeof(int32_T); + fseek ( fid , position , SEEK_SET ); +// setFilePos(fid, (fpos_t*) &position); + // Read ImgWidth & ImgHeight (int32) + fread(&(para->ImgWidth), sizeof(int32_T), 1, fid); 
+ fread(&(para->ImgHeight), sizeof(int32_T), 1, fid); + fclose(fid); + + para->PixelNO = para->ImgWidth * para->ImgHeight; + + int *frame; + plhs[0] = mxCreateNumericMatrix(para->ImgWidth, para->ImgHeight, mxINT32_CLASS, mxREAL); + frame = (int*)mxGetPr(plhs[0]); + + // empty file return + if (para->PixelNO == 0) + { + plhs[1] = mxCreateDoubleScalar(10000); + mexPrintf("%s is an empty file\n", filename); + return; + } + + /******* Kernel Function *********/ + cReadXim(filename, para, frame); + + /**** KVSourceRtn is the only parameter-of-interest to return ****/ + // KVSourceRtn = GantryRtn + 90 deg; + double KVSourceRtn = para->GantryRtn + 90; + plhs[1] = mxCreateDoubleScalar(KVSourceRtn); + + double NormChamberReading = para->KVNormChamber * 1.0; + plhs[2] = mxCreateDoubleScalar(NormChamberReading); + +} + +/************* Kernel Funtion to read .xim ***************/ +// Kernel function +int cReadXim(char *XimFullFile, + XimPara *XimStr, + int *XimImg) +{ + // Read the .xim file name + +// char *ptr = strrchr(XimFullFile, '\\'); +// sprintf(XimStr->FileName, "%s", ptr + 1); + + // ****** Open .xim File Pointer ***********// + FILE *fid = fopen(XimFullFile, "rb"); + + // Syntax Parsing + if (fid == NULL) + { + mexErrMsgIdAndTxt("Error: file %s doesn't exist, at all\n", XimFullFile); + //getchar(); + //exit(1); + } + + // ******* Stage 1: Portal Image Data ****// + // Skip useless information + fseek(fid, 8 * sizeof(char) + sizeof(int32_T), SEEK_CUR); + + // Read ImgWidth & ImgHeight + fread(&(XimStr->ImgWidth), sizeof(int32_T), 1, fid); + fread(&(XimStr->ImgHeight), sizeof(int32_T), 1, fid); + XimStr->PixelNO = (XimStr->ImgWidth)*(XimStr->ImgHeight); + + // Skip the useless information: bits_per_pixel + fseek(fid, sizeof(int32_T), SEEK_CUR); + + // Load .xim file compression parameters + fread(&(XimStr->BytesPerPixel), sizeof(int32_T), 1, fid); + fread(&(XimStr->Compression_Indicator), sizeof(int32_T), 1, fid); + + // Load .xim Pixel Data + if (1 == 
XimStr->Compression_Indicator) + { + int LookUpTableSize = 0; + fread(&LookUpTableSize, sizeof(int), 1, fid); + + int *LookUpTable = new int[XimStr->ImgHeight * XimStr->ImgWidth]; + memset(LookUpTable, 0, XimStr->ImgHeight * XimStr->ImgWidth * sizeof(int)); + + // Load the LookUpTable data + for (int ii = 0; ii < LookUpTableSize; ii++) + { + // Load in the 8-bit date + // Updated: 2021-11-05, Yi Du + uint8_T tmp =0; + fread(&tmp, 1, 1, fid); + int Bit2[4] = { 0 }; + Bit2[0] = GET_BIT(tmp,0) + GET_BIT(tmp,1) *2; + Bit2[1] = GET_BIT(tmp,2) + GET_BIT(tmp,3) *2; + Bit2[2] = GET_BIT(tmp,4) + GET_BIT(tmp,5) *2; + Bit2[3] = GET_BIT(tmp,6) + GET_BIT(tmp,7) *2; + + // extract the lookup_table data + for (int jj = 0; jj < 4; jj++) + { + LookUpTable[ii * 4 + jj] = Bit2[jj]; + } + + /** Old Code with bug + int Bit2[4] = { 0 }; + + // extract the lookup_table data + for (int jj = 0; jj < 8; jj = jj +2) + { + Bit2[jj/2] = ((tmp & 1 << jj) != 0); + // It's 4, because 1 unsigned __int8 in tmp is represented by 4 ints in LookUpTable. + LookUpTable[ii * 4 + jj / 2] = Bit2[jj / 2]; + + //printf("Index = %d, LookUpTable = %d\n", ii * 4 + jj / 2, LookUpTable[ii * 4 + jj / 2]); + } + **/ + } + + // Skip compressed_pixel_buffer_size: passed + fseek(fid, sizeof(int32_T), SEEK_CUR); + + // Allocate memory for XimImg + fread(XimImg, sizeof(int32_T), (XimStr->ImgWidth) + 1, fid); + + // load the compressed pixel data + int delta = 0; + int LUT_Pos = 0; + + // Be very careful with all data types!!! 
+ int8_T tmp8 = 0; + int16_T tmp16 = 0; + int32_T tmp32 = 0; + + for (int ImgTag = XimStr->ImgWidth + 1; + ImgTag < (XimStr->ImgHeight) * (XimStr->ImgWidth); + ImgTag++) + { + if (0 == LookUpTable[LUT_Pos]) + { + fread(&tmp8, sizeof(int8_T), 1, fid); + delta = int(tmp8); + } + else if (1 == LookUpTable[LUT_Pos]) + { + fread(&tmp16, sizeof(int16_T), 1, fid); + delta = int(tmp16); + } + else + { + fread(&tmp32, sizeof(int32_T), 1, fid); + delta = int(tmp32); + } + + XimImg[ImgTag] = delta + XimImg[ImgTag - 1] + + XimImg[ImgTag - XimStr->ImgWidth] + - XimImg[ImgTag - XimStr->ImgWidth - 1]; + + LUT_Pos = LUT_Pos + 1; + } + + // Skip uncompressed_pixel_buffer_size + fseek(fid, sizeof(int32_T), SEEK_CUR); + + } + else + { + // Be careful: the code block for uncompressed pixel data readout hasn't been tested yet. + // Date: 2017-09-12 + int BufferSize = 0; + fread(&BufferSize, sizeof(int), 1, fid); + + switch (XimStr->BytesPerPixel) + { + case 1: + { + uint8_t *buffer8 = new uint8_t[XimStr->ImgWidth * XimStr->ImgHeight]; + memset(buffer8, 0, sizeof(uint8_t)* XimStr->ImgWidth * XimStr->ImgHeight); + fread(buffer8, sizeof(uint8_t), BufferSize, fid); + for (int ii = 0; ii < XimStr->ImgWidth * XimStr->ImgHeight;ii++) + { + XimImg[ii] = int(buffer8[ii]); + } + break; + } + case 2: + { + uint16_t *buffer16 = new uint16_t[XimStr->ImgWidth * XimStr->ImgHeight]; + memset(buffer16, 0, sizeof(uint16_t)* XimStr->ImgWidth * XimStr->ImgHeight); + fread(buffer16, sizeof(uint16_t), BufferSize / 2, fid); + for (int ii = 0; ii < XimStr->ImgWidth * XimStr->ImgHeight; ii++) + { + XimImg[ii] = int(buffer16[ii]); + } + break; + } + default: + { + fread(XimImg, sizeof(int), BufferSize / 4, fid); + break; + } + } + } + + + // ******* Stage 2: load the gantry angle from the residual property data ****// + // Skip histogram + int tmp = 0; + fread(&tmp, sizeof(int), 1, fid); + if (tmp > 0) + { + fseek(fid, tmp* sizeof(int), SEEK_CUR); + } + + // Decode .xim properties + int nProperties = 0; + 
fread(&nProperties, sizeof(int), 1, fid); + // Property structure is not NULL + if (nProperties > 0) + { + int pName_len = 0; + // Only load the property name rather than the content + char pName[128] = { 0 }; + int pType = 0; + for (int ii = 0; ii < nProperties; ii++) + { + // load property name length + fread(&pName_len, sizeof(int), 1, fid); + // load property name + fread(pName, sizeof(char)* pName_len, 1, fid); + // load property data type + fread(&pType, sizeof(int), 1, fid); + + //printf("%s\n", pName); + + // extract the Gantry Rotation Angle + if (!strcmp(pName, "GantryRtn")) + { + fread(&(XimStr->GantryRtn), sizeof(double), 1, fid); +// continue; + } + else if(!strcmp(pName, "KVNormChamber")) + { + //printf("KVNormChamber"); + fread(&(XimStr->KVNormChamber), sizeof(int), 1, fid); + break; + } + else + { + switch (pType) + { + case 0: + { + fseek(fid, sizeof(int), SEEK_CUR); + break; + } + case 1: + { + fseek(fid, sizeof(double), SEEK_CUR); + break; + } + case 2: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(char) * skiplen, SEEK_CUR); + break; + } + case 4: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(double) * skiplen /8, SEEK_CUR); + break; + } + case 5: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(int) * skiplen /4, SEEK_CUR); + break; + } + break; + } + } + // reset all the temporary variables + pName_len = 0; + memset(pName, 0, 128*sizeof(char)); + pType = 0; + } + + } + + // ********* END of XIM Reading: Close the File Pointer******* // + if (fclose(fid)) + { + printf("The file `crt_fopen.c' was not closed\n"); + getchar(); + exit(1); + } + + return 1; +} diff --git a/Python/setup.py b/Python/setup.py index 40bc3b3f..eee483e0 100644 --- a/Python/setup.py +++ b/Python/setup.py @@ -20,7 +20,7 @@ if "--no_pinned_memory" in sys.argv[2:] : no_pinned=True sys.argv.pop(sys.argv.index("--no_pinned_memory")) - + if no_pinned: 
define_macros.append(("NO_PINNED_MEMORY",None)) @@ -48,10 +48,10 @@ ] COMPUTE_CAPABILITY_ARGS = [ - "-gencode=arch=compute_70,code=compute_70", # allows forward compiling - "--ptxas-options=-v", + #"-gencode=arch=compute_70,code=compute_70", # allows forward compiling + #"--ptxas-options=-v", "-c", - "--default-stream=per-thread", + #"--default-stream=per-thread", ] @@ -65,13 +65,14 @@ def get_cuda_version(cuda_home): return version_str.split(" ")[2][:4] else: version_str = subprocess.check_output( - [os.path.join(cuda_home, "bin", "nvcc"), "--version"] + [os.path.join(cuda_home, "bin", "hipcc"), "--version"] ) version_str = str(version_str).replace("\n", "").replace("\r", "") idx = version_str.find("release") return version_str[idx + len("release ") : idx + len("release ") + 4] except: - raise RuntimeError("Cannot read cuda version file") + pass + #raise RuntimeError("Cannot read cuda version file") def locate_cuda(): @@ -81,16 +82,16 @@ def locate_cuda(): and values giving the absolute path to each directory. Starts by looking for the CUDA_HOME or CUDA_PATH env variable. If not found, everything - is based on finding 'nvcc' in the PATH. + is based on finding 'hipcc' in the PATH. 
""" # Guess #1 - cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or os.environ.get("HIP_PATH") if cuda_home is None: # Guess #2 try: which = "where" if IS_WINDOWS else "which" - nvcc = subprocess.check_output([which, "nvcc"]).decode().rstrip("\r\n") - cuda_home = os.path.dirname(os.path.dirname(nvcc)) + hipcc = subprocess.check_output([which, "hipcc"]).decode().rstrip("\r\n") + cuda_home = os.path.dirname(os.path.dirname(hipcc)) except subprocess.CalledProcessError: # Guess #3 if IS_WINDOWS: @@ -124,24 +125,24 @@ def _is_cuda_file(path): CUDA, CUDA_VERSION = locate_cuda() -cuda_version = 11.0 -try: - cuda_version = float(CUDA_VERSION) -except ValueError: - cuda_list = re.findall('\d+', CUDA_VERSION) - cuda_version = float( str(cuda_list[0] + '.' + cuda_list[1])) - -# Insert CUDA arguments depedning on the version -for item in CC_COMPATIBILITY_TABLE: - support_begin = item[2] - support_end = item[3] - if cuda_version < support_begin: - continue - if cuda_version >= support_end: - continue - str_arg = f"-gencode=arch=compute_{item[0]},code=sm_{item[1]}" - COMPUTE_CAPABILITY_ARGS.insert(0, str_arg) - +#cuda_version = 11.0 +#try: + #cuda_version = float(CUDA_VERSION) +#except ValueError: + #cuda_list = re.findall("\d+", CUDA_VERSION) + #cuda_version = float(str(cuda_list[0] + "." + cuda_list[1])) +# +## Insert CUDA arguments depedning on the version +#for item in CC_COMPATIBILITY_TABLE: + #support_begin = item[2] + #support_end = item[3] + #if cuda_version < support_begin: + ##continue + #if cuda_version >= support_end: + #continue + #str_arg = f"-gencode=arch=compute_{item[0]},code=sm_{item[1]}" + #COMPUTE_CAPABILITY_ARGS.insert(0, str_arg) +# # Obtain the numpy include directory. This logic works across numpy versions. 
try: NUMPY_INCLUDE = numpy.get_include() @@ -153,10 +154,10 @@ def _is_cuda_file(path): COMMON_NVCC_FLAGS = [ - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - "--expt-relaxed-constexpr", + #"-D__CUDA_NO_HALF_OPERATORS__", + #"-D__CUDA_NO_HALF_CONVERSIONS__", + #"-D__CUDA_NO_HALF2_OPERATORS__", + #"--expt-relaxed-constexpr", ] @@ -211,18 +212,20 @@ def build_extensions(self): def unix_wrap_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): # Copy before we make any modifications. cflags = copy.deepcopy(extra_postargs) + cflags.append("-D__HIP_PLATFORM_AMD__") try: original_compiler = self.compiler.compiler_so if _is_cuda_file(src): - nvcc = _join_cuda_home("bin", "nvcc") - if not isinstance(nvcc, list): - nvcc = [nvcc] - self.compiler.set_executable("compiler_so", nvcc) + hipcc = _join_cuda_home("bin", "hipcc") + if not isinstance(hipcc, list): + hipcc = [hipcc] + self.compiler.set_executable("compiler_so", hipcc) + self.compiler.set_executable("compiler", hipcc) if isinstance(cflags, dict): - cflags = cflags["nvcc"] + cflags = cflags["hipcc"] cflags = ( COMMON_NVCC_FLAGS - + ["--compiler-options", "'-fPIC'"] + + ["-fPIC"] + cflags + COMPUTE_CAPABILITY_ARGS ) @@ -237,6 +240,7 @@ def unix_wrap_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): finally: # Put the original compiler back in place. 
self.compiler.set_executable("compiler_so", original_compiler) + self.compiler.set_executable("compiler", original_compiler) def win_wrap_compile( sources, @@ -269,9 +273,9 @@ def spawn(cmd, cflags): src = src_list[0] obj = obj_list[0] if _is_cuda_file(src): - nvcc = _join_cuda_home("bin", "nvcc") + hipcc = _join_cuda_home("bin", "hipcc") if isinstance(cflags, dict): - cflags = cflags["nvcc"] + cflags = cflags["hipcc"] elif not isinstance(cflags, list): cflags = [] @@ -287,7 +291,7 @@ def spawn(cmd, cflags): elif len(macro) == 1: cflags += ["--undefine-macro", macro[0]] - cmd = [nvcc, "-c", src, "-o", obj] + include_list + cflags + cmd = [hipcc, "-c", src, "-o", obj] + include_list + cflags elif isinstance(cflags, dict): cflags = COMMON_MSVC_FLAGS # + self.cflags['cxx'] cmd += cflags @@ -372,7 +376,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -395,7 +399,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -416,7 +420,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -437,7 +441,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", 
runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -458,7 +462,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -475,7 +479,7 @@ def include_headers(filename_list, sdist=False): sdist=sys.argv[1] == "sdist", ), library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -496,10 +500,10 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64", "hiprand"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, - include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], + include_dirs=[NUMPY_INCLUDE, CUDA["include"],"../Common/CUDA/"], )