From 650a939e66a7c838ddb3bb859ec5864f42c5c96d Mon Sep 17 00:00:00 2001 From: purepani Date: Mon, 10 Mar 2025 19:13:52 -0500 Subject: [PATCH 1/3] Generates HIP from CUDA --- Common/CUDA/GD_AwTV.cu | 151 +- Common/CUDA/GD_AwTV.cu.prehip | 713 ++++++++++ Common/CUDA/GD_AwTV.hpp.prehip | 62 + Common/CUDA/GD_TV.cu | 155 +-- Common/CUDA/GD_TV.cu.prehip | 702 ++++++++++ Common/CUDA/GD_TV.hpp.prehip | 61 + Common/CUDA/GpuIds.cpp | 10 +- Common/CUDA/GpuIds.cpp.prehip | 70 + Common/CUDA/GpuIds.hpp.prehip | 17 + Common/CUDA/PICCS.cu | 77 +- Common/CUDA/PICCS.cu.prehip | 398 ++++++ Common/CUDA/PICCS.hpp.prehip | 61 + Common/CUDA/RandomNumberGenerator.cu | 89 +- Common/CUDA/RandomNumberGenerator.cu.prehip | 193 +++ Common/CUDA/RandomNumberGenerator.hpp.prehip | 49 + Common/CUDA/Siddon_projection.cu | 157 +-- Common/CUDA/Siddon_projection.cu.prehip | 859 ++++++++++++ Common/CUDA/Siddon_projection.hpp.prehip | 66 + Common/CUDA/Siddon_projection_parallel.cu | 99 +- .../CUDA/Siddon_projection_parallel.cu.prehip | 540 ++++++++ .../Siddon_projection_parallel.hpp.prehip | 65 + Common/CUDA/TIGRE_common.cpp.prehip | 20 + Common/CUDA/TIGRE_common.hpp.prehip | 24 + Common/CUDA/errors.hpp | 2 +- Common/CUDA/errors.hpp.prehip | 10 + Common/CUDA/gpuUtils.cu | 18 +- Common/CUDA/gpuUtils.cu.prehip | 70 + Common/CUDA/gpuUtils.hpp.prehip | 18 + Common/CUDA/improvedForwardProjections.cu | 127 +- .../CUDA/improvedForwardProjections.cu.prehip | 1032 ++++++++++++++ Common/CUDA/improvedForwardProjections.hpp | 5 +- .../improvedForwardProjections.hpp.prehip | 263 ++++ .../CUDA/improvedForwardProjections_cone.cu | 131 +- .../improvedForwardProjections_cone.cu.prehip | 1230 +++++++++++++++++ Common/CUDA/projection.cpp.prehip | 35 + Common/CUDA/projection.hpp.prehip | 9 + Common/CUDA/ray_interpolated_projection.cu | 165 +-- .../ray_interpolated_projection.cu.prehip | 843 +++++++++++ .../ray_interpolated_projection.hpp.prehip | 66 + .../ray_interpolated_projection_parallel.cu | 105 +- 
...interpolated_projection_parallel.cu.prehip | 449 ++++++ ...nterpolated_projection_parallel.hpp.prehip | 65 + Common/CUDA/tv_proximal.cu | 241 ++-- Common/CUDA/tv_proximal.cu.prehip | 693 ++++++++++ Common/CUDA/tv_proximal.hpp.prehip | 57 + Common/CUDA/types_TIGRE.hpp.prehip | 109 ++ Common/CUDA/voxel_backprojection.cu | 149 +- Common/CUDA/voxel_backprojection.cu.prehip | 920 ++++++++++++ Common/CUDA/voxel_backprojection.hpp.prehip | 59 + Common/CUDA/voxel_backprojection2.cu | 149 +- Common/CUDA/voxel_backprojection2.cu.prehip | 844 +++++++++++ Common/CUDA/voxel_backprojection2.hpp.prehip | 64 + Common/CUDA/voxel_backprojection_parallel.cu | 117 +- .../voxel_backprojection_parallel.cu.prehip | 627 +++++++++ .../voxel_backprojection_parallel.hpp.prehip | 57 + .../Utilities/GPU/getGpuCount_mex.cpp.prehip | 21 + .../Utilities/GPU/getGpuName_mex.cpp.prehip | 29 + .../IO/VarianCBCT/XimPara.hpp.prehip | 28 + .../IO/VarianCBCT/mexReadXim.cpp.prehip | 357 +++++ .../cuda_interface/AddNoise.cpp.prehip | 126 ++ .../cuda_interface/Atb_mex.cpp.prehip | 367 +++++ .../cuda_interface/AwminTV.cpp.prehip | 137 ++ .../cuda_interface/Ax_mex.cpp.prehip | 338 +++++ .../cuda_interface/minPICCS.cpp.prehip | 147 ++ .../Utilities/cuda_interface/minTV.cpp.prehip | 132 ++ .../pCTCubicSpline_mex.cpp.prehip | 124 ++ .../cuda_interface/tvDenoise.cpp.prehip | 147 ++ 67 files changed, 14354 insertions(+), 966 deletions(-) create mode 100644 Common/CUDA/GD_AwTV.cu.prehip create mode 100644 Common/CUDA/GD_AwTV.hpp.prehip create mode 100644 Common/CUDA/GD_TV.cu.prehip create mode 100644 Common/CUDA/GD_TV.hpp.prehip create mode 100644 Common/CUDA/GpuIds.cpp.prehip create mode 100644 Common/CUDA/GpuIds.hpp.prehip create mode 100644 Common/CUDA/PICCS.cu.prehip create mode 100644 Common/CUDA/PICCS.hpp.prehip create mode 100644 Common/CUDA/RandomNumberGenerator.cu.prehip create mode 100644 Common/CUDA/RandomNumberGenerator.hpp.prehip create mode 100644 Common/CUDA/Siddon_projection.cu.prehip create mode 
100644 Common/CUDA/Siddon_projection.hpp.prehip create mode 100644 Common/CUDA/Siddon_projection_parallel.cu.prehip create mode 100644 Common/CUDA/Siddon_projection_parallel.hpp.prehip create mode 100644 Common/CUDA/TIGRE_common.cpp.prehip create mode 100644 Common/CUDA/TIGRE_common.hpp.prehip create mode 100644 Common/CUDA/errors.hpp.prehip create mode 100644 Common/CUDA/gpuUtils.cu.prehip create mode 100644 Common/CUDA/gpuUtils.hpp.prehip create mode 100644 Common/CUDA/improvedForwardProjections.cu.prehip create mode 100644 Common/CUDA/improvedForwardProjections.hpp.prehip create mode 100644 Common/CUDA/improvedForwardProjections_cone.cu.prehip create mode 100644 Common/CUDA/projection.cpp.prehip create mode 100644 Common/CUDA/projection.hpp.prehip create mode 100644 Common/CUDA/ray_interpolated_projection.cu.prehip create mode 100644 Common/CUDA/ray_interpolated_projection.hpp.prehip create mode 100644 Common/CUDA/ray_interpolated_projection_parallel.cu.prehip create mode 100644 Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip create mode 100644 Common/CUDA/tv_proximal.cu.prehip create mode 100644 Common/CUDA/tv_proximal.hpp.prehip create mode 100644 Common/CUDA/types_TIGRE.hpp.prehip create mode 100644 Common/CUDA/voxel_backprojection.cu.prehip create mode 100644 Common/CUDA/voxel_backprojection.hpp.prehip create mode 100644 Common/CUDA/voxel_backprojection2.cu.prehip create mode 100644 Common/CUDA/voxel_backprojection2.hpp.prehip create mode 100644 Common/CUDA/voxel_backprojection_parallel.cu.prehip create mode 100644 Common/CUDA/voxel_backprojection_parallel.hpp.prehip create mode 100644 MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip create mode 100644 MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip create mode 100644 MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip create mode 100644 MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip create mode 100644 
MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/AwminTV.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/minPICCS.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/minTV.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip diff --git a/Common/CUDA/GD_AwTV.cu b/Common/CUDA/GD_AwTV.cu index d98c13c1..03956111 100644 --- a/Common/CUDA/GD_AwTV.cu +++ b/Common/CUDA/GD_AwTV.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for Steepest descend in POCS-type algorithms. @@ -61,11 +62,11 @@ #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("CBCT:CUDA:GD_TV",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("CBCT:CUDA:GD_TV",hipGetErrorString(__err));\ } \ } while (0) @@ -378,16 +379,16 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // allocate memory in each GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); - cudaMemset( d_image[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); - cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); - cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); - cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); - cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); + hipMalloc((void**)&d_image[dev] , mem_img_each_GPU); + 
hipMemset( d_image[dev],0 , mem_img_each_GPU); + hipMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); + hipMemset( d_dimgTV[dev],0 , mem_img_each_GPU); + hipMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); + hipMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); + hipMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); + hipMemset( d_norm2aux[dev],0 , mem_auxiliary); cudaCheckErrors("Malloc error"); @@ -397,7 +398,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma if(splits>1){ mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + hipHostMalloc((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); } @@ -406,12 +407,12 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -420,12 +421,12 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // Create streams int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } cudaCheckErrors("Stream creation fail"); @@ -437,7 +438,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma double totalsum; float sum_curr_spl; float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + hipHostMalloc((void**)&sumnorm2,deviceCount*sizeof(float)); unsigned int curr_slices; unsigned long long curr_pixels; @@ -476,28 +477,28 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma if(i==0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - 
cudaMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // if we need to split and its not the first iteration, then we need to copy from Host memory the previosu result. if (splits>1 & i>0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } cudaCheckErrors("Memcpy failure on multi split"); @@ -509,7 +510,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (curr_slices+buffer_length*2+blockGrad.z-1)/blockGrad.z); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - 
cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } else { - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Reduction error"); @@ -586,7 +587,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Scalar operations error"); //SUBSTRACT GRADIENT ////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, 
buffer_pixels*sizeof(float), hipMemcpyDeviceToHost); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), hipMemcpyHostToDevice); } } }else{ // We need to take it out :( for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); + hipHostUnregister(img); + hipHostUnregister(dst); } for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; cudaCheckErrors("Memory free"); -// cudaDeviceReset(); +// hipDeviceReset(); } void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ @@ -697,8 +698,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ size_t memtotal; const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) + return; + + + float df[3] ={0.f,0.f,0.f}; + float dfi[3]={0.f,0.f,0.f}; // dfi== \partial f_{i+1,j,k} + float dfj[3]={0.f,0.f,0.f}; + float dfk[3]={0.f,0.f,0.f}; + gradient(f,df ,z ,y ,x , depth,rows,cols); + gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); + gradient(f,dfj ,z ,y+1,x , depth,rows,cols); + gradient(f,dfk ,z+1,y ,x , depth,rows,cols); + float eps=0.00000001; //% avoid division by zero + + float wx=__expf(-(df[0]/delta)*(df[0]/delta)); + float wy=__expf(-(df[1]/delta)*(df[1]/delta)); + float wz=__expf(-(df[2]/delta)*(df[2]/delta)); + + float wxi=__expf(-(dfi[0]/delta)*(dfi[0]/delta)); + float wyi=__expf(-(dfi[1]/delta)*(dfi[1]/delta)); + float wzi=__expf(-(dfi[2]/delta)*(dfi[2]/delta)); + + float wxj=__expf(-(dfj[0]/delta)*(dfj[0]/delta)); + float wyj=__expf(-(dfj[1]/delta)*(dfj[1]/delta)); + float wzj=__expf(-(dfj[2]/delta)*(dfj[2]/delta)); + + float wxk=__expf(-(dfk[0]/delta)*(dfk[0]/delta)); + float wyk=__expf(-(dfk[1]/delta)*(dfk[1]/delta)); + float 
wzk=__expf(-(dfk[2]/delta)*(dfk[2]/delta)); + + + // this hsould do the trick I think + + dftv[idx]=(wx*df[0]+wy*df[1]+wz*df[2])/(sqrt(wx*df[0] *df[0] +wy*df[1] *df[1] +wz*df[2] *df[2])+eps) + -wzi*dfi[2]/(sqrt(wxi*dfi[0]*dfi[0]+wyi*dfi[1]*dfi[1]+wzi*dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. + -wyj*dfj[1]/(sqrt(wxj*dfj[0]*dfj[0]+wyj*dfj[1]*dfj[1]+wzj*dfj[2]*dfj[2]) +eps) + -wxk*dfk[0]/(sqrt(wxk*dfk[0]*dfk[0]+wyk*dfk[1]*dfk[1]+wzk*dfk[2]*dfk[2]) +eps); + + + return; + + } + + __device__ void warpReduce(volatile float *sdata, size_t tid) { + sdata[tid] += sdata[tid + 32]; + sdata[tid] += sdata[tid + 16]; + sdata[tid] += sdata[tid + 8]; + sdata[tid] += sdata[tid + 4]; + sdata[tid] += sdata[tid + 2]; + sdata[tid] += sdata[tid + 1]; + } + + __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ + extern __shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + float value=0; + while (i < n) { + value=g_idata[i]; //avoid reading twice + mySum += value*value; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + __syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + + __global__ void reduceSum(float *g_idata, float *g_odata, size_t n){ + extern 
__shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + // float value=0; + while (i < n) { + mySum += g_idata[i]; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + __syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + + + + +// main function +void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int maxIter,const float delta, const GpuIds& gpuids){ + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPUselect","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. 
If the code errors you might need to change the way GPU selection is performed."); + } + int dev; + + // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. + // check free memory + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + + + + // %5 of free memory should be enough, we have almost no variables in these kernels + size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; + size_t mem_slice_image = sizeof(float)* image_size[0] * image_size[1] ; + size_t mem_size_image = sizeof(float)* total_pixels; + size_t mem_auxiliary = sizeof(float)* (total_pixels + MAXTHREADS - 1) / MAXTHREADS; + + // Decide how are we handling the distribution of computation + size_t mem_img_each_GPU; + + unsigned int buffer_length=2; + //Does everything fit in the GPU? + unsigned int slices_per_split; + + // if it is a thin problem (no need to split), just use one GPU + if (image_size[2]<4){deviceCount=1;} + + unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. + if(mem_GPU_global> 3*mem_size_image+3*(deviceCount-1)*mem_slice_image*buffer_length+mem_auxiliary) { + // We only need to split if we have extra GPUs + slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; + mem_img_each_GPU=mem_slice_image*((slices_per_split+buffer_length*2)); + }else{ + // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all + size_t mem_free=mem_GPU_global-mem_auxiliary; + + splits=(unsigned int)(ceil(((float)(3*mem_size_image)/(float)(deviceCount))/mem_free)); + // Now, there is an overhead here, as each splits should have 2 slices more, to account for overlap of images. + // lets make sure these 2 slices fit, if they do not, add 1 to splits. 
+ slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + + // if the new stuff does not fit in the GPU, it means we are in the edge case where adding that extra slice will overflow memory + if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ + // one more split should do the job, as its an edge case. + splits++; + //recompute for later + slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + } + + + // How many EXTRA buffer slices should be able to fit in here??!?! + // Only do it if there are splits needed. + if(splits>1){ + mem_free=mem_GPU_global-(3*mem_img_each_GPU+mem_auxiliary); + unsigned int extra_buff=(mem_free/mem_slice_image); + buffer_length=(extra_buff/2)/3; // we need double whatever this results in, rounded down. + buffer_length=max(buffer_length,2);// minimum 2 + buffer_length=min(MAX_BUFFER,buffer_length); + + mem_img_each_GPU=mem_slice_image*(slices_per_split+buffer_length*2); + + }else{ + buffer_length=2; + } + + // Assert + if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ + mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPU","Assertion Failed. Logic behind splitting flawed! Please tell: ander.biguri@gmail.com\n"); + } + } + + + // Assert + + if ((slices_per_split+buffer_length*2)*image_size[0]*image_size[1]* sizeof(float)!= mem_img_each_GPU){ + mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPU","Assertion Failed. Memory needed calculation broken! 
Please tell: ander.biguri@gmail.com\n"); + } + + + + + + + float** d_image= (float**)malloc(deviceCount*sizeof(float*)); + float** d_dimgTV= (float**)malloc(deviceCount*sizeof(float*)); + float** d_norm2aux= (float**)malloc(deviceCount*sizeof(float*)); + float** d_norm2= (float**)malloc(deviceCount*sizeof(float*)); + + // allocate memory in each GPU + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + + cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); + cudaMemset( d_image[dev],0 , mem_img_each_GPU); + cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); + cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); + cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); + cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); + cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); + cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); + cudaCheckErrors("Malloc error"); + + + } + unsigned long long buffer_pixels=buffer_length*image_size[0]*image_size[1]; + float* buffer; + if(splits>1){ + mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); + }else{ + cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + } + + + + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // splits>2 is completely empirical observation + if (isHostRegisterSupported & splits>2){ + cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); + + + + // Create streams + int nStream_device=2; + int nStreams=deviceCount*nStream_device; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < nStream_device; ++i){ + cudaStreamCreate(&stream[i+dev*nStream_device]); + } + } + cudaCheckErrors("Stream creation fail"); + + + // For the reduction + + double totalsum_prev; + double totalsum; + float sum_curr_spl; + float * sumnorm2; + cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + + unsigned int curr_slices; + unsigned long long curr_pixels; + size_t linear_idx_start; + unsigned long long* offset_device=(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + unsigned long long* offset_host =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + unsigned long long* bytes_device =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + bool is_first_chunk; + bool is_last_chunk; + for(unsigned int i=0;i1){ + totalsum_prev=0; + } + for(unsigned int sp=0;sp1 & i>0){ + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + + + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + } + 
cudaCheckErrors("Memcpy failure on multi split"); + + for(unsigned int ib=0; (ib<(buffer_length-1)) && ((i+ib)>>(d_image[dev],d_dimgTV[dev],(long)(curr_slices+buffer_length*2-1), image_size[1],image_size[0],delta); + + } + + + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); + + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { + reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); + cudaStreamSynchronize(stream[dev*nStream_device]); + cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + } + else { + cudaStreamSynchronize(stream[dev*nStream_device]); + cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + } + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Reduction error"); + + + // Accumulate the norm accross devices + sum_curr_spl=0; + // this is CPU code + for (dev = 0; dev < deviceCount; dev++){ + sum_curr_spl+=sumnorm2[dev]; + } + sum_curr_spl+=0.0000001f; // avoid division by zero + + // If we have more than one splits, lets use the result from prior calls + if(i>0 && splits>1){ + // this is already stored: + //totalsum=totalsum_prev; + }else{ + totalsum=sum_curr_spl; + } + + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,(float)sqrt(totalsum),total_pixels); + //MULTIPLY HYPERPARAMETER + multiplyArrayScalar<<<60,MAXTHREADS,0,stream[dev*nStream_device]>>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); + } + for (dev = 0; dev < deviceCount; dev++){ + 
cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Scalar operations error"); + + //SUBSTRACT GRADIENT + ////////////////////////////////////////////// + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_image[dev]+buffer_pixels,d_dimgTV[dev]+buffer_pixels, total_pixels); + } + } + + // Synchronize mathematics, make sure bounding pixels are correct + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + + if(splits==1){ + for(dev=0; dev0){ + cudaSetDevice(gpuids[dev-1]); + cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); + cudaSetDevice(gpuids[dev]); + cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); + } + } + }else{ + + // We need to take it out :( + for(dev=0; dev2){ + cudaHostUnregister(img); + cudaHostUnregister(dst); + } + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]) ; + cudaCheckErrors("Memory free"); +// cudaDeviceReset(); + } + +void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ + size_t memfree; + size_t memtotal; + const int deviceCount = gpuids.GetLength(); + for (int dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemGetInfo(&memfree,&memtotal); + if(dev==0) *mem_GPU_global=memfree; + if(memfree1){ mexWarnMsgIdAndTxt("minimizeTV:GD_TV:Image_split","Your image can not be fully split between the available GPUs. 
The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + hipHostMalloc((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); } @@ -390,12 +391,12 @@ do { \ // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -404,12 +405,12 @@ do { \ // Create streams int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } cudaCheckErrors("Stream creation fail"); @@ -421,7 +422,7 @@ do { \ double totalsum; float sum_curr_spl; float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + 
hipHostMalloc((void**)&sumnorm2,deviceCount*sizeof(float)); unsigned int curr_slices; unsigned long long curr_pixels; @@ -460,28 +461,28 @@ do { \ if(i==0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // if we need to split and its not the first iteration, then we need to copy from Host memory the previosu result. if (splits>1 & i>0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } cudaCheckErrors("Memcpy failure on multi split"); @@ -493,7 +494,7 @@ do { \ dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (curr_slices+buffer_length*2+blockGrad.z-1)/blockGrad.z); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); 
curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } else { - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Reduction error"); @@ -570,7 +571,7 @@ do { \ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Scalar operations error"); //SUBSTRACT GRADIENT ////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), 
cudaMemcpyHostToDevice); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), hipMemcpyHostToDevice); } } }else{ // We need to take it out :( for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); + hipHostUnregister(img); + hipHostUnregister(dst); } for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Memory free"); - cudaDeviceReset(); + hipDeviceReset(); } void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ @@ -686,8 +687,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ size_t memtotal; int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) + return; + + + float df[3] ={0.f,0.f,0.f}; + float dfi[3]={0.f,0.f,0.f}; // dfi== \partial f_{i+1,j,k} + float dfj[3]={0.f,0.f,0.f}; + float dfk[3]={0.f,0.f,0.f}; + gradient(f,df ,z ,y ,x , depth,rows,cols); + gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); + gradient(f,dfj ,z ,y+1,x , depth,rows,cols); + gradient(f,dfk ,z+1,y ,x , depth,rows,cols); + float eps=0.00000001; //% avoid division by zero + + dftv[idx]=(df[0]+df[1]+df[2])/(sqrt(df[0] *df[0] +df[1] *df[1] +df[2] *df[2])+eps) + -dfi[2]/(sqrt(dfi[0]*dfi[0]+dfi[1]*dfi[1]+dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. 
+ -dfj[1]/(sqrt(dfj[0]*dfj[0]+dfj[1]*dfj[1]+dfj[2]*dfj[2]) +eps) + -dfk[0]/(sqrt(dfk[0]*dfk[0]+dfk[1]*dfk[1]+dfk[2]*dfk[2]) +eps); + return; + + } + + __device__ void warpReduce(volatile float *sdata, size_t tid) { + sdata[tid] += sdata[tid + 32]; + sdata[tid] += sdata[tid + 16]; + sdata[tid] += sdata[tid + 8]; + sdata[tid] += sdata[tid + 4]; + sdata[tid] += sdata[tid + 2]; + sdata[tid] += sdata[tid + 1]; + } + + __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ + extern __shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + float value=0; + while (i < n) { + value=g_idata[i]; //avoid reading twice + mySum += value*value; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + __syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + + __global__ void reduceSum(float *g_idata, float *g_odata, size_t n){ + extern __shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + // float value=0; + while (i < n) { + mySum += g_idata[i]; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + 
__syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + + + + +// main function + void pocs_tv(float* img,float* dst,float alpha,const long* image_size, int maxIter, const GpuIds& gpuids){ + + + + + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("GD_TV:GPU","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("minimizeTV:GD_TV:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + + int dev; + + // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. 
+ + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + + + + // %5 of free memory should be enough, we have almost no variables in these kernels + size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; + size_t mem_slice_image = sizeof(float)* image_size[0] * image_size[1] ; + size_t mem_size_image = sizeof(float)* total_pixels; + size_t mem_auxiliary = sizeof(float)* (total_pixels + MAXTHREADS - 1) / MAXTHREADS; + + // Decide how are we handling the distribution of computation + size_t mem_img_each_GPU; + + unsigned int buffer_length=2; + //Does everything fit in the GPU? + unsigned int slices_per_split; + + // if it is a thin problem (no need to split), just use one GPU + if (image_size[2]<4){deviceCount=1;} + + unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. + if(mem_GPU_global> 3*mem_size_image+3*(deviceCount-1)*mem_slice_image*buffer_length+mem_auxiliary){ + // We only need to split if we have extra GPUs + slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; + mem_img_each_GPU=mem_slice_image*((slices_per_split+buffer_length*2)); + }else{ + // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all + size_t mem_free=mem_GPU_global-mem_auxiliary; + + splits=(unsigned int)(ceil(((float)(3*mem_size_image)/(float)(deviceCount))/mem_free)); + // Now, there is an overhead here, as each splits should have 2 slices more, to accoutn for overlap of images. + // lets make sure these 2 slices fit, if they do not, add 1 to splits. 
+ slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + + // if the new stuff does not fit in the GPU, it measn we are in the edge case where adding that extra slice will overflow memory + if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ + // one more split should do the job, as its an edge case. + splits++; + //recompute for later + slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + } + + + // How many EXTRA buffer slices should be able to fit in here??!?! + // Only do it if there are splits needed. + if(splits>1){ + mem_free=mem_GPU_global-(3*mem_img_each_GPU+mem_auxiliary); + unsigned int extra_buff=(mem_free/mem_slice_image); + buffer_length=(extra_buff/2)/3; // we need double whatever this results in, rounded down. + buffer_length=max(buffer_length,2);// minimum 2 + buffer_length=min(MAX_BUFFER,buffer_length); + + mem_img_each_GPU=mem_slice_image*(slices_per_split+buffer_length*2); + + }else{ + buffer_length=2; + } + + // Assert + if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ + mexErrMsgIdAndTxt("GD_TV:GPU","Assertion Failed. Logic behind splitting flawed! Please tell: ander.biguri@gmail.com\n"); + } + } + + + // Assert + + if ((slices_per_split+buffer_length*2)*image_size[0]*image_size[1]* sizeof(float)!= mem_img_each_GPU){ + mexErrMsgIdAndTxt("GD_TV:GPU","Assertion Failed. Memory needed calculation broken! 
Please tell: ander.biguri@gmail.com\n"); + } + + + + + + + float** d_image= (float**)malloc(deviceCount*sizeof(float*)); + float** d_dimgTV= (float**)malloc(deviceCount*sizeof(float*)); + float** d_norm2aux= (float**)malloc(deviceCount*sizeof(float*)); + float** d_norm2= (float**)malloc(deviceCount*sizeof(float*)); + + // allocate memory in each GPU + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + + cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); + cudaMemset( d_image[dev],0 , mem_img_each_GPU); + cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); + cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); + cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); + cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); + cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); + cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); + cudaCheckErrors("Malloc error"); + + + } + unsigned long long buffer_pixels=buffer_length*image_size[0]*image_size[1]; + float* buffer; + if(splits>1){ + mexWarnMsgIdAndTxt("minimizeTV:GD_TV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); + }else{ + cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + } + + + + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // splits>2 is completely empirical observation + if (isHostRegisterSupported & splits>2){ + cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); + + + + // Create streams + int nStream_device=2; + int nStreams=deviceCount*nStream_device; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < nStream_device; ++i){ + cudaStreamCreate(&stream[i+dev*nStream_device]); + } + } + cudaCheckErrors("Stream creation fail"); + + + // For the reduction + + double totalsum_prev; + double totalsum; + float sum_curr_spl; + float * sumnorm2; + cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + + unsigned int curr_slices; + unsigned long long curr_pixels; + size_t linear_idx_start; + unsigned long long* offset_device=(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + unsigned long long* offset_host =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + unsigned long long* bytes_device =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + bool is_first_chunk; + bool is_last_chunk; + for(unsigned int i=0;i1){ + totalsum_prev=0; + } + for(unsigned int sp=0;sp1 & i>0){ + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + + + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + } + 
cudaCheckErrors("Memcpy failure on multi split"); + + for(unsigned int ib=0; (ib<(buffer_length-1)) && ((i+ib)>>(d_image[dev],d_dimgTV[dev],(long)(curr_slices+buffer_length*2-1), image_size[1],image_size[0]); + + } + + + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); + + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { + reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); + cudaStreamSynchronize(stream[dev*nStream_device]); + cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + } + else { + cudaStreamSynchronize(stream[dev*nStream_device]); + cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + } + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Reduction error"); + + + // Accumulate the norm accross devices + sum_curr_spl=0; + // this is CPU code + for (dev = 0; dev < deviceCount; dev++){ + sum_curr_spl+=sumnorm2[dev]; + } + sum_curr_spl+=0.0000001f; // avoid division by zero + + // If we have more than one splits, lets use the result from prior calls + if(i>0 && splits>1){ + // this is already stored: + //totalsum=totalsum_prev; + }else{ + totalsum=sum_curr_spl; + } + + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,(float)sqrt(totalsum),total_pixels); + //MULTIPLY HYPERPARAMETER + multiplyArrayScalar<<<60,MAXTHREADS,0,stream[dev*nStream_device]>>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); + } + for (dev = 0; dev < deviceCount; dev++){ + 
cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Scalar operations error"); + + //SUBSTRACT GRADIENT + ////////////////////////////////////////////// + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_image[dev]+buffer_pixels,d_dimgTV[dev]+buffer_pixels, total_pixels); + } + } + + // Synchronize mathematics, make sure bounding pixels are correct + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + + if(splits==1){ + for(dev=0; dev0){ + cudaSetDevice(gpuids[dev-1]); + cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); + cudaSetDevice(gpuids[dev]); + cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); + } + } + }else{ + + // We need to take it out :( + for(dev=0; dev2){ + cudaHostUnregister(img); + cudaHostUnregister(dst); + } + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]) ; + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Memory free"); + cudaDeviceReset(); + } + +void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ + size_t memfree; + size_t memtotal; + int deviceCount = gpuids.GetLength(); + for (int dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemGetInfo(&memfree,&memtotal); + if(dev==0) *mem_GPU_global=memfree; + if(memfree #include -#include +#include GpuIds::~GpuIds() { free(m_piDeviceIds); m_piDeviceIds = nullptr; @@ -52,12 +52,12 @@ void GpuIds::SetAllGpus(int iTotalDeviceCount) { bool GpuIds::AreEqualDevices() const { int deviceCount = this->GetLength(); - const int devicenamelength = 256; // The length 256 is fixed by spec of cudaDeviceProp::name + const int devicenamelength = 256; // The length 256 is fixed by spec of hipDeviceProp_t::name char 
devicename[devicenamelength]; - cudaDeviceProp deviceProp; + hipDeviceProp_t deviceProp; for (int dev = 0; dev < deviceCount; dev++) { - // cudaSetDevice(m_piDeviceIds[dev]); - cudaGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); + // hipSetDevice(m_piDeviceIds[dev]); + hipGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); if (dev>0) { if (strcmp(devicename, deviceProp.name) != 0) { return false; diff --git a/Common/CUDA/GpuIds.cpp.prehip b/Common/CUDA/GpuIds.cpp.prehip new file mode 100644 index 00000000..e9e622cc --- /dev/null +++ b/Common/CUDA/GpuIds.cpp.prehip @@ -0,0 +1,70 @@ +#include "GpuIds.hpp" +#include +#include +#include + +GpuIds::~GpuIds() { + free(m_piDeviceIds); m_piDeviceIds = nullptr; + m_iCount = 0; +} +GpuIds::GpuIds() : m_piDeviceIds (nullptr), m_iCount(0) { + +} +void GpuIds::SetIds(int iCount, int* piDeviceIds) { + if (iCount > 0 && piDeviceIds != 0) { + if (m_piDeviceIds) { + free(m_piDeviceIds); m_piDeviceIds = nullptr; + m_iCount = 0; + } + m_piDeviceIds = (int*)malloc(iCount * sizeof(int)); + if (m_piDeviceIds) { + for (int iI = 0; iI < iCount; ++iI) { + m_piDeviceIds[iI] = piDeviceIds[iI]; + } + m_iCount = iCount; + } + } +} + +int GpuIds::GetLength() const { + return m_iCount; +} +int& GpuIds::operator[](int iIndex){ + return m_piDeviceIds[iIndex]; +} +int GpuIds::operator[](int iIndex) const { + return m_piDeviceIds[iIndex]; +} + +void GpuIds::SetAllGpus(int iTotalDeviceCount) { + // Set all GPUs for compatibility + // Makeup valid GpuIds. 
+ int* aiIds = nullptr; + if (iTotalDeviceCount == 0) { + (int*)malloc(iTotalDeviceCount*sizeof(int)); + for (int iI = 0; iI < iTotalDeviceCount; ++iI) { + aiIds[iI] = iI; + } + } + SetIds(iTotalDeviceCount, aiIds); + free(aiIds); aiIds = 0; +} + +bool GpuIds::AreEqualDevices() const { + int deviceCount = this->GetLength(); + const int devicenamelength = 256; // The length 256 is fixed by spec of cudaDeviceProp::name + char devicename[devicenamelength]; + cudaDeviceProp deviceProp; + for (int dev = 0; dev < deviceCount; dev++) { + // cudaSetDevice(m_piDeviceIds[dev]); + cudaGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); + if (dev>0) { + if (strcmp(devicename, deviceProp.name) != 0) { + return false; + } + } + memset(devicename, 0, devicenamelength); + strcpy(devicename, deviceProp.name); + } + return true; +} diff --git a/Common/CUDA/GpuIds.hpp.prehip b/Common/CUDA/GpuIds.hpp.prehip new file mode 100644 index 00000000..e0223f86 --- /dev/null +++ b/Common/CUDA/GpuIds.hpp.prehip @@ -0,0 +1,17 @@ + +#ifndef GPUIDS_H +#define GPUIDS_H +struct GpuIds { + int* m_piDeviceIds; + int m_iCount; + ~GpuIds(); + GpuIds(); + void SetIds(int iCount, int* piDeviceIds); + int GetLength() const; + void SetAllGpus(int iTotalDeviceCount); + int& operator[](int iIndex); + int operator[](int iIndex) const; + bool AreEqualDevices() const; +}; +#endif + diff --git a/Common/CUDA/PICCS.cu b/Common/CUDA/PICCS.cu index 481ede08..e447b375 100644 --- a/Common/CUDA/PICCS.cu +++ b/Common/CUDA/PICCS.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for Steepest descend in POCS-type algorithms. 
@@ -60,10 +61,10 @@ Codes : https://github.com/CERN/TIGRE #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("ERROR in: %s \n",msg);\ - mexErrMsgIdAndTxt("err",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("err",hipGetErrorString(__err));\ } \ } while (0) @@ -263,9 +264,9 @@ do { \ bool isnan_cuda(float* vec, size_t size){ bool*d_nan; bool h_nan; - cudaMalloc((void **)&d_nan, sizeof (bool)); + hipMalloc((void **)&d_nan, sizeof (bool)); isnan_device<<<60,MAXTHREADS>>>(vec,size,d_nan); - cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost); + hipMemcpy(&h_nan, d_nan, sizeof(bool), hipMemcpyDeviceToHost); return h_nan; } @@ -281,24 +282,24 @@ bool isnan_cuda(float* vec, size_t size){ float *d_image,*d_prior,*d_dpiccsTV, *d_dimgTV,*d_aux_small,*d_aux_image, *d_norm2; // memory for image - cudaMalloc(&d_image, mem_size); - cudaMalloc(&d_prior, mem_size); + hipMalloc(&d_image, mem_size); + hipMalloc(&d_prior, mem_size); cudaCheckErrors("Malloc Image error"); - cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice); - cudaMemcpy(d_prior, prior, mem_size, cudaMemcpyHostToDevice); + hipMemcpy(d_image, img, mem_size, hipMemcpyHostToDevice); + hipMemcpy(d_prior, prior, mem_size, hipMemcpyHostToDevice); cudaCheckErrors("Memory Malloc and Memset: SRC"); // memory for df - cudaMalloc(&d_dimgTV, mem_size); - cudaMalloc(&d_dpiccsTV, mem_size); + hipMalloc(&d_dimgTV, mem_size); + hipMalloc(&d_dpiccsTV, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); - cudaMalloc(&d_norm2, mem_size); + hipMalloc(&d_norm2, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); - cudaMalloc(&d_aux_image, mem_size); + hipMalloc(&d_aux_image, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); // memory for L2norm auxiliar - cudaMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); + 
hipMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); cudaCheckErrors("Memory Malloc and Memset: NORMAux"); @@ -315,64 +316,64 @@ bool isnan_cuda(float* vec, size_t size){ for(unsigned int i=0;i>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Gradient"); // mexPrintf("Gradient is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,(1-ratio), total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); substractArrays<<<60,MAXTHREADS>>>(d_aux_image,d_prior, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Substraction error"); gradientTV<<>>(d_aux_image,d_dpiccsTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Gradient"); // mexPrintf("Gradient piccs is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dpiccsTV,ratio, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); // mexPrintf("Multiplication is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); addArrays<<<60,MAXTHREADS>>>(d_dimgTV,d_dpiccsTV,total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); //NOMRALIZE via reduction //mexPrintf("Pre-norm2 is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? 
"true" : "false"); - cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice); + hipMemcpy(d_norm2, d_dimgTV, mem_size, hipMemcpyDeviceToDevice); cudaCheckErrors("Copy from gradient call error"); reduceNorm2 << > >(d_norm2, d_aux_small, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("reduce1"); if (dimgridRed > 1) { reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_aux_small, d_norm2, dimgridRed); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("reduce2"); - cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); + hipMemcpy(&sumnorm2, d_norm2, sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy"); } else { - cudaMemcpy(&sumnorm2, d_aux_small, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); + hipMemcpy(&sumnorm2, d_aux_small, sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy"); } // mexPrintf("alpha/sqrt(sumnorm2): %f\n",alpha/sqrt(sumnorm2)); //MULTIPLY HYPERPARAMETER sqrt(sumnorm2) multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha/sqrt(sumnorm2), total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); //SUBSTRACT GRADIENT substractArrays <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Substraction error"); // mexPrintf("Final update is nan: %s\n",isnan_cuda(d_image,total_pixels) ? 
"true" : "false"); // mexPrintf("\n"); @@ -381,18 +382,18 @@ bool isnan_cuda(float* vec, size_t size){ cudaCheckErrors("TV minimization"); - cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost); + hipMemcpy(dst, d_image, mem_size, hipMemcpyDeviceToHost); cudaCheckErrors("Copy result back"); - cudaFree(d_image); - cudaFree(d_dpiccsTV); - cudaFree(d_aux_image); - cudaFree(d_aux_small); - cudaFree(d_prior); - cudaFree(d_norm2); + hipFree(d_image); + hipFree(d_dpiccsTV); + hipFree(d_aux_image); + hipFree(d_aux_small); + hipFree(d_prior); + hipFree(d_norm2); cudaCheckErrors("Memory free"); - cudaDeviceReset(); + hipDeviceReset(); } diff --git a/Common/CUDA/PICCS.cu.prehip b/Common/CUDA/PICCS.cu.prehip new file mode 100644 index 00000000..481ede08 --- /dev/null +++ b/Common/CUDA/PICCS.cu.prehip @@ -0,0 +1,398 @@ +/*------------------------------------------------------------------------- + * + * CUDA functions for Steepest descend in POCS-type algorithms. + * + * This file will iteratively minimize by stepest descend the total variation + * of the input image, with the parameters given, using GPUs. + * + * CODE by Ander Biguri + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + + + + +#define MAXTHREADS 1024 + +#include "PICCS.hpp" + + + + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("ERROR in: %s \n",msg);\ + mexErrMsgIdAndTxt("err",cudaGetErrorString(__err));\ + } \ +} while (0) + +// CUDA kernels +//https://stackoverflow.com/questions/21332040/simple-cuda-kernel-optimization/21340927#21340927 + __global__ void divideArrayScalar(float* vec,float scalar,const size_t n) + { + unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x; + for(; i= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) + return; + + float df[3] ={0,0,0}; + float dfi[3]={0,0,0}; // dfi== \partial f_{i+1,j,k} + float dfj[3]={0,0,0}; + float dfk[3]={0,0,0}; + gradient(f,df ,z ,y ,x , 
depth,rows,cols); + gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); + gradient(f,dfj ,z ,y+1,x , depth,rows,cols); + gradient(f,dfk ,z+1,y ,x , depth,rows,cols); + float eps=0.000001; //% avoid division by zero + dftv[idx]=(df[0]+df[1]+df[2])/(sqrt(df[0] *df[0] +df[1] *df[1] +df[2] *df[2])+eps) + -dfi[2]/(sqrt(dfi[0]*dfi[0]+dfi[1]*dfi[1]+dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. + -dfj[1]/(sqrt(dfj[0]*dfj[0]+dfj[1]*dfj[1]+dfj[2]*dfj[2]) +eps) + -dfk[0]/(sqrt(dfk[0]*dfk[0]+dfk[1]*dfk[1]+dfk[2]*dfk[2]) +eps); + + } + + __device__ void warpReduce(volatile float *sdata, size_t tid) { + sdata[tid] += sdata[tid + 32]; + sdata[tid] += sdata[tid + 16]; + sdata[tid] += sdata[tid + 8]; + sdata[tid] += sdata[tid + 4]; + sdata[tid] += sdata[tid + 2]; + sdata[tid] += sdata[tid + 1]; + } + + __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ + extern __shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + float value=0; + while (i < n) { + value=g_idata[i]; //avoid reading twice + mySum += value*value; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + __syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + __global__ void reduceSum(float *g_idata, float 
*g_odata, size_t n){ + extern __shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + // float value=0; + while (i < n) { + mySum += g_idata[i]; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + __syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + + +bool isnan_cuda(float* vec, size_t size){ + bool*d_nan; + bool h_nan; + cudaMalloc((void **)&d_nan, sizeof (bool)); + isnan_device<<<60,MAXTHREADS>>>(vec,size,d_nan); + cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost); + return h_nan; + +} + +// main function + void piccs_tv(const float* img,const float* prior, float* dst,float alpha,float ratio, const long* image_size, int maxIter, const GpuIds& gpuids){ + + + + + size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; + size_t mem_size = sizeof(float) * total_pixels; + + float *d_image,*d_prior,*d_dpiccsTV, *d_dimgTV,*d_aux_small,*d_aux_image, *d_norm2; + // memory for image + cudaMalloc(&d_image, mem_size); + cudaMalloc(&d_prior, mem_size); + + cudaCheckErrors("Malloc Image error"); + cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice); + cudaMemcpy(d_prior, prior, mem_size, cudaMemcpyHostToDevice); + cudaCheckErrors("Memory Malloc and Memset: SRC"); + // memory for df + 
cudaMalloc(&d_dimgTV, mem_size); + cudaMalloc(&d_dpiccsTV, mem_size); + cudaCheckErrors("Memory Malloc and Memset: TV"); + cudaMalloc(&d_norm2, mem_size); + cudaCheckErrors("Memory Malloc and Memset: TV"); + cudaMalloc(&d_aux_image, mem_size); + cudaCheckErrors("Memory Malloc and Memset: TV"); + + // memory for L2norm auxiliar + cudaMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); + cudaCheckErrors("Memory Malloc and Memset: NORMAux"); + + + + // For the gradient + dim3 blockGrad(10, 10, 10); + dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (image_size[2]+blockGrad.z-1)/blockGrad.z); + + // For the reduction + float sumnorm2; + size_t dimblockRed = MAXTHREADS; + size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; + + + for(unsigned int i=0;i>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0]); + cudaDeviceSynchronize(); + cudaCheckErrors("Gradient"); +// mexPrintf("Gradient is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); + + + multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,(1-ratio), total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("Multiplication error"); + + substractArrays<<<60,MAXTHREADS>>>(d_aux_image,d_prior, total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("Substraction error"); + + gradientTV<<>>(d_aux_image,d_dpiccsTV,image_size[2], image_size[1],image_size[0]); + cudaDeviceSynchronize(); + cudaCheckErrors("Gradient"); +// mexPrintf("Gradient piccs is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); + + multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dpiccsTV,ratio, total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("Multiplication error"); +// mexPrintf("Multiplication is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? 
"true" : "false"); + + + addArrays<<<60,MAXTHREADS>>>(d_dimgTV,d_dpiccsTV,total_pixels); + cudaDeviceSynchronize(); + //NOMRALIZE via reduction + //mexPrintf("Pre-norm2 is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); + cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice); + cudaCheckErrors("Copy from gradient call error"); + reduceNorm2 << > >(d_norm2, d_aux_small, total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("reduce1"); + if (dimgridRed > 1) { + reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_aux_small, d_norm2, dimgridRed); + cudaDeviceSynchronize(); + cudaCheckErrors("reduce2"); + cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrors("cudaMemcpy"); + + } + else { + cudaMemcpy(&sumnorm2, d_aux_small, sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrors("cudaMemcpy"); + } +// mexPrintf("alpha/sqrt(sumnorm2): %f\n",alpha/sqrt(sumnorm2)); + //MULTIPLY HYPERPARAMETER sqrt(sumnorm2) + multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha/sqrt(sumnorm2), total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("Multiplication error"); + //SUBSTRACT GRADIENT + substractArrays <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("Substraction error"); +// mexPrintf("Final update is nan: %s\n",isnan_cuda(d_image,total_pixels) ? 
"true" : "false"); +// mexPrintf("\n"); + sumnorm2=0; + } + + cudaCheckErrors("TV minimization"); + + cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost); + cudaCheckErrors("Copy result back"); + + cudaFree(d_image); + cudaFree(d_dpiccsTV); + cudaFree(d_aux_image); + cudaFree(d_aux_small); + cudaFree(d_prior); + cudaFree(d_norm2); + + + cudaCheckErrors("Memory free"); + cudaDeviceReset(); + } + diff --git a/Common/CUDA/PICCS.hpp.prehip b/Common/CUDA/PICCS.hpp.prehip new file mode 100644 index 00000000..e3592dbb --- /dev/null +++ b/Common/CUDA/PICCS.hpp.prehip @@ -0,0 +1,61 @@ +/*------------------------------------------------------------------------- + * + * Header for CUDA functions for Steepest descend in POCS-type algorithms. + * + * This file has the required headers for POCS_TV.cu + * + * CODE by Ander Biguri + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + + + + +#ifndef GD_TV_HPP +#define GD_TV_HPP +#include "TIGRE_common.hpp" +#include "GpuIds.hpp" + +void piccs_tv(const float* img,const float* prior, float* dst,float alpha, float ratio, const long* image_size, int maxIter, const GpuIds& gpuids); + + +#endif \ No newline at end of file diff --git a/Common/CUDA/RandomNumberGenerator.cu b/Common/CUDA/RandomNumberGenerator.cu index d7d1224a..5910b407 100644 --- a/Common/CUDA/RandomNumberGenerator.cu +++ b/Common/CUDA/RandomNumberGenerator.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for random number generator @@ -45,40 +46,40 @@ #include #include -#include -#include -#include +#include +#include +#include #include "gpuUtils.hpp" #include "RandomNumberGenerator.hpp" #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if 
(__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("RandomNumberGenerator:",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("RandomNumberGenerator:",hipGetErrorString(__err));\ } \ } while (0) -__global__ void setup_kernel(curandState *state) { +__global__ void setup_kernel(hiprandState *state) { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Each thread gets same seed, a different sequence number, no offset */ - curand_init(1234, idx, 0, &state[idx]); + hiprand_init(1234, idx, 0, &state[idx]); } -__global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t uiLen, float* pfOut) { +__global__ void GeneratePoisson(hiprandState *state, const float* pfIn, size_t uiLen, float* pfOut) { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; + hiprandState localState = state[idx]; int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); for (int iI = 0; iI < iIter; ++iI) { size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; if (uiPos < uiLen) { /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]); pfOut[uiPos] = (float)uiPoisson; } } @@ -86,7 +87,7 @@ __global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t ui state[idx] = localState; } -__global__ void GeneratePoissonAddGaussian(curandState *state, +__global__ void GeneratePoissonAddGaussian(hiprandState *state, const float* pfIn, size_t uiLen, float fGaussMu, @@ -95,15 +96,15 @@ __global__ void GeneratePoissonAddGaussian(curandState *state, { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; + hiprandState localState = state[idx]; int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); for (int iI = 0; iI < iIter; ++iI) { 
size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; if (uiPos < uiLen) { /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]); /* Gaussian */ - float fNormal = curand_normal(&localState) * fGaussSigma + fGaussMu; + float fNormal = hiprand_normal(&localState) * fGaussSigma + fGaussMu; pfOut[uiPos] = fNormal + (float)uiPoisson; } } @@ -127,31 +128,31 @@ void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpu // printf("poisson_1d(pfIn = %p, uiLen = %zd, pfOut = %p)\n", pfIn, uiLen, pfOut); float* d_pfIn = nullptr; float* d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - cudaCheckErrors("poisson_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_1d fail cudaMemcpy 1"); + hipMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail hipMalloc 1"); + hipMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail hipMalloc 2"); + hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice); + cudaCheckErrors("poisson_1d fail hipMemcpy 1"); // float fMin, fMax; // GetMinMax(pfIn, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; + hiprandState *curandStates = nullptr; const int kiBlockDim = 1024; // Threads per Block const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; - cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); - cudaCheckErrors("poisson_1d fail cudaMalloc 3"); + hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState)); + cudaCheckErrors("poisson_1d fail hipMalloc 3"); setup_kernel<<>>(curandStates); GeneratePoisson<<>>(curandStates, d_pfIn, uiLen, d_pfOut); - 
cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_1d fail cudaMemcpy 2"); + hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("poisson_1d fail hipMemcpy 2"); // GetMinMax(pfOut, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; + hipFree(d_pfIn); d_pfIn = nullptr; + hipFree(d_pfOut); d_pfOut = nullptr; + hipFree(curandStates); curandStates = nullptr; } void poisson_gaussian_1d(const float* pfIn, @@ -164,30 +165,30 @@ void poisson_gaussian_1d(const float* pfIn, // printf("poisson_gaussian_1d(pfIn = %p, uiLen = %zd, fGaussMu = %+f, fGaussSigma = %f, pfOut = %p)\n", pfIn, uiLen, fGaussMu, fGaussSigma, pfOut); float* d_pfIn = nullptr; float* d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 1"); + hipMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 1"); + hipMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 2"); + hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice); + cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 1"); // float fMin, fMax; // GetMinMax(pfIn, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; + hiprandState *curandStates = nullptr; const int kiBlockDim = 64; // Threads per Block const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; - cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); 
- cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 3"); + hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 3"); setup_kernel<<>>(curandStates); GeneratePoissonAddGaussian<<>>(curandStates, d_pfIn, uiLen, fGaussMu, fGaussSigma, d_pfOut); - cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 2"); + hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 2"); // GetMinMax(pfOut, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; + hipFree(d_pfIn); d_pfIn = nullptr; + hipFree(d_pfOut); d_pfOut = nullptr; + hipFree(curandStates); curandStates = nullptr; } diff --git a/Common/CUDA/RandomNumberGenerator.cu.prehip b/Common/CUDA/RandomNumberGenerator.cu.prehip new file mode 100644 index 00000000..d7d1224a --- /dev/null +++ b/Common/CUDA/RandomNumberGenerator.cu.prehip @@ -0,0 +1,193 @@ +/*------------------------------------------------------------------------- + * + * CUDA functions for random number generator + * + * Adds noise of Poisson and normal distribution to the input. + * + * CODE by Tomoyuki SADAKANE + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include + +#include "gpuUtils.hpp" +#include "RandomNumberGenerator.hpp" + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + cudaDeviceReset();\ + mexErrMsgIdAndTxt("RandomNumberGenerator:",cudaGetErrorString(__err));\ + } \ +} while (0) + + +__global__ void setup_kernel(curandState *state) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + /* Each thread gets same seed, a different sequence number, no offset */ + curand_init(1234, idx, 0, &state[idx]); +} + +__global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t uiLen, float* pfOut) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + /* Copy state to local memory for efficiency */ + curandState localState = state[idx]; + int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); + for (int iI = 0; iI < iIter; ++iI) { + size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; + if (uiPos < uiLen) { + /* Poisson */ + unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + pfOut[uiPos] = (float)uiPoisson; + } + } + /* Copy state back to global memory */ + state[idx] = localState; +} + +__global__ void GeneratePoissonAddGaussian(curandState *state, + const float* pfIn, + size_t uiLen, + float fGaussMu, + float fGaussSigma, + float* pfOut) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + /* Copy state to local memory for efficiency */ + curandState localState = state[idx]; + int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); + for (int iI = 0; iI < iIter; ++iI) { + size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; + if (uiPos < uiLen) { + /* Poisson */ + 
unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + /* Gaussian */ + float fNormal = curand_normal(&localState) * fGaussSigma + fGaussMu; + pfOut[uiPos] = fNormal + (float)uiPoisson; + } + } + /* Copy state back to global memory */ + state[idx] = localState; +} + + +template +void GetMinMax(const T_value* pfIn, size_t uiLen, T_value& tvMin, T_value& tvMax) { + tvMin = pfIn[0]; + tvMax = pfIn[0]; + T_value tvVal; + for (int iI = 1; iI < uiLen; ++iI) { + tvVal = pfIn[iI]; + if (tvMax < tvVal) { tvMax = tvVal; continue;} + if (tvMin > tvVal) { tvMin = tvVal; continue;} + } +} +void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpuids) { + // printf("poisson_1d(pfIn = %p, uiLen = %zd, pfOut = %p)\n", pfIn, uiLen, pfOut); + float* d_pfIn = nullptr; + float* d_pfOut = nullptr; + cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail cudaMalloc 1"); + cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail cudaMalloc 2"); + cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); + cudaCheckErrors("poisson_1d fail cudaMemcpy 1"); + + // float fMin, fMax; + // GetMinMax(pfIn, uiLen, fMin, fMax); + // printf("fMin, fMax = %f, %f\n", fMin, fMax); + curandState *curandStates = nullptr; + const int kiBlockDim = 1024; // Threads per Block + const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; + cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); + cudaCheckErrors("poisson_1d fail cudaMalloc 3"); + setup_kernel<<>>(curandStates); + GeneratePoisson<<>>(curandStates, d_pfIn, uiLen, d_pfOut); + cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrors("poisson_1d fail cudaMemcpy 2"); + // GetMinMax(pfOut, uiLen, fMin, fMax); + // printf("fMin, fMax = %f, %f\n", fMin, fMax); + + cudaFree(d_pfIn); d_pfIn = nullptr; + cudaFree(d_pfOut); d_pfOut = nullptr; + cudaFree(curandStates); curandStates = 
nullptr; +} + +void poisson_gaussian_1d(const float* pfIn, + size_t uiLen, + float fGaussMu, + float fGaussSigma, + float* pfOut, + GpuIds& gpuids) +{ + // printf("poisson_gaussian_1d(pfIn = %p, uiLen = %zd, fGaussMu = %+f, fGaussSigma = %f, pfOut = %p)\n", pfIn, uiLen, fGaussMu, fGaussSigma, pfOut); + float* d_pfIn = nullptr; + float* d_pfOut = nullptr; + cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 1"); + cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 2"); + cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); + cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 1"); + + // float fMin, fMax; + // GetMinMax(pfIn, uiLen, fMin, fMax); + // printf("fMin, fMax = %f, %f\n", fMin, fMax); + curandState *curandStates = nullptr; + const int kiBlockDim = 64; // Threads per Block + const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; + cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); + cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 3"); + setup_kernel<<>>(curandStates); + GeneratePoissonAddGaussian<<>>(curandStates, d_pfIn, uiLen, fGaussMu, fGaussSigma, d_pfOut); + cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 2"); + // GetMinMax(pfOut, uiLen, fMin, fMax); + // printf("fMin, fMax = %f, %f\n", fMin, fMax); + + + cudaFree(d_pfIn); d_pfIn = nullptr; + cudaFree(d_pfOut); d_pfOut = nullptr; + cudaFree(curandStates); curandStates = nullptr; +} diff --git a/Common/CUDA/RandomNumberGenerator.hpp.prehip b/Common/CUDA/RandomNumberGenerator.hpp.prehip new file mode 100644 index 00000000..4ba68d8d --- /dev/null +++ b/Common/CUDA/RandomNumberGenerator.hpp.prehip @@ -0,0 +1,49 @@ +/*------------------------------------------------------------------------- + * + * Header CUDA functions for random number generator + * + * 
Adds noise of Poisson and normal distribution to the input. + * + * CODE by Tomoyuki SADAKANE + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + +#include "TIGRE_common.hpp" +#include "GpuIds.hpp" +void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpuids); +void poisson_gaussian_1d(const float* pfPoissonL, size_t uiLen, float fGaussMu, float fGaussSigma, float* pfOut, GpuIds& gpuids); diff --git a/Common/CUDA/Siddon_projection.cu b/Common/CUDA/Siddon_projection.cu index 2a025f8c..8e551626 100644 --- a/Common/CUDA/Siddon_projection.cu +++ b/Common/CUDA/Siddon_projection.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for ray-voxel intersection based projection @@ -48,18 +49,18 @@ */ #include -#include -#include +#include +#include #include "Siddon_projection.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("Ax:Siddon_projection",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("Ax:Siddon_projection",hipGetErrorString(__err));\ } \ } while (0) @@ -94,7 +95,7 @@ do { \ * **/ - void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc); + void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool alloc); __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device @@ -111,7 +112,7 @@ __global__ void kernelPixelDetector( Geometry geo, float* detector, const int currProjSetNumber, const int totalNoOfProjections, - cudaTextureObject_t tex){ + 
hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; @@ -311,10 +312,10 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * if (!fits_in_memory){ dProjection_accum=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); - cudaMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); + hipMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); + hipMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); cudaCheckErrors("cudaMallocauxiliarty projections fail"); } } @@ -323,12 +324,12 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // This is happening regarthless if the image fits on memory float** dProjection=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); - cudaMemset(dProjection[dev*2+i] ,0,num_bytes_proj); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); + hipMemset(dProjection[dev*2+i] ,0,num_bytes_proj); + cudaCheckErrors("hipMalloc projections fail"); } } @@ -338,13 +339,13 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } #endif cudaCheckErrors("Error pinning memory"); @@ -354,18 +355,18 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // auxiliary variables Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); cudaCheckErrors("Error allocating auxiliary constant memory"); // Create Streams for overlapping memcopy and compute int nStreams=deviceCount*2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i+dev*2]); + hipStreamCreate(&stream[i+dev*2]); } } @@ -376,8 +377,8 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * unsigned int noOfKernelCalls = (nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds 
checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. int projection_this_block; - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + hipTextureObject_t *texImg = new hipTextureObject_t[deviceCount]; + hipArray **d_cuArrTex = new hipArray*[deviceCount]; for (unsigned int sp=0;sp>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); } @@ -450,7 +451,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // 1) grab previous results and put them in the auxiliary variable dProjection_accum for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -463,12 +464,12 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else projection_this_block=PROJ_PER_BLOCK; - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + hipMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyHostToDevice,stream[dev*2+1]); } // 2) take the results from current compute call and add it to the code in execution. 
for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -481,7 +482,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else projection_this_block=PROJ_PER_BLOCK; - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + hipStreamSynchronize(stream[dev*2+1]); // wait until copy is finished vecAddInPlace<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); } } // end accumulation case, where the image needs to be split @@ -490,7 +491,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * if (i>0){ for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on previous set on this GPU proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; if (dev+1==deviceCount) { //is it the last device? @@ -510,13 +511,13 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else { projection_this_block=PROJ_PER_BLOCK; } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } } // Make sure Computation on kernels has finished before we launch the next batch. 
for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*2]); } } @@ -524,7 +525,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // We still have the last set of projections to get out of GPUs for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -533,106 +534,106 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. + hipDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } // Make sure everyone has done their bussiness before the next image split: - cudaDeviceSynchronize(); + hipDeviceSynchronize(); } // End image split loop. 
cudaCheckErrors("Main loop fail"); /////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texImg[dev]); + hipFreeArray(d_cuArrTex[dev]); } delete[] texImg; texImg = 0; delete[] d_cuArrTex; d_cuArrTex = 0; // Freeing Stage for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection[dev*2]); + hipFree(dProjection[dev*2+1]); } free(dProjection); if(!fits_in_memory){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection_accum[dev*2]); + hipFree(dProjection_accum[dev*2+1]); } free(dProjection_accum); } freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); + hipHostFree(projParamsArrayHost); for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostUnregister(img); + hipHostUnregister(img); } - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); #endif - //cudaDeviceReset(); + //hipDeviceReset(); return 0; } -void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc) +void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool alloc) { //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = 
make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); const unsigned int num_devices = gpuids.GetLength(); if(alloc){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params); } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = 
cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModePoint; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Texture object creation fail"); } @@ -842,8 +843,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree +#include +#include +#include "Siddon_projection.hpp" +#include "TIGRE_common.hpp" +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("Ax:Siddon_projection",cudaGetErrorString(__err));\ + } \ +} while (0) + + +#define MAXTREADS 1024 +#define PROJ_PER_BLOCK 9 +#define PIXEL_SIZE_BLOCK 9 + /*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * --->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ + + void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc); + +__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device + + 
+__global__ void vecAddInPlace(float *a, float *b, unsigned long n) +{ + int idx = blockIdx.x*blockDim.x+threadIdx.x; + // Make sure we do not go out of bounds + if (idx < n) + a[idx] = a[idx] + b[idx]; +} + +__global__ void kernelPixelDetector( Geometry geo, + float* detector, + const int currProjSetNumber, + const int totalNoOfProjections, + cudaTextureObject_t tex){ + + + unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; + unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long projNumber=threadIdx.z; + + + if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) + return; + +#if IS_FOR_MATLAB_TIGRE + size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#else + size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#endif + unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array (for a given GPU) + + if(indAlpha>=totalNoOfProjections) + return; + + Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaU = projParamsArrayDev[4*projNumber+1]; + Point3D deltaV = projParamsArrayDev[4*projNumber+2]; + Point3D source = projParamsArrayDev[4*projNumber+3]; + + /////// Get coordinates XYZ of pixel UV + unsigned long pixelV = geo.nDetecV-v-1; + unsigned long pixelU = u; + Point3D pixel1D; + pixel1D.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); + pixel1D.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); + pixel1D.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); + /////// + // Siddon's ray-voxel intersection, optimized as in doi=10.1.1.55.7516 + ////// + // Also called Jacobs algorithms + Point3D ray; + // vector of Xray + ray.x=pixel1D.x-source.x; + ray.y=pixel1D.y-source.y; + 
ray.z=pixel1D.z-source.z; + float eps=0.001; + ray.x=(fabsf(ray.x) Nvoxel+1 + + axm=fminf(__fdividef(-source.x,ray.x),__fdividef(geo.nVoxelX-source.x,ray.x)); + aym=fminf(__fdividef(-source.y,ray.y),__fdividef(geo.nVoxelY-source.y,ray.y)); + azm=fminf(__fdividef(-source.z,ray.z),__fdividef(geo.nVoxelZ-source.z,ray.z)); + axM=fmaxf(__fdividef(-source.x,ray.x),__fdividef(geo.nVoxelX-source.x,ray.x)); + ayM=fmaxf(__fdividef(-source.y,ray.y),__fdividef(geo.nVoxelY-source.y,ray.y)); + azM=fmaxf(__fdividef(-source.z,ray.z),__fdividef(geo.nVoxelZ-source.z,ray.z)); + + float am=fmaxf(fmaxf(axm,aym),azm); + float aM=fminf(fminf(axM,ayM),azM); + + // line intersects voxel space -> am=aM) + detector[idx]=0; + + // Compute max/min image INDEX for intersection eq(11-19) + // Discussion about ternary operator in CUDA: https://stackoverflow.com/questions/7104384/in-cuda-why-is-a-b010-more-efficient-than-an-if-else-version + float imin,imax,jmin,jmax,kmin,kmax; + // for X + if( source.x(tex, i, j, k); + i=i+iu; + ac=ax; + ax+=axu; + }else if(ay==aminc){ + sum+=(ay-ac)*tex3D(tex, i, j, k); + j=j+ju; + ac=ay; + ay+=ayu; + }else if(az==aminc){ + sum+=(az-ac)*tex3D(tex, i, j, k); + k=k+ku; + ac=az; + az+=azu; + } + aminc=fminf(fminf(ax,ay),az); + } + detector[idx]=sum*maxlength; +} + + +int siddon_ray_projection(float* img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("Ax:Siddon_projection:GPUselect","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("Ax:Siddon_projection:GPUselect","Detected one (or more) 
different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + int dev; + + // Check free memory + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + + size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); + size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); + + // Does everything fit in the GPUs? + const bool fits_in_memory = mem_image+2*PROJ_PER_BLOCK*mem_proj= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to + // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
+#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ + cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + } +#endif + cudaCheckErrors("Error pinning memory"); + + + + // auxiliary variables + Point3D source, deltaU, deltaV, uvOrigin; + Point3D* projParamsArrayHost; + cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + cudaCheckErrors("Error allocating auxiliary constant memory"); + + // Create Streams for overlapping memcopy and compute + int nStreams=deviceCount*2; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < 2; ++i){ + cudaStreamCreate(&stream[i+dev*2]); + + } + } + cudaCheckErrors("Stream creation fail"); + + int nangles_device=(nangles+deviceCount-1)/deviceCount; + int nangles_last_device=(nangles-(deviceCount-1)*nangles_device); + unsigned int noOfKernelCalls = (nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK + unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. + int projection_this_block; + cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; + cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + + for (unsigned int sp=0;sp=nangles) + break; + if ((i*PROJ_PER_BLOCK+j)>=nangles_device) + break; + geoArray[sp].alpha=angles[proj_global*3]; + geoArray[sp].theta=angles[proj_global*3+1]; + geoArray[sp].psi =angles[proj_global*3+2]; + + + //precomute distances for faster execution + //Precompute per angle constant stuff for speed + computeDeltas_Siddon(geoArray[sp],proj_global, &uvOrigin, &deltaU, &deltaV, &source); + //Ray tracing! 
+ projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection + projParamsArrayHost[4*j+1]=deltaU; + projParamsArrayHost[4*j+2]=deltaV; + projParamsArrayHost[4*j+3]=source; + + } + cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*2]); + cudaStreamSynchronize(stream[dev*2]); + cudaCheckErrors("kernel fail"); + kernelPixelDetector<<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); + } + + + // Now that the computation is happening, we need to either prepare the memory for + // combining of the projections (splits>1) and start removing previous results. + + + // If our image does not fit in memory then we need to make sure we accumulate previous results too. + // This is done in 2 steps: + // 1)copy previous results back into GPU + // 2)accumulate with current results + // The code to take them out is the same as when there are no splits needed + if( !fits_in_memory&&sp>0) + { + // 1) grab previous results and put them in the auxiliary variable dProjection_accum + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + + // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... + if(i+1==noOfKernelCalls) //is it the last block? 
+ projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + else + projection_this_block=PROJ_PER_BLOCK; + + cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + } + // 2) take the results from current compute call and add it to the code in execution. + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + + // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... + if(i+1==noOfKernelCalls) //is it the last block? + projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + else + projection_this_block=PROJ_PER_BLOCK; + + cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + vecAddInPlace<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); + } + } // end accumulation case, where the image needs to be split + + // Now, lets get out the projections from the previous execution of the kernels. + if (i>0){ + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on previous set on this GPU + proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; + if (dev+1==deviceCount) { //is it the last device? 
+ // projections assigned to this device is >=nangles_device-(deviceCount-1) and < nangles_device + if (i-1 < noOfKernelCallsLastDev) { + // The previous set(block) was not empty. + projection_this_block=min(PROJ_PER_BLOCK, nangles-proj_global); + } + else { + // The previous set was empty. + // This happens if deviceCount > PROJ_PER_BLOCK+1. + // e.g. PROJ_PER_BLOCK = 9, deviceCount = 11, nangles = 199. + // e.g. PROJ_PER_BLOCK = 1, deviceCount = 3, nangles = 7. + break; + } + } + else { + projection_this_block=PROJ_PER_BLOCK; + } + cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + } + } + // Make sure Computation on kernels has finished before we launch the next batch. + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaStreamSynchronize(stream[dev*2]); + } + } + + + // We still have the last set of projections to get out of GPUs + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + // How many projections are left here? + projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + + cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. 
+ cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); + cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + } + // Make sure everyone has done their bussiness before the next image split: + cudaDeviceSynchronize(); + } // End image split loop. + + cudaCheckErrors("Main loop fail"); + /////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////// + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDestroyTextureObject(texImg[dev]); + cudaFreeArray(d_cuArrTex[dev]); + } + delete[] texImg; texImg = 0; + delete[] d_cuArrTex; d_cuArrTex = 0; + // Freeing Stage + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dProjection[dev*2]); + cudaFree(dProjection[dev*2+1]); + + } + free(dProjection); + + if(!fits_in_memory){ + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dProjection_accum[dev*2]); + cudaFree(dProjection_accum[dev*2+1]); + + } + free(dProjection_accum); + } + freeGeoArray(splits,geoArray); + cudaFreeHost(projParamsArrayHost); + + + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]) ; +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ + cudaHostUnregister(img); + } + cudaCheckErrors("cudaFree fail"); +#endif + //cudaDeviceReset(); + return 0; +} + + + + +void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc) +{ + //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; + const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const unsigned int num_devices = gpuids.GetLength(); + if(alloc){ + for (unsigned int dev = 0; dev < num_devices; dev++){ + 
cudaSetDevice(gpuids[dev]); + + //cudaArray Descriptor + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + } + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemcpy3DParms copyParams = {0}; + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[dev]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params); + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[dev]; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Texture object creation fail"); +} + +/* This code generates the geometries needed to split the image properly in + * cases where the entire image does not fit in the memory of the GPU + **/ +void splitImage(unsigned int splits,Geometry geo,Geometry* geoArray, unsigned int nangles){ + + unsigned long splitsize=(geo.nVoxelZ+splits-1)/splits;// ceil if not divisible + for(unsigned int sp=0;spx; + auxPoint.y=point->y; + auxPoint.z=point->z; + + point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - 
sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->z=-sin(geo.dPitch[i])*auxPoint.x + +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y + +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; + +} +void eulerZYZ(Geometry geo, Point3D* point){ + Point3D auxPoint; + auxPoint.x=point->x; + auxPoint.y=point->y; + auxPoint.z=point->z; + + point->x=(+cos(geo.alpha)*cos(geo.theta)*cos(geo.psi)-sin(geo.alpha)*sin(geo.psi))*auxPoint.x+ + (-cos(geo.alpha)*cos(geo.theta)*sin(geo.psi)-sin(geo.alpha)*cos(geo.psi))*auxPoint.y+ + cos(geo.alpha)*sin(geo.theta)*auxPoint.z; + + point->y=(+sin(geo.alpha)*cos(geo.theta)*cos(geo.psi)+cos(geo.alpha)*sin(geo.psi))*auxPoint.x+ + (-sin(geo.alpha)*cos(geo.theta)*sin(geo.psi)+cos(geo.alpha)*cos(geo.psi))*auxPoint.y+ + sin(geo.alpha)*sin(geo.theta)*auxPoint.z; + + point->z=-sin(geo.theta)*cos(geo.psi)*auxPoint.x+ + sin(geo.theta)*sin(geo.psi)*auxPoint.y+ + cos(geo.theta)*auxPoint.z; + + +} +//______________________________________________________________________________ +// +// Function: freeGeoArray +// +// Description: Frees the memory from the geometry array for multiGPU. 
+//______________________________________________________________________________ +void freeGeoArray(unsigned int splits,Geometry* geoArray){ + for(unsigned int sp=0;sp -#include -#include +#include +#include #include "Siddon_projection_parallel.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",hipGetErrorString(__err));\ } \ } while (0) // Declare the texture reference. -void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); +void CreateTextureParallel(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream); #define MAXTREADS 1024 @@ -105,7 +106,7 @@ __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is o __global__ void kernelPixelDetector_parallel( Geometry geo, - float* detector, const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex){ + float* detector, const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; @@ -282,23 +283,23 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa size_t num_bytes = (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)PROJ_PER_BLOCK* (size_t)sizeof(float); float** dProjection=(float **)malloc(2*sizeof(float *)); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[i], num_bytes); + cudaCheckErrors("hipMalloc projections fail"); } int nStreams=2; - cudaStream_t* 
stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + hipTextureObject_t *texImg = 0; + hipArray **d_cuArrTex = 0; + texImg =(hipTextureObject_t*)malloc(1*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(1*sizeof(hipArray*)); CreateTextureParallel(img,geo,&d_cuArrTex[0], &texImg [0],stream); cudaCheckErrors("Texture allocation fail"); @@ -310,7 +311,7 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); // 16x16 gave the best performance empirically // Funnily that makes it compatible with most GPUs..... 
@@ -349,36 +350,36 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDev), projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelDetector_parallel<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); // copy result to host if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, hipMemcpyDeviceToHost,stream[1]); } - cudaDeviceSynchronize(); + hipDeviceSynchronize(); int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[1]); - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); + hipDestroyTextureObject(texImg[0]); + hipFreeArray(d_cuArrTex[0]); free(texImg); texImg = 0; free(d_cuArrTex); d_cuArrTex = 0; cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); + hipFree(dProjection[0]); + hipFree(dProjection[1]); free(dProjection); - cudaFreeHost(projParamsArrayHost); - cudaCheckErrors("cudaFree d_imagedata fail"); + hipHostFree(projParamsArrayHost); + cudaCheckErrors("hipFree d_imagedata fail"); for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); } -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } @@ -482,41 +483,41 @@ void 
computeDeltas_Siddon_parallel(Geometry geo, float angles,int i, Point3D* uv *source=S2; } -void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; +void CreateTextureParallel(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[1]); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = 
cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModePoint; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); } diff --git a/Common/CUDA/Siddon_projection_parallel.cu.prehip b/Common/CUDA/Siddon_projection_parallel.cu.prehip new file mode 100644 index 00000000..25a07e9d --- /dev/null +++ b/Common/CUDA/Siddon_projection_parallel.cu.prehip @@ -0,0 +1,540 @@ +/*------------------------------------------------------------------------- + * + * CUDA functions for ray-voxel intersection based projection + * + * This file has the necessary fucntiosn to perform X-ray parallel projection + * operation given a geaometry, angles and image. It usesthe so-called + * Jacobs algorithm to compute efficiently the length of the x-rays over + * voxel space. Its called Siddon because Jacobs algorithm its just a small + * improvement over the traditional Siddons method. + * + * CODE by Ander Biguri + * + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + + +#include +#include +#include +#include "Siddon_projection_parallel.hpp" +#include "TIGRE_common.hpp" +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",cudaGetErrorString(__err));\ + } \ +} while (0) + + +// Declare the texture reference. 
+void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); + + +#define MAXTREADS 1024 +#define PROJ_PER_BLOCK 9 +#define PIXEL_SIZE_BLOCK 9 +/*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * --->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ + + +__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device + + +__global__ void kernelPixelDetector_parallel( Geometry geo, + float* detector, const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex){ + + unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; + unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long projNumber=threadIdx.z; + + if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) + return; + + unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array + + +#if IS_FOR_MATLAB_TIGRE + size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#else + size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#endif + + if(indAlpha>=totalNoOfProjections) + return; + + Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaU = projParamsArrayDev[4*projNumber+1]; + Point3D deltaV = projParamsArrayDev[4*projNumber+2]; + Point3D source = projParamsArrayDev[4*projNumber+3]; + + + /////// Get coordinates XYZ of pixel UV + unsigned long pixelV = geo.nDetecV-v-1; + 
unsigned long pixelU = u; + Point3D pixel1D; + pixel1D.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); + pixel1D.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); + pixel1D.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); + + + source.x=(source.x+pixelU*deltaU.x+pixelV*deltaV.x); + source.y=(source.y+pixelU*deltaU.y+pixelV*deltaV.y); + source.z=(source.z+pixelU*deltaU.z+pixelV*deltaV.z); + /////// + // Siddon's ray-voxel intersection, optimized as in doi=10.1.1.55.7516 + ////// + Point3D ray; + // vector of Xray + ray.x=pixel1D.x-source.x; + ray.y=pixel1D.y-source.y; + ray.z=pixel1D.z-source.z; + // This variables are ommited because + // bx,by,bz ={0,0,0} + // dx,dy,dz ={1,1,1} + // compute parameter values for x-ray parametric equation. eq(3-10) + float axm,aym,azm; + float axM,ayM,azM; + + /************************************** + * + * + * Problem. In paralel beam, often ray.y or ray.x=0; + * This leads to infinities progpagating and breaking everything. + * + * We need to fix it. + * + ***************************************/ + + // In the paper Nx= number of X planes-> Nvoxel+1 + axm=fminf(-source.x/ray.x,(geo.nVoxelX-source.x)/ray.x); + aym=fminf(-source.y/ray.y,(geo.nVoxelY-source.y)/ray.y); +// azm=min(-source.z/ray.z,(geo.nVoxelZ-source.z)/ray.z); + axM=fmaxf(-source.x/ray.x,(geo.nVoxelX-source.x)/ray.x); + ayM=fmaxf(-source.y/ray.y,(geo.nVoxelY-source.y)/ray.y); +// azM=max(-source.z/ray.z,(geo.nVoxelZ-source.z)/ray.z); + float am=(fmaxf(axm,aym)); + float aM=(fminf(axM,ayM)); + + // line intersects voxel space -> am=aM) + detector[idx]=0.0f; + + // Compute max/min image INDEX for intersection eq(11-19) + // Discussion about ternary operator in CUDA: https://stackoverflow.com/questions/7104384/in-cuda-why-is-a-b010-more-efficient-than-an-if-else-version + float imin,imax,jmin,jmax; + // for X + if( source.x(tex, i, j, k);//(ax-ac)* + i=i+iu; + ac=ax; + ax+=axu; + }else if(ay==aminc){ + sum+=(ay-ac)*tex3D(tex, i, j, k);//(ay-ac)* + j=j+ju; + ac=ay; + 
ay+=ayu; +// }else if(az==aminc){ +// sum+=(az-ac)*tex3D(tex, i+0.5, j+0.5, k+0.5); +// k=k+ku; +// ac=az; +// az+=azu; + } + aminc=fminf(ay,ax); + } + detector[idx]=maxlength*sum; +} + + +int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ + + + + + + size_t num_bytes = (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)PROJ_PER_BLOCK* (size_t)sizeof(float); + float** dProjection=(float **)malloc(2*sizeof(float *)); + for (int i = 0; i < 2; ++i){ + cudaMalloc((void**)&dProjection[i], num_bytes); + cudaCheckErrors("cudaMalloc projections fail"); + } + int nStreams=2; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + + for (int i = 0; i < 2; ++i){ + cudaStreamCreate(&stream[i]); + } + + + + // Texture object variables + cudaTextureObject_t *texImg = 0; + cudaArray **d_cuArrTex = 0; + texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); + d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + + CreateTextureParallel(img,geo,&d_cuArrTex[0], &texImg [0],stream); + cudaCheckErrors("Texture allocation fail"); + //Done! Image put into texture memory. + + + + Point3D source, deltaU, deltaV, uvOrigin; + + + Point3D* projParamsArrayHost; + cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + + // 16x16 gave the best performance empirically + // Funnily that makes it compatible with most GPUs..... 
+ int divU,divV,divangle; + divU=PIXEL_SIZE_BLOCK; + divV=PIXEL_SIZE_BLOCK; + + dim3 numBlocks((geo.nDetecU+divU-1)/divU,(geo.nDetecV+divV-1)/divV,1); + + dim3 threadsPerBlock(divU,divV,PROJ_PER_BLOCK); + + unsigned int proj_global; + unsigned int noOfKernelCalls = (nangles+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK + unsigned int i; + for ( i=0; i=nangles) + break; + geo.alpha=angles[proj_global*3]; + geo.theta=angles[proj_global*3+1]; + geo.psi =angles[proj_global*3+2]; + if(geo.alpha==0.0 || abs(geo.alpha-1.5707963267949)<0.0000001){ + geo.alpha=geo.alpha+1.1920929e-07; + } + + //precomute distances for faster execution + //Precompute per angle constant stuff for speed + computeDeltas_Siddon_parallel(geo,geo.alpha,proj_global, &uvOrigin, &deltaU, &deltaV, &source); + //Ray tracing! + projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection + projParamsArrayHost[4*j+1]=deltaU; + projParamsArrayHost[4*j+2]=deltaV; + projParamsArrayHost[4*j+3]=source; + + } + + cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); + cudaStreamSynchronize(stream[0]); + kernelPixelDetector_parallel<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); + // copy result to host + if (i>0) + cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + } + cudaDeviceSynchronize(); + + int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; + cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + + + + cudaDestroyTextureObject(texImg[0]); + cudaFreeArray(d_cuArrTex[0]); + free(texImg); texImg = 0; + free(d_cuArrTex); d_cuArrTex = 0; + cudaCheckErrors("Unbind fail"); + cudaFree(dProjection[0]); + cudaFree(dProjection[1]); + 
free(dProjection); + cudaFreeHost(projParamsArrayHost); + cudaCheckErrors("cudaFree d_imagedata fail"); + + + for (int i = 0; i < 2; ++i){ + cudaStreamDestroy(stream[i]); + } +// cudaDeviceReset(); + return 0; +} + + + +/* This code precomputes The location of the source and the Delta U and delta V (in the warped space) + * to compute the locations of the x-rays. While it seems verbose and overly-optimized, + * it does saves about 30% of each of the kernel calls. Thats something! + **/ +void computeDeltas_Siddon_parallel(Geometry geo, float angles,int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source){ + Point3D S; + + S.x =geo.DSO[i]; S.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); S.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + + //End point + Point3D P,Pu0,Pv0; + + P.x =-(geo.DSD[i]-geo.DSO[i]); P.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); P.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + Pu0.x=-(geo.DSD[i]-geo.DSO[i]); Pu0.y= geo.dDetecU*(1-((float)geo.nDetecU/2)+0.5); Pu0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + Pv0.x=-(geo.DSD[i]-geo.DSO[i]); Pv0.y= geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); Pv0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-1); + // Geometric trasnformations: + P.x=0;Pu0.x=0;Pv0.x=0; + + // Roll pitch yaw + rollPitchYaw(geo,i,&P); + rollPitchYaw(geo,i,&Pu0); + rollPitchYaw(geo,i,&Pv0); + //Now lets translate the points where they should be: + P.x=P.x-(geo.DSD[i]-geo.DSO[i]); + Pu0.x=Pu0.x-(geo.DSD[i]-geo.DSO[i]); + Pv0.x=Pv0.x-(geo.DSD[i]-geo.DSO[i]); + + S.x=0; + // Roll pitch yaw + rollPitchYaw(geo,i,&S); + //Now lets translate the points where they should be: + S.x=S.x+geo.DSO[i]; + + //1: Offset detector + + //P.x + P.y =P.y +geo.offDetecU[i]; P.z =P.z +geo.offDetecV[i]; + Pu0.y=Pu0.y+geo.offDetecU[i]; Pu0.z=Pu0.z+geo.offDetecV[i]; + Pv0.y=Pv0.y+geo.offDetecU[i]; Pv0.z=Pv0.z+geo.offDetecV[i]; + //S doesnt need to chagne + + + //3: Rotate (around z)! 
+ Point3D Pfinal, Pfinalu0, Pfinalv0; + + Pfinal.x =P.x*cos(geo.alpha)-P.y*sin(geo.alpha); Pfinal.y =P.y*cos(geo.alpha)+P.x*sin(geo.alpha); Pfinal.z =P.z; + Pfinalu0.x=Pu0.x*cos(geo.alpha)-Pu0.y*sin(geo.alpha); Pfinalu0.y=Pu0.y*cos(geo.alpha)+Pu0.x*sin(geo.alpha); Pfinalu0.z=Pu0.z; + Pfinalv0.x=Pv0.x*cos(geo.alpha)-Pv0.y*sin(geo.alpha); Pfinalv0.y=Pv0.y*cos(geo.alpha)+Pv0.x*sin(geo.alpha); Pfinalv0.z=Pv0.z; + + Point3D S2; + S2.x=S.x*cos(geo.alpha)-S.y*sin(geo.alpha); + S2.y=S.y*cos(geo.alpha)+S.x*sin(geo.alpha); + S2.z=S.z; + + //2: Offset image (instead of offseting image, -offset everything else) + + Pfinal.x =Pfinal.x-geo.offOrigX[i]; Pfinal.y =Pfinal.y-geo.offOrigY[i]; Pfinal.z =Pfinal.z-geo.offOrigZ[i]; + Pfinalu0.x=Pfinalu0.x-geo.offOrigX[i]; Pfinalu0.y=Pfinalu0.y-geo.offOrigY[i]; Pfinalu0.z=Pfinalu0.z-geo.offOrigZ[i]; + Pfinalv0.x=Pfinalv0.x-geo.offOrigX[i]; Pfinalv0.y=Pfinalv0.y-geo.offOrigY[i]; Pfinalv0.z=Pfinalv0.z-geo.offOrigZ[i]; + S2.x=S2.x-geo.offOrigX[i]; S2.y=S2.y-geo.offOrigY[i]; S2.z=S2.z-geo.offOrigZ[i]; + + // As we want the (0,0,0) to be in a corner of the image, we need to translate everything (after rotation); + Pfinal.x =Pfinal.x+geo.sVoxelX/2; Pfinal.y =Pfinal.y+geo.sVoxelY/2; Pfinal.z =Pfinal.z +geo.sVoxelZ/2; + Pfinalu0.x=Pfinalu0.x+geo.sVoxelX/2; Pfinalu0.y=Pfinalu0.y+geo.sVoxelY/2; Pfinalu0.z=Pfinalu0.z+geo.sVoxelZ/2; + Pfinalv0.x=Pfinalv0.x+geo.sVoxelX/2; Pfinalv0.y=Pfinalv0.y+geo.sVoxelY/2; Pfinalv0.z=Pfinalv0.z+geo.sVoxelZ/2; + S2.x =S2.x+geo.sVoxelX/2; S2.y =S2.y+geo.sVoxelY/2; S2.z =S2.z +geo.sVoxelZ/2; + + //4. 
Scale everything so dVoxel==1 + Pfinal.x =Pfinal.x/geo.dVoxelX; Pfinal.y =Pfinal.y/geo.dVoxelY; Pfinal.z =Pfinal.z/geo.dVoxelZ; + Pfinalu0.x=Pfinalu0.x/geo.dVoxelX; Pfinalu0.y=Pfinalu0.y/geo.dVoxelY; Pfinalu0.z=Pfinalu0.z/geo.dVoxelZ; + Pfinalv0.x=Pfinalv0.x/geo.dVoxelX; Pfinalv0.y=Pfinalv0.y/geo.dVoxelY; Pfinalv0.z=Pfinalv0.z/geo.dVoxelZ; + S2.x =S2.x/geo.dVoxelX; S2.y =S2.y/geo.dVoxelY; S2.z =S2.z/geo.dVoxelZ; + + + + //5. apply COR. Wherever everything was, now its offesetd by a bit + float CORx, CORy; + CORx=-geo.COR[i]*sin(geo.alpha)/geo.dVoxelX; + CORy= geo.COR[i]*cos(geo.alpha)/geo.dVoxelY; + Pfinal.x+=CORx; Pfinal.y+=CORy; + Pfinalu0.x+=CORx; Pfinalu0.y+=CORy; + Pfinalv0.x+=CORx; Pfinalv0.y+=CORy; + S2.x+=CORx; S2.y+=CORy; + + // return + + *uvorigin=Pfinal; + + deltaU->x=Pfinalu0.x-Pfinal.x; + deltaU->y=Pfinalu0.y-Pfinal.y; + deltaU->z=Pfinalu0.z-Pfinal.z; + + deltaV->x=Pfinalv0.x-Pfinal.x; + deltaV->y=Pfinalv0.y-Pfinal.y; + deltaV->z=Pfinalv0.z-Pfinal.z; + + *source=S2; +} +void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; + + + const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + + //cudaArray Descriptor + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + + + cudaMemcpy3DParms copyParams = {0}; + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[0]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params,stream[1]); + + + //Array creation End + + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[0]; + cudaTextureDesc texDescr; + 
memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + +} + +#ifndef PROJECTION_HPP + +float maxDistanceCubeXY(Geometry geo, float alpha,int i){ + /////////// + // Compute initial "t" so we access safely as less as out of bounds as possible. + ////////// + + + float maxCubX,maxCubY; + // Forgetting Z, compute max distance: diagonal+offset + maxCubX=(geo.sVoxelX/2+ abs(geo.offOrigX[i]))/geo.dVoxelX; + maxCubY=(geo.sVoxelY/2+ abs(geo.offOrigY[i]))/geo.dVoxelY; + + return geo.DSO[i]/geo.dVoxelX-sqrt(maxCubX*maxCubX+maxCubY*maxCubY); + +} + +#endif diff --git a/Common/CUDA/Siddon_projection_parallel.hpp.prehip b/Common/CUDA/Siddon_projection_parallel.hpp.prehip new file mode 100644 index 00000000..c9c6fc77 --- /dev/null +++ b/Common/CUDA/Siddon_projection_parallel.hpp.prehip @@ -0,0 +1,65 @@ +/*------------------------------------------------------------------------- + * + * Header CUDA functions for ray-voxel intersection based projection + * + * + * CODE by Ander Biguri + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + + +#include "ray_interpolated_projection.hpp" +#include "types_TIGRE.hpp" +#include "GpuIds.hpp" + +#ifndef PROJECTION_PARALLEL_HPP_SIDDON +#define PROJECTION_PARALLEL_HPP_SIDDON +int siddon_ray_projection_parallel(float * img, Geometry geo, float** result,float const * const alphas,int nalpha, const GpuIds& gpuids); + +//double computeMaxLength(Geometry geo, double alpha); +void computeDeltas_Siddon_parallel(Geometry geo, float alpha,int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source); + +//double maxDistanceCubeXY(Geometry geo, double alpha,int i); + +// below, not used +//Geometry nomralizeGeometryImage(Geometry geo); +#endif \ No newline at end of file diff --git a/Common/CUDA/TIGRE_common.cpp.prehip b/Common/CUDA/TIGRE_common.cpp.prehip new file mode 100644 index 00000000..cf98e4b9 --- /dev/null +++ b/Common/CUDA/TIGRE_common.cpp.prehip @@ -0,0 +1,20 @@ +#if defined(IS_FOR_PYTIGRE) +#include +#include +#include +#include "TIGRE_common.hpp" +void mexPrintf(const char* format, ...) 
{ + PRINT_HERE(""); + va_list argpointer; + va_start(argpointer, format); + vprintf(format, argpointer); + va_end(argpointer); +} +void mexErrMsgIdAndTxt(const char* pcTag, const char* pcMsg) { + PRINT_HERE("%s %s\n", pcTag, pcMsg); + exit(1); +} +void mexWarnMsgIdAndTxt(const char* pcTag, const char* pcMsg) { + PRINT_HERE("%s %s\n", pcTag, pcMsg); +} +#endif // IS_FOR_PYTIGRE diff --git a/Common/CUDA/TIGRE_common.hpp.prehip b/Common/CUDA/TIGRE_common.hpp.prehip new file mode 100644 index 00000000..faf8d7ab --- /dev/null +++ b/Common/CUDA/TIGRE_common.hpp.prehip @@ -0,0 +1,24 @@ +#ifndef _COMMON_HPP_20201017_ +#define _COMMON_HPP_20201017_ + +#define STRINGIFY(n) #n +#define TOSTRING(n) STRINGIFY(n) +#define __HERE__ __FILE__ " (" TOSTRING(__LINE__) "): " +#define PRINT_HERE printf(__HERE__);printf +// #define PRINT_HERE (void*)0 + +#if defined(IS_FOR_PYTIGRE) +#ifndef IS_FOR_MATLAB_TIGRE + #define IS_FOR_MATLAB_TIGRE 0 +#endif // IS_FOR_MATLAB_TIGRE +void mexPrintf(const char*, ...); +void mexErrMsgIdAndTxt(const char* pcTag, const char* pcMsg); +void mexWarnMsgIdAndTxt(const char* pcTag, const char* pcMsg); +#else +#ifndef IS_FOR_MATLAB_TIGRE + #define IS_FOR_MATLAB_TIGRE 1 +#endif // IS_FOR_MATLAB_TIGRE +#include "mex.h" +#include "tmwtypes.h" +#endif // IS_TIGRE_FOR_PYTHON +#endif // _COMMON_HPP_20201017_ diff --git a/Common/CUDA/errors.hpp b/Common/CUDA/errors.hpp index 05518b20..16bece09 100644 --- a/Common/CUDA/errors.hpp +++ b/Common/CUDA/errors.hpp @@ -1,4 +1,4 @@ -#define CUDA_SUCCESS 0 +#define CUDA_SUCCESS 0 #define ERR_CUDA 1 #define ERR_NO_CAPABLE_DEVICES 2 diff --git a/Common/CUDA/errors.hpp.prehip b/Common/CUDA/errors.hpp.prehip new file mode 100644 index 00000000..05518b20 --- /dev/null +++ b/Common/CUDA/errors.hpp.prehip @@ -0,0 +1,10 @@ +#define CUDA_SUCCESS 0 +#define ERR_CUDA 1 + +#define ERR_NO_CAPABLE_DEVICES 2 +#define ERR_NO_FREE_DEVICES 3 +#define ERR_BAD_ASSERT 4 +#define ERR_ASSERT_FAIL 5 + + + diff --git a/Common/CUDA/gpuUtils.cu 
b/Common/CUDA/gpuUtils.cu index 8f2754e4..910b7a58 100644 --- a/Common/CUDA/gpuUtils.cu +++ b/Common/CUDA/gpuUtils.cu @@ -1,7 +1,7 @@ #include "gpuUtils.hpp" -#include -#include +#include +#include #include #include @@ -34,11 +34,11 @@ int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, cha return iCudaDeviceCount; } - cudaError_t err; - cudaDeviceProp propDevice; + hipError_t err; + hipDeviceProp_t propDevice; int nMatch = 0; for (int iId = 0; iId < iCudaDeviceCount; ++iId) { - err = cudaGetDeviceProperties(&propDevice, iId); + err = hipGetDeviceProperties(&propDevice, iId); iMessagePos += sprintf(pcMessage + iMessagePos, "propDevice.name = %s\n", propDevice.name); if (strcmp(propDevice.name, kacGPUName) == 0) { piDeviceIds[nMatch] = iId; @@ -55,16 +55,16 @@ int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, cha void GetGpuName(int iDeviceId, char* pcName) { memset(pcName, 0, 128); - cudaError_t err; - cudaDeviceProp propDevice; + hipError_t err; + hipDeviceProp_t propDevice; int id = iDeviceId; - err = cudaGetDeviceProperties(&propDevice, id); + err = hipGetDeviceProperties(&propDevice, id); memcpy(pcName, propDevice.name, strlen(propDevice.name)*sizeof(char)); } int GetGpuCount() { int iCudaDeviceCount = 0; - cudaGetDeviceCount(&iCudaDeviceCount); + hipGetDeviceCount(&iCudaDeviceCount); return iCudaDeviceCount; } diff --git a/Common/CUDA/gpuUtils.cu.prehip b/Common/CUDA/gpuUtils.cu.prehip new file mode 100644 index 00000000..8f2754e4 --- /dev/null +++ b/Common/CUDA/gpuUtils.cu.prehip @@ -0,0 +1,70 @@ + +#include "gpuUtils.hpp" +#include +#include +#include +#include + +int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, char* pcMessage) { + if (pcMessage) { + for (int iI = 0; iI < 65535; ++iI) { + pcMessage[iI] = '\0'; + } + } + if (piDeviceIds == 0 || iIdCountMax == 0) { + return 0; + } + int iMessagePos = 0; + // Count installed GPUs. 
+ int iCudaDeviceCount = GetGpuCount(); + iMessagePos += sprintf(pcMessage + iMessagePos, "Found GPUs: %d\n", iCudaDeviceCount); + if (iCudaDeviceCount == 0) { + // printf("No GPU found\n"); + return 0; + } + + iCudaDeviceCount = min(iCudaDeviceCount, iIdCountMax); + iMessagePos += sprintf(pcMessage + iMessagePos, "Max GPUs: %d\n", iCudaDeviceCount); + if (strlen(kacGPUName) == 0) { + // Semi-compatible mode: + // Return all GPUs + for (int iI = 0; iI < iCudaDeviceCount; ++iI) { + piDeviceIds[iI] = iI; + } + return iCudaDeviceCount; + } + + cudaError_t err; + cudaDeviceProp propDevice; + int nMatch = 0; + for (int iId = 0; iId < iCudaDeviceCount; ++iId) { + err = cudaGetDeviceProperties(&propDevice, iId); + iMessagePos += sprintf(pcMessage + iMessagePos, "propDevice.name = %s\n", propDevice.name); + if (strcmp(propDevice.name, kacGPUName) == 0) { + piDeviceIds[nMatch] = iId; + ++nMatch; + } + } + + for (int iI = 0; iI < nMatch; ++iI) { + iMessagePos += sprintf(pcMessage + iMessagePos, "%d, ", piDeviceIds[iI]); + } + return nMatch; + +} + +void GetGpuName(int iDeviceId, char* pcName) { + memset(pcName, 0, 128); + cudaError_t err; + cudaDeviceProp propDevice; + int id = iDeviceId; + err = cudaGetDeviceProperties(&propDevice, id); + memcpy(pcName, propDevice.name, strlen(propDevice.name)*sizeof(char)); +} + + +int GetGpuCount() { + int iCudaDeviceCount = 0; + cudaGetDeviceCount(&iCudaDeviceCount); + return iCudaDeviceCount; +} diff --git a/Common/CUDA/gpuUtils.hpp.prehip b/Common/CUDA/gpuUtils.hpp.prehip new file mode 100644 index 00000000..38b518cf --- /dev/null +++ b/Common/CUDA/gpuUtils.hpp.prehip @@ -0,0 +1,18 @@ + +#ifndef GPUUTILS_HPP +#define GPUUTILS_HPP +//! @brief # of installed GPUs +int GetGpuCount(); + +//! @brief IDs of GPUs whose name is kacGPUName. +//! @note Call GetGpuCount and allocate sufficient memory for piDeviceIds. +//! @param [in] kacGPUName +//! @param [in, out] piDeviceIds. +//! @param [in] iIdCountMax. 
Return value of GetGpuCount() +int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, char* pcMessage); + +//! @brief GPU name of index iDeviceId. Allocate 128bytes for pcName before call. +void GetGpuName(int iDeviceId, char* pcName); + +#endif // GPUUTILS_HPP + diff --git a/Common/CUDA/improvedForwardProjections.cu b/Common/CUDA/improvedForwardProjections.cu index 0f32be72..7c5fbddd 100644 --- a/Common/CUDA/improvedForwardProjections.cu +++ b/Common/CUDA/improvedForwardProjections.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * CUDA function for optimized proton CT radiographies * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) @@ -20,19 +21,19 @@ Coded by: Stefanie Kaser, Benjamin Kirchmayer --------------------------------------------------------------------------*/ -#include +#include #include "mex.h" -#include +#include #include "improvedForwardProjections.hpp" #include #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",hipGetErrorString(__err));\ } \ } while (0) @@ -937,43 +938,43 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po } //Allocate Memory on GPU - cudaMalloc( (void**) &dPosIn, sizeInputs ); - cudaMalloc( (void**) &dPosOut, sizeInputs ); - cudaMalloc( (void**) &ddirIn, sizeInputs ); - cudaMalloc( (void**) &ddirOut, sizeInputs ); - cudaMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); - cudaMalloc( (void**) &dhist1, detectorMem ); - cudaMalloc( (void**) &dhist2, detectorMem ); - cudaMalloc( (void**) &dnumEntries, sizeof(int)); - cudaMalloc( (void**) &ddetectorX, 
sizeof(int)); - cudaMalloc( (void**) &ddetectorY, sizeof(int)); - cudaMalloc( (void**) &dpixelSize, 2*sizeof(float)); - cudaMalloc( (void**) &dDetectDistIn, sizeof(float)); - cudaMalloc( (void**) &dDetectDistOut, sizeof(float)); - cudaMalloc( (void**) &dEin, sizeof(float)); - cudaMalloc( (void**) &dReject, sizeof(float)); - cudaMalloc( (void**) &dHull, 5*sizeof(float)); - cudaError_t _err_alloc = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err_alloc)); + hipMalloc( (void**) &dPosIn, sizeInputs ); + hipMalloc( (void**) &dPosOut, sizeInputs ); + hipMalloc( (void**) &ddirIn, sizeInputs ); + hipMalloc( (void**) &ddirOut, sizeInputs ); + hipMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); + hipMalloc( (void**) &dhist1, detectorMem ); + hipMalloc( (void**) &dhist2, detectorMem ); + hipMalloc( (void**) &dnumEntries, sizeof(int)); + hipMalloc( (void**) &ddetectorX, sizeof(int)); + hipMalloc( (void**) &ddetectorY, sizeof(int)); + hipMalloc( (void**) &dpixelSize, 2*sizeof(float)); + hipMalloc( (void**) &dDetectDistIn, sizeof(float)); + hipMalloc( (void**) &dDetectDistOut, sizeof(float)); + hipMalloc( (void**) &dEin, sizeof(float)); + hipMalloc( (void**) &dReject, sizeof(float)); + hipMalloc( (void**) &dHull, 5*sizeof(float)); + hipError_t _err_alloc = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err_alloc)); cudaCheckErrors("GPU Allocation failed!"); //Copy Arrays to GPU - cudaMemcpy(dPosIn, posIn,sizeInputs ,cudaMemcpyHostToDevice); - cudaMemcpy(dPosOut, posOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirIn, dirIn,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirOut, dirOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dnumEntries, &numOfEntries,sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorX, &detectSizeX, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorY, &detectSizeY, sizeof(int), cudaMemcpyHostToDevice); - 
cudaMemcpy(dpixelSize, pixelSize, 2*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dEin, &ein, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dReject, &reject, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dHull, ch_param, 5*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dhist1, hist1, detectorMem, cudaMemcpyHostToDevice); - cudaMemcpy(dhist2, hist2, detectorMem, cudaMemcpyHostToDevice); + hipMemcpy(dPosIn, posIn,sizeInputs ,hipMemcpyHostToDevice); + hipMemcpy(dPosOut, posOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirIn, dirIn,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirOut, dirOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dnumEntries, &numOfEntries,sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorX, &detectSizeX, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorY, &detectSizeY, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(dpixelSize, pixelSize, 2*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dEin, &ein, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dReject, &reject, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dHull, ch_param, 5*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dhist1, hist1, detectorMem, hipMemcpyHostToDevice); + hipMemcpy(dhist2, hist2, detectorMem, hipMemcpyHostToDevice); cudaCheckErrors("Host to device transport failed!"); @@ -984,8 +985,8 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po ParticleKernel<<>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ dpixelSize, 
dDetectDistIn, dDetectDistOut, dEin, dHull, dReject); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); + hipError_t _err = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err)); cudaCheckErrors("Kernel fail!"); //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); @@ -993,12 +994,12 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po //sumHist<<>>(dhist1, dhist2); //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + //hipMemcpy(outProjection, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist1, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist2, dhist2,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(&reject, dReject,sizeof(float) ,hipMemcpyDeviceToHost); + //hipError_t _errcp = hipGetLastError(); + //mexPrintf("%s \n", hipGetErrorString(_errcp)); cudaCheckErrors("Device to host transport failed!"); for(int j = 0; j +#include "mex.h" +#include +#include "improvedForwardProjections.hpp" +#include +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + } \ +} while (0) + + +__device__ int SolvePolynomial(float*x, float a, float b, float c){ + // Calculates real roots of a third-order polynomial function using Vieta's method and Cardano's method + // We obtain a polynomial of the form x³ + ax² + bx + c = 0 and reduce it to z³+pz+q = 0 + // Herefore, we have to make a substitution: x = z - a/3 + float p = b - a*a / 3.0; + float q = 
2*a*a*a/27.0 - a*b / 3.0 + c; + float disc = q*q/4.0 + p*p*p/27.0; + if(disc > 0){ + float u = cbrt(-0.5*q + sqrt(disc)); + float v = cbrt(-0.5*q - sqrt(disc)); + x[0] = u + v - a/3.0; // don't forget to substitute back z --> x + return 1; + } + else if(disc == 0 && p == 0){ + x[0] = -a/3.0; // don't forget to substitute back z --> x + return 1; + } + else if(disc == 0 && p != 0){ + x[0] = 3.0*q/p - a/3.0; // don't forget to substitute back z --> x + x[1] = -3.0*q/(2.0*p) - a/3.0; + return 2; + } + else{ + x[0] = -sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p))) + pi/3.0) - a/3.0; // don't forget to substitute back z --> x + x[1] = sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p)))) - a/3.0; + x[2] = -sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p))) - pi/3.0) - a/3.0; + return 3; + } +} + +__device__ float cspline(float t, float a, float b, float c, float d){ + + return a*(t*t*t) + b*(t*t) + c*t +d; + +} + +__device__ void SimpleSort(float* arr, int size_arr){ + // Insertion sorting method + float curr_elem; + int j; + + for (int i=1; i=0 && curr_elem0){ + + float z_1 = -p/2.0 + sqrt(disc); + float z_2 = -p/2.0 - sqrt(disc); + float z_solve; + + if(in_or_out == 1){ + z_solve = min(z_1, z_2); + } + else { + z_solve = max(z_1, z_2); + } + + float x_solve = kx*z_solve + dx; + + float ky = direction[1]; + float dy = position[1] - ky*detOff; + float y_solve = ky*z_solve + dy; + + if(-h/2 <= y_solve && y_solve <= h/2){ + + HullIntercept[0] = x_solve; + HullIntercept[1] = y_solve; + HullIntercept[2] = z_solve; + + return 0; + } + else{ + float z1_h = (1.0/ky) * (0.5*h-dy); + float z2_h = (1.0/ky) * (-0.5*h-dy); + + if(in_or_out == 1){ + z_solve = min(z1_h, z2_h); + if(dy > 0){y_solve = -h*0.5;} + else{y_solve = h*0.5;} + x_solve = kx*z_solve + dx; + } + else { + z_solve = max(z1_h, z2_h); + if(dy < 0){y_solve = -h*0.5;} + else{y_solve = h*0.5;} + x_solve = kx*z_solve + dx; + } + + if(min(z_1, z_2) <= z_solve && z_solve 
<= max(z_1, z_2)){ + + HullIntercept[0] = x_solve; + HullIntercept[1] = y_solve; + HullIntercept[2] = z_solve; + + return 0; + } + + else{return 1;}} + } +else{return 1;} +} + + +__device__ int MinMax(float* solutions, float a, float b, float c){ + float p = 2*b/(3*a); + float q = c / (3*a); + float disc = 0.25*p*p - q; + if (disc > 0){ + solutions[0] = -0.5*p + sqrt(disc); + solutions[1] = -0.5*p - sqrt(disc); + return 0; + } + solutions[0] = -1; + solutions[1] = -1; + return 1; +} + + +__device__ int calcInterceptsLinear(float* LinInterceptsVec, float* start, float* stop, float* direction, float* pix, int maxIntercep, bool* protFlag){ + float boundary; + int counter = 0; + int nx, ny; + nx = int(abs(stop[0] - start[0])/pix[0]); + ny = int(abs(stop[1] - start[1])/pix[1]); + if(nx+ny>=maxIntercep){ + *protFlag = false; + return 1;} + + if (int(stop[0]/pix[0]) == int(start[0]/pix[0]) && int(stop[1]/pix[1]) == int(start[1]/pix[1])) { + *protFlag = true; + return 0; + } + + if (int(stop[0]/pix[0]) != int(start[0]/pix[0])) { + float k = direction[0]; + float d = start[0] - k*start[2]; + boundary = trunc( ((stop[0] > start[0]) ? stop[0]:start[0])/pix[0])*pix[0]; + + for (int ix=0; ix start[2] && intercept < stop[2]){ + LinInterceptsVec[ix] = intercept; + counter++; + if (counter >= maxIntercep){ + *protFlag = false; + return counter;} + } + } + } + + if (int(stop[1]/pix[1]) != int(start[1]/pix[1])) { + float k = direction[1]; + float d = start[1] - k*start[2]; + boundary = trunc( ((stop[1] > start[1]) ? stop[1]:start[1])/pix[1])*pix[1]; + for (int iy=nx; iy start[2] && intercept < stop[2]){ + LinInterceptsVec[iy] = intercept; + counter++; + if(counter >= maxIntercep){ + *protFlag = false; + return counter;} + } + } + } + int diff = maxIntercep - counter; + for(int j = 0; j 0){ + float cand = a[0] * solutions[0]*solutions[0]*solutions[0] + b[0] * solutions[0]*solutions[0] + c[0] * solutions[0] + d[0]; + if (cand > d[0] && cand > pos1[0]){ + (oneX > zeroX) ? 
oneX:zeroX=cand; + } + else if(cand < d[0] && cand < pos1[0]){ + (oneX < zeroX) ? oneX:zeroX=cand; + } + } + + if (solutions[1] < 1 && solutions[1] > 0){ + float cand = a[0] * solutions[1]*solutions[1]*solutions[1] + b[0] * solutions[1]*solutions[1] + c[0] * solutions[1] + d[0]; + if (cand > oneX && cand > zeroX){ + (oneX > zeroX) ? oneX:zeroX=cand; + } + else if(cand < oneX && cand < zeroX){ + (oneX < zeroX) ? oneX:zeroX=cand; + } + } + } + + + test = MinMax(solutions, a[1], b[1], c[1]); + if (test == 0){ + if (solutions[0] < 1 && solutions[0] > 0){ + float cand = a[1] * solutions[0]*solutions[0]*solutions[0] + b[1] * solutions[0]*solutions[0] + c[1] * solutions[0] + d[1]; + if (cand > d[1] && cand > pos1[1]){ + (oneY > zeroY) ? oneY:zeroY=cand; + } + else if(cand < d[1] && cand < pos1[1]){ + (oneY < zeroY) ? oneY:zeroY=cand; + } + } + + if (solutions[1] < 1 && solutions[1] > 0){ + float cand = a[1] * solutions[1]*solutions[1]*solutions[1] + b[1] * solutions[1]*solutions[1] + c[1] * solutions[1] + d[1]; + if (cand > oneY && cand > zeroY){ + (oneY > zeroY) ? oneY:zeroY=cand; + } + else if(cand < oneY && cand < zeroY){ + (oneY < zeroY) ? oneY:zeroY=cand; + } + } + } + + nx = int(abs(oneX - zeroX) / pixelSize[0]); + ny = int(abs(oneY - zeroY) / pixelSize[1]); + if (nx + ny == 0) { + *protFlag = true; + return 0; + } + + if ((nx + ny) <= maxIntercep){ + + if (int(oneX/pixelSize[0]) != int(zeroX/pixelSize[0])) { + boundary = trunc( ((oneX > zeroX) ? oneX:zeroX)/pixelSize[0])*pixelSize[0]; + for (int ix=0; ix 0. ){ + if (counter >=maxIntercep){break;} + InterceptsVec[counter] = IntercepX[kx]; + counter++; + } + }//kx + if (counter >=maxIntercep){break;} + } + } + + if ( int(oneY/pixelSize[1]) != int(zeroY/pixelSize[1])) { + boundary = trunc( ((oneY > zeroY) ? oneY:zeroY)/pixelSize[1])*pixelSize[1]; + for (int iy=0; iy 0.) 
){ + if (counter >=maxIntercep){break;} + InterceptsVec[counter] = IntercepY[ky]; + counter++; + } + }//ky + if (counter >=maxIntercep){break;} + } + } + + if (counter >= maxIntercep){ // || counter == 0){ + *protFlag = false; + return counter; + }else{ + + + int diff = maxIntercep - counter; + for(int j = 0; j this is too slow! 7 s instead of 1.5 s + tInterceptsVec = new float[customsize]; + delete[] tInterceptsVec;*/ + /*float *ptr; ---> this is too slow! 7.3s instead of 1.5 s + ptr = (float*) malloc(customsize * sizeof(float)); + free(ptr);*/ + + unsigned int protonIndex = blockIdx.x*blockDim.x + threadIdx.x; + float dimX, dimY, lk, lenX, lenY; + float lenZ = abs(*detectDistIn) + abs(*detectDistOut); + dimX = (float) *detectSizeX; + dimY = (float) *detectSizeY; + + //Dereference input parameters + int entries, dSizeX, dSizeY; + // float pix; + + entries = *numOfEntries; + dSizeX = *detectSizeX; + dSizeY = *detectSizeY; + // pix = *pixelSize; + + + if(hull[3] == 0){ + lenX = sqrt((devicePosOut[protonIndex] - devicePosIn[protonIndex]) * (devicePosOut[protonIndex] - devicePosIn[protonIndex]) \ + + lenZ*lenZ); + lenY = sqrt((devicePosOut[protonIndex + entries] - devicePosIn[protonIndex + entries]) * (devicePosOut[protonIndex + entries] - devicePosIn[protonIndex + entries]) \ + + lenZ*lenZ); + + float lambda0, lambda1, ref_wepl; + ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); + lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl) * (p_wepl[protonIndex]/ref_wepl); + lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl) * (p_wepl[protonIndex]/ref_wepl); + + float a[2], b[2], c[2], d[2], pos1[2]; + + //Allocate memory for all pointers + // Calculate optimized xdir_in + devicedirIn[protonIndex] = devicedirIn[protonIndex] \ + / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! 
+ devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; + + // Calculate optimized ydir_in + devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ + / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! + devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; + + // Calculate optimized xdir_out + devicedirOut[protonIndex] = devicedirOut[protonIndex] \ + / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! + devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; + + // Calculate optimized ydir_out + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ + / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; + + // Calculate spline parameters + a[0] = devicePosIn[protonIndex]*2. + devicedirIn[protonIndex] - 2.*devicePosOut[protonIndex] + devicedirOut[protonIndex]; + a[1] = devicePosIn[protonIndex + entries]*2. 
+ devicedirIn[protonIndex + entries] - \ + 2.*devicePosOut[protonIndex + entries] + devicedirOut[protonIndex + entries]; + + b[0] = -3.*devicePosIn[protonIndex] -2.*devicedirIn[protonIndex] + 3.*devicePosOut[protonIndex] - devicedirOut[protonIndex]; + b[1] = -3.*devicePosIn[protonIndex + entries] -2.* devicedirIn[protonIndex + entries] \ + + 3.*devicePosOut[protonIndex + entries] - devicedirOut[protonIndex + entries]; + + c[0] = devicedirIn[protonIndex]; + c[1] = devicedirIn[protonIndex + entries]; + + d[0] = devicePosIn[protonIndex]; + d[1] = devicePosIn[protonIndex + entries]; + + pos1[0] = devicePosOut[protonIndex]; + pos1[1] = devicePosOut[protonIndex + entries]; + + /* --------------------------------------------------------------------------------- */ + /* ------------------------ Start without Hull (CS only) -------------------------- */ + /* --------------------------------------------------------------------------------- */ + int count; + bool status = false; + float InterceptsVec[vecSizeCS] = {0}; + + count = calcIntercepts(InterceptsVec, a, b, c, d, pos1, pix, &status, vecSizeCS); + + if (status) { + int indX, indY, linInd; + float tOld = 0.0; + if (count==0){ + indX = int(pos1[0]/pix[0]+dimX/2.); // REPLACE: pos1 by pos0 + indY = int(pos1[1]/pix[1]+dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], 1.0f); + } + + } + else{ + for(int i= 0; i<=count; i++){ + lk = (InterceptsVec[i]- tOld)*lenZ; + if(tOld == 0){ + indX = int(d[0]/pix[0] +dimX/2); + indY = int(d[1]/pix[1] +dimY/2); + linInd = indY + indX*(dSizeY); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + + }else if(i == count){ + lk = lenZ - 
InterceptsVec[i-1]*lenZ; + indX = int(pos1[0]/pix[0] +dimX/2); + indY = int(pos1[1]/pix[1] +dimY/2); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); + } + + }else{ + indX = int(cspline(InterceptsVec[i] - eps, a[0], b[0], c[0], d[0])/pix[0] +dimX/2); + indY = int(cspline(InterceptsVec[i] - eps, a[1], b[1], c[1], d[1])/pix[1] +dimY/2); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + } + + }//i + }//if - Intercepts + } + else{ + atomicAdd(reject, 1.0); + } +/* ------------------------ End no Hull calculation (CS only) -------------------------- */ + } + +else{ + // WEIGHTING FACTORS FOR CHANNELS I + float weight_air_in = 0.00479; + float weight_air_out = 0.00479; + + float HullIn[3], HullOut[3], initpos[3], exitpos[3]; + float initdir[2], exitdir[2]; + + initpos[0] = devicePosIn[protonIndex]; + initpos[1] = devicePosIn[protonIndex + entries]; + initpos[2] = *detectDistIn; + + exitpos[0] = devicePosOut[protonIndex]; + exitpos[1] = devicePosOut[protonIndex + entries]; + exitpos[2] = *detectDistOut; + + initdir[0] = devicedirIn[protonIndex]; + initdir[1] = devicedirIn[protonIndex + entries]; + + exitdir[0] = devicedirOut[protonIndex]; + exitdir[1] = devicedirOut[protonIndex + entries]; + + int check = hullEntryExit(HullIn, initpos, initdir, 1, hull, *detectDistIn); + + if(check == 0){ + check = hullEntryExit(HullOut, exitpos, exitdir, 0, hull, *detectDistOut); + } + + if(check == 0 && HullOut[2] > HullIn[2]){ + /* --------------------------------------------------------------------------------- */ + /* ------------------------ Start with Hull + SL outside -------------------------- 
*/ + /* --------------------------------------------------------------------------------- */ + const int hullIntercep = int(vecSizeCS); + const int airIntercepIn = int(vecSizeIn); + const int airIntercepOut = int(vecSizeOut); + bool status1 = false; + bool status2 = false; + bool status3 = false; + + int countIn, countHull, countOut; + float InterceptsVecOut[airIntercepOut] = {0}; + float InterceptsVecIn[airIntercepIn] = {0}; + float InterceptsVecHull[hullIntercep] = {0}; + lenX = sqrt((HullOut[0] - HullIn[0])*(HullOut[0] - HullIn[0]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); + lenY = sqrt((HullOut[1] - HullIn[1])*(HullOut[1] - HullIn[1]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); + + countIn = calcInterceptsLinear(InterceptsVecIn, initpos, HullIn, initdir, pix, airIntercepIn, &status1); + countOut = calcInterceptsLinear(InterceptsVecOut, HullOut, exitpos, exitdir, pix, airIntercepOut, &status2); + + /* ------------ CUBIC SPLINE PREPARATIONS ---------------- */ + float lambda0, lambda1, ref_wepl; + ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); + lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); + lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); + + float a[2], b[2], c[2], d[2], pos1[2]; + + //Allocate memory for all pointers + // Calculate optimized xdir_in + devicedirIn[protonIndex] = devicedirIn[protonIndex] \ + / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! + devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; + + // Calculate optimized ydir_in + devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ + / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! 
+ devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; + + // Calculate optimized xdir_out + devicedirOut[protonIndex] = devicedirOut[protonIndex] \ + / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! + devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; + + // Calculate optimized ydir_out + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ + / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; + + // Calculate spline parameters + a[0] = HullIn[0]*2. + devicedirIn[protonIndex] - 2.*HullOut[0] + devicedirOut[protonIndex]; + a[1] = HullIn[1]*2. + devicedirIn[protonIndex + entries] - \ + 2.*HullOut[1] + devicedirOut[protonIndex + entries]; + + b[0] = -3.*HullIn[0] -2.*devicedirIn[protonIndex] + 3.*HullOut[0] - devicedirOut[protonIndex]; + b[1] = -3.*HullIn[1] -2.* devicedirIn[protonIndex + entries] \ + + 3.*HullOut[1] - devicedirOut[protonIndex + entries]; + + c[0] = devicedirIn[protonIndex]; + c[1] = devicedirIn[protonIndex + entries]; + + d[0] = HullIn[0]; + d[1] = HullIn[1]; + + pos1[0] = HullOut[0]; + pos1[1] = HullOut[1]; + + countHull = calcIntercepts(InterceptsVecHull, a, b, c, d, pos1, pix, &status3, hullIntercep); + /* -------------------- End CS Preparations! 
-------------- */ + + if(status1 && status2 && status3){ + float tOld = initpos[2]; + int indX, indY, linInd; + + // WEIGHTING FACTORS FOR CHANNELS II + float weight_water = 1; // p_wepl[protonIndex]/(len_b*weight_air_in); + + // ---------------------------------------- Start with SL from detector to hull + if (countIn == 0){ + indX = int(initpos[0]/pix[0] + dimX/2.); + indY = int(initpos[1]/pix[1] + dimY/2.); + lk = HullIn[2] - initpos[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + for(int i= 0; i<=countIn; i++){ + lk = InterceptsVecIn[i] - tOld; + if(i == 0){ + indX = int(initpos[0]/pix[0] + dimX/2.); + indY = int(initpos[1]/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecIn[i]; + } + } + else if(i == countIn){ + lk = HullIn[2] - InterceptsVecIn[i-1]; + indX = int(HullIn[0]/pix[0] + dimX/2.); + indY = int(HullIn[1]/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + indX = int(((initdir[0]*(InterceptsVecIn[i]-eps) + (initpos[0] - initdir[0] * initpos[2])))/pix[0] + dimX/2.); + indY = int(((initdir[1]*(InterceptsVecIn[i]-eps) + (initpos[1] - initdir[1] * initpos[2])))/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < 
lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecIn[i]; + } + } + } + } // end else + // --------------------------- CS within hull + + tOld = 0.0; + if (countHull==0){ + indX = int(HullIn[0]/pix[0] + dimX/2.); + indY = int(HullIn[1]/pix[1] + dimY/2.); + lk = HullOut[2] - HullIn[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + + } else{ + for(int i= 0; i<=countHull; i++){ + lk = (InterceptsVecHull[i] - tOld)*(HullOut[2] - HullIn[2]); + if(tOld == 0){ + indX = int(d[0]/pix[0] + dimX/2.); + indY = int(d[1]/pix[1] + dimY/2.); + linInd = indY + indX*(dSizeY); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVecHull[i]; + + }else if(i == countHull){ + lk = (HullOut[2] - HullIn[2]) - InterceptsVecHull[i-1]*(HullOut[2] - HullIn[2]); + indX = int(pos1[0]/pix[0] + dimX/2.); + indY = int(pos1[1]/pix[1] + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + + }else{ + indX = int(cspline(InterceptsVecHull[i] -eps, a[0], b[0], c[0], d[0])/pix[0] + dimX/2.); + indY = int(cspline(InterceptsVecHull[i] -eps, a[1], b[1], c[1], d[1])/pix[1] + dimY/2.); 
+ + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVecHull[i]; + } + + }//i + } + + // --------------------------- SL from hull to detector + tOld = HullOut[2]; + if (countOut == 0){ + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + lk = exitpos[2] - HullOut[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + for(int i= 0; i<=countOut; i++){ + lk = abs(InterceptsVecOut[i] - tOld); + if(i == 0){ + indX = int(HullOut[0]/pix[0] + dimX/2.); + indY = int(HullOut[1]/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecOut[i]; + } + } + else if(i == countOut){ + lk = exitpos[2] - InterceptsVecOut[i-1]; + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + indX = int(((exitdir[0]*(InterceptsVecOut[i]-eps) + (HullOut[0] - exitdir[0] * HullOut[2])))/pix[0] + dimX/2.); + indY = 
int(((exitdir[1]*(InterceptsVecOut[i]-eps) + (HullOut[1] - exitdir[1] * HullOut[2])))/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecOut[i]; + } + } + } + } // end else + } + else{ + atomicAdd(reject, 1.0); + } + + /* --------------------------- End Hull + SL outside ------------------------------- */ + + } + + else{ + + /* --------------------------------------------------------------------------------- */ + /* ----------------------------- Start with SL only! ------------------------------ */ + /* --------------------------------------------------------------------------------- */ + int count; + bool status = false; + float InterceptsVec[vecSizeCS] = {0}; + + float initpos[3], exitpos[3]; + float mydir[2]; + initpos[0] = devicePosIn[protonIndex]; + initpos[1] = devicePosIn[protonIndex + entries]; + initpos[2] = *detectDistIn; + exitpos[0] = devicePosOut[protonIndex]; + exitpos[1] = devicePosOut[protonIndex + entries]; + exitpos[2] = *detectDistOut; + + mydir[0] = (exitpos[0] - initpos[0])/lenZ; + mydir[1] = (exitpos[1] - initpos[1])/lenZ; // dz = 1 + count = calcInterceptsLinear(InterceptsVec, initpos, exitpos, mydir, pix, vecSizeCS, &status); + + + if (status) { + int indX, indY, linInd; + float tOld = initpos[2]; + if (count==0){ + indX = int(initpos[0]/pix[0] + dimX/2.); + indY = int(initpos[1]/pix[1] + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*1.0f); + } + + } else{ + for(int i= 0; i<=count; i++){ + lk = InterceptsVec[i] - tOld; + if(tOld == initpos[2]){ + indX = int(initpos[0]/pix[0] 
+ dimX/2.); + indY = int(initpos[1]/pix[1] + dimY/2.); + linInd = indY + indX*(dSizeY); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + + }else if(i == count){ + lk = exitpos[2] - InterceptsVec[i-1]; + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + + }else{ + indX = int(((mydir[0]*(InterceptsVec[i]-eps) + (initpos[0] - mydir[0] * (initpos[2]))))/pix[0] + dimX/2.); + indY = int(((mydir[1]*(InterceptsVec[i]-eps) + (initpos[1] - mydir[1] * (initpos[2]))))/pix[1] + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + } + + } //i + }//if - Intercepts + } + else{ + // *reject += 1; + atomicAdd(reject, 1.0); + } + /* ------------------------------ End SL only! 
------ -------------------------- */ + } + } +} + +__global__ void sumHist(float* hist, float* histNorm){ + + unsigned int index = blockIdx.x*blockDim.x + threadIdx.x; + hist[index] = hist[index]/histNorm[index]; +} + +__host__ void ParticleProjections(float * outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, \ + float* p_wepl, int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, \ + float detectDistIn, float detectDistOut, float ein, float* ch_param){ + + /* + Detect Size = 400x400 + Prepare Input for GPU*/ + + const int sizeInputs = 2*numOfEntries*sizeof(float); + const int detectorMem = detectSizeX*detectSizeY*sizeof(float); + float reject = 0.0; + + float *dPosIn, *dPosOut, *ddirIn, *ddirOut, *dhist1, *dhist2, *d_wepl, *dHull; + int *dnumEntries, *ddetectorX, *ddetectorY; + float *dpixelSize, *dDetectDistIn, *dDetectDistOut, *dEin, *dReject; + + float *hist1, *hist2; + hist1 = new float[detectSizeX*detectSizeY]; + hist2 = new float[detectSizeX*detectSizeY]; + for(int i = 0; i>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ + dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject); + cudaError_t _err = cudaGetLastError(); + mexPrintf("%s \n", cudaGetErrorString(_err)); + cudaCheckErrors("Kernel fail!"); + + //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); + //dim3 block_sum(64,1,1); + //sumHist<<>>(dhist1, dhist2); + + //Copy result from device to host + //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); + //cudaError_t _errcp = cudaGetLastError(); + //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + cudaCheckErrors("Device to host transport failed!"); + + for(int j = 0; j -#include +#include +#include #include #ifndef 
improvedForwardProjections_H #define improvedForwardProjections_H diff --git a/Common/CUDA/improvedForwardProjections.hpp.prehip b/Common/CUDA/improvedForwardProjections.hpp.prehip new file mode 100644 index 00000000..6da25b63 --- /dev/null +++ b/Common/CUDA/improvedForwardProjections.hpp.prehip @@ -0,0 +1,263 @@ +/*------------------------------------------------------------------------- + * CUDA function for optimized proton CT radiographies + * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) + * and based on the method of Collins-Fekete (https://doi.org/10.1088/0031-9155/61/23/8232) + */ + +/*-------------------------------------------------------------------------- + This file is part of the TIGRE Toolbox + + Copyright (c) 2015, University of Bath and + CERN-European Organization for Nuclear Research + All rights reserved. + + License: Open Source under BSD. + See the full license at + https://github.com/CERN/TIGRE/blob/master/LICENSE + + Contact: tigre.toolbox@gmail.com + Codes: https://github.com/CERN/TIGRE/ + Coded by: Stefanie Kaser, Benjamin Kirchmayer +--------------------------------------------------------------------------*/ + +#include +#include +#include +#ifndef improvedForwardProjections_H +#define improvedForwardProjections_H +#define pi 3.14159265359 +#define eps 1e-8 +#define vecSizeCS 220 +#define vecSizeOut 100 +#define vecSizeIn 10 +#define maxthreads 256 +//#include +//#include + +void ParticleProjections(float* outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, float* p_wepl, \ + int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, float detectDistIn, float detectDistOut, float ein, float* ch_param); + +__device__ int calcIntercepts(float* InterceptsVec ,float* a, float* b, \ + float* c, float* d, float* pos1, float pixelSize, bool* protFlag, int maxIntercep); + +__device__ int SolvePolynomial(float*x, float a, float b, float c); + 
+__device__ int MinMax(float* solutions, float a, float b, float c); + +__device__ void SimpleSort(float* arr, int size_arr); + +__global__ void ParticleKernel(float* dhist1, float* dhist2, float* devicePosIn, float* devicePosOut, float* devicedirIn, \ + float* devicedirOut ,float* p_wepl,int* numOfEntries, int* detectSizeX, int *detectSizeY, \ + float* pixelSize, float *detectDistIn, float *detectDistOut, float *ein, float *hull, float *reject); + +__device__ int hullEntryExit(float* HullIntercept, float* position, float* direction, int in_or_out, float *hullparams, float detOff); + +__device__ int calcInterceptsLinear(float* LinInterceptsVec, float* start, float* stop, float* direction, float pix, int maxIntercep, \ + bool* protFlag); + +void ParticleProjectionsCone(float* outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, float* p_wepl, \ + int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, float detectDistIn, float detectDistOut, float sourcePos, \ + float ein, float* ch_param); + +__device__ int calcInterceptsCone(float* InterceptsVec ,float* a, float* b, \ + float* c, float* d, float* pos1, float pixelSize, bool* protFlag, int maxIntercep, \ + float sourcePos, float din, float dout); + +__device__ int SolvePolynomialCone(float*x, float a, float b, float c); + +__device__ void SimpleSortCone(float* arr, int size_arr); + +__device__ int MinMaxCone(float* solutions, float a, float b, float c); + +__global__ void ParticleKernelCone(float* dhist1, float* dhist2, float* devicePosIn, float* devicePosOut, float* devicedirIn, \ + float* devicedirOut ,float* p_wepl,int* numOfEntries, int* detectSizeX, int *detectSizeY, \ + float* pixelSize, float *detectDistIn, float *detectDistOut, float *ein, float *hull, float *reject, \ + float* sourceDist); + +__device__ int hullEntryExitCone(float* HullIntercept, float* position, float* direction, int in_or_out, float *hullparams, float detOff); + +__device__ int 
calcInterceptsLinearCone(float* LinInterceptsVec, float* start, float* stop, float* direction, float pix, int maxIntercep, \ + bool* protFlag, float sourcePos); + +#endif + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Common/CUDA/improvedForwardProjections_cone.cu b/Common/CUDA/improvedForwardProjections_cone.cu index 7a4f6b46..d11657a9 100644 --- a/Common/CUDA/improvedForwardProjections_cone.cu +++ b/Common/CUDA/improvedForwardProjections_cone.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * CUDA function for optimized proton CT radiographies * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) @@ -21,19 +22,19 @@ --------------------------------------------------------------------------*/ -#include +#include #include "mex.h" -#include +#include #include "improvedForwardProjections.hpp" // #include // #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",hipGetErrorString(__err));\ } \ } while (0) @@ -1133,45 +1134,45 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float } //Allocate Memory on GPU - cudaMalloc( (void**) &dPosIn, sizeInputs ); - cudaMalloc( (void**) &dPosOut, sizeInputs ); - cudaMalloc( (void**) &ddirIn, sizeInputs ); - cudaMalloc( (void**) &ddirOut, sizeInputs ); - cudaMalloc( 
(void**) &d_wepl, numOfEntries*sizeof(float)); - cudaMalloc( (void**) &dhist1, detectorMem ); - cudaMalloc( (void**) &dhist2, detectorMem ); - cudaMalloc( (void**) &dnumEntries, sizeof(int)); - cudaMalloc( (void**) &ddetectorX, sizeof(int)); - cudaMalloc( (void**) &ddetectorY, sizeof(int)); - cudaMalloc( (void**) &dpixelSize, 2*sizeof(float)); - cudaMalloc( (void**) &dDetectDistIn, sizeof(float)); - cudaMalloc( (void**) &dDetectDistOut, sizeof(float)); - cudaMalloc( (void**) &dSourceDist, sizeof(float)); - cudaMalloc( (void**) &dEin, sizeof(float)); - cudaMalloc( (void**) &dReject, sizeof(float)); - cudaMalloc( (void**) &dHull, 5*sizeof(float)); - cudaError_t _err_alloc = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err_alloc)); + hipMalloc( (void**) &dPosIn, sizeInputs ); + hipMalloc( (void**) &dPosOut, sizeInputs ); + hipMalloc( (void**) &ddirIn, sizeInputs ); + hipMalloc( (void**) &ddirOut, sizeInputs ); + hipMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); + hipMalloc( (void**) &dhist1, detectorMem ); + hipMalloc( (void**) &dhist2, detectorMem ); + hipMalloc( (void**) &dnumEntries, sizeof(int)); + hipMalloc( (void**) &ddetectorX, sizeof(int)); + hipMalloc( (void**) &ddetectorY, sizeof(int)); + hipMalloc( (void**) &dpixelSize, 2*sizeof(float)); + hipMalloc( (void**) &dDetectDistIn, sizeof(float)); + hipMalloc( (void**) &dDetectDistOut, sizeof(float)); + hipMalloc( (void**) &dSourceDist, sizeof(float)); + hipMalloc( (void**) &dEin, sizeof(float)); + hipMalloc( (void**) &dReject, sizeof(float)); + hipMalloc( (void**) &dHull, 5*sizeof(float)); + hipError_t _err_alloc = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err_alloc)); cudaCheckErrors("GPU Allocation failed!"); //Copy Arrays to GPU - cudaMemcpy(dPosIn, posIn,sizeInputs ,cudaMemcpyHostToDevice); - cudaMemcpy(dPosOut, posOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirIn, dirIn,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirOut, 
dirOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dnumEntries, &numOfEntries,sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorX, &detectSizeX, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorY, &detectSizeY, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(dpixelSize, pixelSize, 2*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dSourceDist, &sourcePos, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dEin, &ein, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dReject, &reject, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dHull, ch_param, 5*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dhist1, hist1, detectorMem, cudaMemcpyHostToDevice); - cudaMemcpy(dhist2, hist2, detectorMem, cudaMemcpyHostToDevice); + hipMemcpy(dPosIn, posIn,sizeInputs ,hipMemcpyHostToDevice); + hipMemcpy(dPosOut, posOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirIn, dirIn,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirOut, dirOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dnumEntries, &numOfEntries,sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorX, &detectSizeX, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorY, &detectSizeY, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(dpixelSize, pixelSize, 2*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dSourceDist, &sourcePos, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dEin, &ein, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dReject, &reject, sizeof(float), 
hipMemcpyHostToDevice); + hipMemcpy(dHull, ch_param, 5*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dhist1, hist1, detectorMem, hipMemcpyHostToDevice); + hipMemcpy(dhist2, hist2, detectorMem, hipMemcpyHostToDevice); cudaCheckErrors("Host to device transport failed!"); @@ -1182,8 +1183,8 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float ParticleKernelCone<<>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject, dSourceDist); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); + hipError_t _err = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err)); cudaCheckErrors("Kernel fail!"); //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); @@ -1191,12 +1192,12 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float //sumHist<<>>(dhist1, dhist2); //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + //hipMemcpy(outProjection, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist1, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist2, dhist2,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(&reject, dReject,sizeof(float) ,hipMemcpyDeviceToHost); + //hipError_t _errcp = hipGetLastError(); + //mexPrintf("%s \n", hipGetErrorString(_errcp)); cudaCheckErrors("Device to host transport failed!"); for(int j = 0; j +#include "mex.h" +#include +#include "improvedForwardProjections.hpp" +// #include +// #include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); 
\ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + } \ +} while (0) + + +__device__ int SolvePolynomialCone(float*x, float a, float b, float c){ + // Calculates real roots of a third-order polynomial function using Vieta's method and Cardano's method + // We obtain a polynomial of the form x³ + ax² + bx + c = 0 and reduce it to z³+pz+q = 0 + // Herefore, we have to make a substitution: x = z - a/3 + float p = b - a*a / 3.0; + float q = 2*a*a*a/27.0 - a*b / 3.0 + c; + float disc = q*q/4.0 + p*p*p/27.0; + if(disc > 0){ + float u = cbrt(-0.5*q + sqrt(disc)); + float v = cbrt(-0.5*q - sqrt(disc)); + x[0] = u + v - a/3.0; // don't forget to substitute back z --> x + return 1; + } + else if(disc == 0 && p == 0){ + x[0] = -a/3.0; // don't forget to substitute back z --> x + return 1; + } + else if(disc == 0 && p != 0){ + x[0] = 3.0*q/p - a/3.0; // don't forget to substitute back z --> x + x[1] = -3.0*q/(2.0*p) - a/3.0; + return 2; + } + else{ + x[0] = -sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p))) + pi/3.0) - a/3.0; // don't forget to substitute back z --> x + x[1] = sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p)))) - a/3.0; + x[2] = -sqrt(-4.0 * p / 3.0) * cos(1./3. 
* acos(-0.5*q*sqrt(-27./(p*p*p))) - pi/3.0) - a/3.0; + return 3; + } +} + +__device__ float csplineCone(float t, float a, float b, float c, float d){ + + return a*(t*t*t) + b*(t*t) + c*t +d; + +} + +__device__ void SimpleSortCone(float* arr, int size_arr){ + // Insertion sorting method + float curr_elem; + int j; + + for (int i=1; i=0 && curr_elem0){ + + float z_1 = -p/2.0 + sqrt(disc); + float z_2 = -p/2.0 - sqrt(disc); + float z_solve; + + if(in_or_out == 1){ + z_solve = min(z_1, z_2); + } + else { + z_solve = max(z_1, z_2); + } + + float x_solve = kx*z_solve + dx; + + float ky = direction[1]; + float dy = position[1] - ky*detOff; + float y_solve = ky*z_solve + dy; + + if(-h/2 <= y_solve && y_solve <= h/2){ + + HullIntercept[0] = x_solve; + HullIntercept[1] = y_solve; + HullIntercept[2] = z_solve; + + return 0; + } + else{ + float z1_h = (1.0/ky) * (0.5*h-dy); + float z2_h = (1.0/ky) * (-0.5*h-dy); + + if(in_or_out == 1){ + z_solve = min(z1_h, z2_h); + if(dy > 0){y_solve = -h*0.5;} + else{y_solve = h*0.5;} + x_solve = kx*z_solve + dx; + } + else { + z_solve = max(z1_h, z2_h); + if(dy < 0){y_solve = -h*0.5;} + else{y_solve = h*0.5;} + x_solve = kx*z_solve + dx; + } + + if(min(z_1, z_2) <= z_solve && z_solve <= max(z_1, z_2)){ + + HullIntercept[0] = x_solve; + HullIntercept[1] = y_solve; + HullIntercept[2] = z_solve; + + return 0; + } + + else{return 1;}} + } +else{return 1;} +} + + + +__device__ int calcInterceptsLinearCone(float* LinInterceptsVec, float* start, float* stop, float* direction, float* pix, int maxIntercep, bool* protFlag, + float sourcePos){ + float tan_alpha, d_channel; + int counter = 0; + int nx, ny; + float sdd = abs(stop[2] - sourcePos); // distance source detector + float sidd = abs(start[2] - sourcePos); // distance sourcce inital detector + int select; + + float pix_start_x = sidd * (pix[0]/sdd); + float pix_start_y = sidd * (pix[1]/sdd); + + nx = int(abs(stop[0]/pix[0] - start[0]/pix_start_x)); + ny = int(abs(stop[1]/pix[1] - 
start[1]/pix_start_y)); + if(nx+ny>=maxIntercep){ + *protFlag = false; + return 1;} + + if (int(stop[0]/pix[0]) == int(start[0]/pix_start_x) && int(stop[1]/pix[1]) == int(start[1]/pix_start_y)) { + *protFlag = true; + return 0; + } + + if (int(stop[0]/pix[0]) != int(start[0]/pix_start_x)) { + float k = direction[0]; + float d = start[0] - k*start[2]; + if(stop[0]/pix[0] > start[0]/pix_start_x){ + tan_alpha = (trunc(stop[0]/pix[0])*pix[0])/sdd; + d_channel = trunc(stop[0]/pix[0])*pix[0] - tan_alpha * stop[2]; + select = 0; + } + else{ + tan_alpha = (trunc(start[0]/pix_start_x)*pix_start_x)/sidd; + d_channel = trunc(start[0]/pix_start_x)*pix_start_x - tan_alpha * start[2]; + select = 1; + } + + for (int ix=0; ix start[2] && intercept < stop[2]){ + LinInterceptsVec[ix] = intercept; + counter++; + if (counter >= maxIntercep){ + *protFlag = false; + return counter;} + } + } + } + + if (int(stop[1]/pix[1]) != int(start[1]/pix_start_y)) { + float k = direction[1]; + float d = start[1] - k*start[2]; + if(stop[1]/pix[1] > start[1]/pix_start_y){ + tan_alpha = (trunc(stop[1]/pix[1])*pix[1])/sdd; + d_channel = trunc(stop[1]/pix[1])*pix[1] - tan_alpha * stop[2]; + select = 0; + } + else{ + tan_alpha = (trunc(start[1]/pix_start_y)*pix_start_y)/sidd; + d_channel = trunc(start[1]/pix_start_y)*pix_start_y - tan_alpha * start[2]; + select = 1; + } + + for (int iy=nx; iy start[2] && intercept < stop[2]){ + LinInterceptsVec[iy] = intercept; + counter++; + if (counter >= maxIntercep){ + *protFlag = false; + return counter;} + } + } + } + + int diff = maxIntercep - counter; + for(int j = 0; j 0){ + solutions[0] = -0.5*p + sqrt(disc); + solutions[1] = -0.5*p - sqrt(disc); + return 0; + } + solutions[0] = -1; + solutions[1] = -1; + return 1; +} + + + +__device__ int calcInterceptsCone(float* InterceptsVec ,float* a, float* b, \ + float* c, float* d, float* pos1, float* pixelSize, bool* protFlag, int maxIntercep, \ + float sourcePos, float din, float dout){ + + /*Calculates channel 
Intercepts and the lengths the proton (ion) has spent in the + corresponding channel. + Returns 1 if proton is accepted and 0 if it is rejected due to too many Intercepts + */ + float oneX, oneY, zeroX, zeroY, pix_oneX, pix_oneY, pix_zeroX, pix_zeroY; + float tan_alpha, d_channel; + float sdd_init = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! + float sidd_init = abs(din - sourcePos)/abs(dout-din); + float sdd_x = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! + float sidd_x = abs(din - sourcePos)/abs(dout-din); + float sdd_y = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! + float sidd_y = abs(din - sourcePos)/abs(dout-din); + int select; + float pix_start_x = sidd_init * (pixelSize[0]/sdd_init); + float pix_start_y = sidd_init * (pixelSize[1]/sdd_init); + zeroX = d[0]; + oneX = pos1[0]; + zeroY = d[1]; + oneY = pos1[1]; + pix_zeroX = pix_start_x; + pix_zeroY = pix_start_y; + pix_oneX = pixelSize[0]; + pix_oneY = pixelSize[1]; + + + int status, nx, ny; + float IntercepX[3]; + float IntercepY[3]; + float solutions[2]; + // counter has to be implemented despite the initial discrimination because one can not state beforehand if + // the cubic spline has more than one Intercept with the channel boundary + int counter=0; + + int test = MinMaxCone(solutions, a[0], b[0], c[0]); + if (test == 0){ + if (solutions[0] < 1 && solutions[0] > 0){ + float cand = a[0] * solutions[0]*solutions[0]*solutions[0] + b[0] * solutions[0]*solutions[0] + c[0] * solutions[0] + d[0]; + float pix_cand = (sidd_init + solutions[0]) * (pixelSize[0]/sdd_init); + if (cand/pix_cand > d[0]/pix_start_x && cand/pix_cand > pos1[0]/pixelSize[0]){ + (oneX/pix_oneX > zeroX/pix_zeroX) ? oneX:zeroX=cand; + (oneX/pix_oneX > zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; + (oneX/pix_oneX > zeroX/pix_zeroX) ? 
sdd_x:sidd_x = solutions[0] - sourcePos/(dout-din); + } + else if(cand/pix_cand < d[0]/pix_start_x && cand/pix_cand < pos1[0]/pixelSize[0]){ + (oneX/pix_oneX < zeroX/pix_zeroX) ? oneX:zeroX=cand; + (oneX/pix_oneX < zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; + (oneX/pix_oneX < zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[0] - sourcePos/(dout-din); + } + } + + if (solutions[1] < 1 && solutions[1] > 0){ + float cand = a[0] * solutions[1]*solutions[1]*solutions[1] + b[0] * solutions[1]*solutions[1] + c[0] * solutions[1] + d[0]; + float pix_cand = (sidd_init + solutions[1]) * (pixelSize[0]/sdd_init); + if (cand/pix_cand > oneX/pix_oneX && cand/pix_cand > zeroX/pix_zeroX){ + (oneX/pix_oneX > zeroX/pix_zeroX) ? oneX:zeroX=cand; + (oneX/pix_oneX > zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; + (oneX/pix_oneX > zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[1] - sourcePos/(dout-din); + } + else if(cand/pix_cand < oneX/pix_oneX && cand/pix_cand < zeroX/pix_zeroX){ + (oneX/pix_oneX < zeroX/pix_zeroX) ? oneX:zeroX=cand; + (oneX/pix_oneX < zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; + (oneX/pix_oneX < zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[1] - sourcePos/(dout-din); + } + } + } + + test = MinMaxCone(solutions, a[1], b[1], c[1]); + if (test == 0){ + if (solutions[0] < 1 && solutions[0] > 0){ + float cand = a[1] * solutions[0]*solutions[0]*solutions[0] + b[1] * solutions[0]*solutions[0] + c[1] * solutions[0] + d[1]; + float pix_cand = (sidd_init + solutions[0]) * (pixelSize[1]/sdd_init); + if (cand/pix_cand > d[1]/pix_start_y && cand/pix_cand > pos1[1]/pixelSize[1]){ + (oneY/pix_oneY > zeroY/pix_zeroY) ? oneY:zeroY=cand; + (oneY/pix_oneY > zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; + (oneY/pix_oneY > zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[0] - sourcePos/(dout-din); + } + else if(cand/pix_cand < d[1]/pix_start_y && cand/pix_cand < pos1[1]/pixelSize[1]){ + (oneY/pix_oneY < zeroY/pix_zeroY) ? oneY:zeroY=cand; + (oneY/pix_oneY < zeroY/pix_zeroY) ? 
pix_oneY:pix_zeroY = pix_cand; + (oneY/pix_oneY < zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[0] - sourcePos/(dout-din); + } + } + + if (solutions[1] < 1 && solutions[1] > 0){ + float cand = a[1] * solutions[1]*solutions[1]*solutions[1] + b[1] * solutions[1]*solutions[1] + c[1] * solutions[1] + d[1]; + float pix_cand = (sidd_init + solutions[1]) * (pixelSize[1]/sdd_init); + if (cand/pix_cand > oneY/pix_oneY && cand/pix_cand > zeroY/pix_zeroY){ + (oneY/pix_oneY > zeroY/pix_zeroY) ? oneY:zeroY=cand; + (oneY/pix_oneY > zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; + (oneY/pix_oneY > zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[1] - sourcePos/(dout-din); + } + else if(cand/pix_cand < oneY/pix_oneY && cand/pix_cand < zeroY/pix_zeroY){ + (oneY/pix_oneY < zeroY/pix_zeroY) ? oneY:zeroY=cand; + (oneY/pix_oneY < zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; + (oneY/pix_oneY < zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[1] - sourcePos/(dout-din); + } + } + } + //Check how many Intercepts will occur approximately + nx = int(abs(oneX/pix_oneX - zeroX/pix_zeroX)); + ny = int(abs(oneY/pix_oneY - zeroY/pix_zeroY)); + + if (nx + ny == 0) { + *protFlag = true; + return 0; + } + if ((nx + ny) <= maxIntercep){ + + if (int(oneX/pix_oneX) != int(zeroX/pix_zeroX)) { + if(oneX/pix_oneX > zeroX/pix_zeroX){ + tan_alpha = (trunc(oneX/pix_oneX)*pix_oneX)/sdd_x; + d_channel = trunc(oneX/pix_oneX)*pix_oneX * (sidd_init/sdd_x); + select = 0; + } + else{ + tan_alpha = (trunc(zeroX/pix_zeroX)*pix_zeroX)/sidd_x; + d_channel = trunc(zeroX/pix_zeroX)*pix_zeroX * (sidd_init/sidd_x); + select = 1; + } + for (int ix=0; ix 0. 
){ + if (counter >=maxIntercep){break;} + InterceptsVec[counter] = IntercepX[kx]; + counter++; + } + }//kx + if (counter >=maxIntercep){break;} + } + } + + if ( int(oneY/pix_oneY) != int(zeroY/pix_zeroY)) { + if(oneY/pix_oneY > zeroY/pix_zeroY){ + tan_alpha = (trunc(oneY/pix_oneY)*pix_oneY)/sdd_y; + d_channel = trunc(oneY/pix_oneY)*pix_oneY * (sidd_init/sdd_y); + select = 0; + } + else{ + tan_alpha = (trunc(zeroY/pix_zeroY)*pix_zeroY)/sidd_y; + d_channel = trunc(zeroY/pix_zeroY)*pix_zeroY * (sidd_init/sidd_y); + select = 1; + } + for (int iy=0; iy 0. ){ + if (counter >=maxIntercep){break;} + InterceptsVec[counter] = IntercepY[ky]; + counter++; + } + }//kx + if (counter >=maxIntercep){break;} + } + } + + if (counter >= maxIntercep){ // || counter == 0){ + *protFlag = false; + return counter; + } + + else{ + int diff = maxIntercep - counter; + for(int j = 0; j HullIn[2]){ + /* --------------------------------------------------------------------------------- */ + /* ------------------------ Start with Hull + SL outside -------------------------- */ + /* --------------------------------------------------------------------------------- */ + const int hullIntercep = int(vecSizeCS); + const int airIntercepIn = int(vecSizeIn); + const int airIntercepOut = int(vecSizeOut); + bool status1 = false; + bool status2 = false; + bool status3 = false; + + int countIn, countHull, countOut; + float InterceptsVecOut[airIntercepOut] = {0}; + float InterceptsVecIn[airIntercepIn] = {0}; + float InterceptsVecHull[hullIntercep] = {0}; + lenX = sqrt((HullOut[0] - HullIn[0])*(HullOut[0] - HullIn[0]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); + lenY = sqrt((HullOut[1] - HullIn[1])*(HullOut[1] - HullIn[1]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); + + float newpix[2]; + newpix[0] = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + newpix[1] = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + countIn = 
calcInterceptsLinearCone(InterceptsVecIn, initpos, HullIn, initdir, newpix, airIntercepIn, &status1, *sourceDist); + countOut = calcInterceptsLinearCone(InterceptsVecOut, HullOut, exitpos, exitdir, pix, airIntercepOut, &status2, *sourceDist); + + /* ------------ CUBIC SPLINE PREPARATIONS ---------------- */ + float lambda0, lambda1, ref_wepl; + ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); + lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); + lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); + + float a[2], b[2], c[2], d[2], pos1[2]; + + //Allocate memory for all pointers + // Calculate optimized xdir_in + devicedirIn[protonIndex] = devicedirIn[protonIndex] \ + / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! + devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; + + // Calculate optimized ydir_in + devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ + / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! + devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; + + // Calculate optimized xdir_out + devicedirOut[protonIndex] = devicedirOut[protonIndex] \ + / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! + devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; + + // Calculate optimized ydir_out + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ + / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; + + // Calculate spline parameters + a[0] = HullIn[0]*2. + devicedirIn[protonIndex] - 2.*HullOut[0] + devicedirOut[protonIndex]; + a[1] = HullIn[1]*2. 
+ devicedirIn[protonIndex + entries] - \ + 2.*HullOut[1] + devicedirOut[protonIndex + entries]; + + b[0] = -3.*HullIn[0] -2.*devicedirIn[protonIndex] + 3.*HullOut[0] - devicedirOut[protonIndex]; + b[1] = -3.*HullIn[1] -2.* devicedirIn[protonIndex + entries] \ + + 3.*HullOut[1] - devicedirOut[protonIndex + entries]; + + c[0] = devicedirIn[protonIndex]; + c[1] = devicedirIn[protonIndex + entries]; + + d[0] = HullIn[0]; + d[1] = HullIn[1]; + + pos1[0] = HullOut[0]; + pos1[1] = HullOut[1]; + + // float newpix[2]; + newpix[0] = abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + newpix[1] = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + countHull = calcInterceptsCone(InterceptsVecHull, a, b, c, d, pos1, newpix, &status3, hullIntercep, *sourceDist, HullIn[2], HullOut[2]); + /* -------------------- End CS Preparations! -------------- */ + + if(status1 && status2 && status3){ + float tOld = initpos[2]; + int indX, indY, linInd; + // WEIGHTING FACTORS FOR CHANNELS II + float weight_water = 1; + + // ---------------------------------------- Start with SL from detector to hull + float pix_start_x = abs(initpos[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + float pix_start_y = abs(initpos[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + float pix_end_x = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + float pix_end_y = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + if (countIn == 0){ + indX = int(initpos[0]/pix_start_x + dimX/2.); + indY = int(initpos[1]/pix_start_y + dimY/2.); + lk = HullIn[2] - initpos[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + for(int i= 0; i<=countIn; i++){ + lk = InterceptsVecIn[i] - tOld; + if(i 
== 0){ + indX = int(initpos[0]/pix_start_x + dimX/2.); + indY = int(initpos[1]/pix_start_y + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecIn[i]; + } + } + else if(i == countIn){ + lk = HullIn[2] - InterceptsVecIn[i-1]; + indX = int(HullIn[0]/pix_end_x + dimX/2.); + indY = int(HullIn[1]/pix_end_y + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + float curr_pix_x = abs((InterceptsVecIn[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + float curr_pix_y = abs((InterceptsVecIn[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + indX = int(((initdir[0]*(InterceptsVecIn[i]-eps) + (initpos[0] - initdir[0] * initpos[2] )))/curr_pix_x + dimX/2.); + indY = int(((initdir[1]*(InterceptsVecIn[i]-eps) + (initpos[1] - initdir[1] * initpos[2] )))/curr_pix_y + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecIn[i]; + } + } + } + } // end else + + // ---cone beam------------------------ CS within hull + + tOld = 0.0; + pix_start_x = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + pix_start_y = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + pix_end_x = 
abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + pix_end_y = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + if (countHull==0){ + indX = int(HullIn[0]/pix_start_x + dimX/2.); + indY = int(HullIn[1]/pix_start_y + dimY/2.); + lk = HullOut[2] - HullIn[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + + } else{ + for(int i= 0; i<=countHull; i++){ + lk = (InterceptsVecHull[i] - tOld)*(HullOut[2] - HullIn[2]); + if(tOld == 0){ + indX = int(d[0]/pix_start_x + dimX/2.); + indY = int(d[1]/pix_start_y + dimY/2.); + linInd = indY + indX*(dSizeY); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVecHull[i]; + + }else if(i == countHull){ + lk = (HullOut[2] - HullIn[2]) - InterceptsVecHull[i-1]*(HullOut[2] - HullIn[2]); + indX = int(pos1[0]/pix_end_x + dimX/2.); + indY = int(pos1[1]/pix_end_y + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + + }else{ + float curr_len = (InterceptsVecHull[i]-eps)*(HullOut[2]-HullIn[2]) + (HullIn[2] - *sourceDist); // abs(((InterceptsVecHull[i]-eps)*lenZ + *detectDistIn) - *sourceDist) + float curr_pix_x = curr_len * (pix[0]/abs(exitpos[2] - *sourceDist)); + float curr_pix_y = curr_len * (pix[1]/abs(exitpos[2] - *sourceDist)); + indX = 
int(csplineCone(InterceptsVecHull[i] - eps, a[0], b[0], c[0], d[0])/curr_pix_x + dimX/2.); + indY = int(csplineCone(InterceptsVecHull[i] - eps, a[1], b[1], c[1], d[1])/curr_pix_y + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVecHull[i]; + } + + }//i + } + + // --------------------------- SL from hull to detector + tOld = HullOut[2]; + pix_start_x = abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + pix_start_y = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + if (countOut == 0){ + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + lk = exitpos[2] - HullOut[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + for(int i= 0; i<=countOut; i++){ + lk = abs(InterceptsVecOut[i] - tOld); + if(i == 0){ + indX = int(HullOut[0]/pix_start_x + dimX/2.); + indY = int(HullOut[1]/pix_start_y + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecOut[i]; + } + } + else if(i == countOut){ + lk = exitpos[2] - InterceptsVecOut[i-1]; + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < 
(exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + float curr_pix_x = abs((InterceptsVecOut[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + float curr_pix_y = abs((InterceptsVecOut[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + indX = int(((exitdir[0]*(InterceptsVecOut[i]-eps) + (HullOut[0] - exitdir[0] * HullOut[2])))/curr_pix_x + dimX/2.); + indY = int(((exitdir[1]*(InterceptsVecOut[i]-eps) + (HullOut[1] - exitdir[1] * HullOut[2])))/curr_pix_y + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecOut[i]; + } + } + } + } // end else + + } + else{ + atomicAdd(reject, 1.0); + } + + /* --------------------------- End Hull + SL outside ------------------------------- */ + + } + + else{ + + /* --------------------------------------------------------------------------------- */ + /* ----------------------------- Start with SL only! 
------------------------------ */ + /* --------------------------------------------------------------------------------- */ + int count; + bool status = false; + float InterceptsVec[vecSizeCS] = {0}; + //float InterceptsLengths[vecSizeCS+1] = {0}; + + float initpos[3], exitpos[3]; + float mydir[2]; + initpos[0] = devicePosIn[protonIndex]; + initpos[1] = devicePosIn[protonIndex + entries]; + initpos[2] = *detectDistIn; + exitpos[0] = devicePosOut[protonIndex]; + exitpos[1] = devicePosOut[protonIndex + entries]; + exitpos[2] = *detectDistOut; + + mydir[0] = (exitpos[0] - initpos[0])/lenZ; + mydir[1] = (exitpos[1] - initpos[1])/lenZ; // dz = 1 + count = calcInterceptsLinearCone(InterceptsVec, initpos, exitpos, mydir, pix, vecSizeCS, &status, *sourceDist); + + // for cone beam we need this + /*float lenZ_custom = 0.0; + float head[3], tail[3]; + for (int i=0; i<=count; i++){ + if (i == 0){ + head[0] = mydir[0]*InterceptsVec[i] + 0.5*(initpos[0] + exitpos[0]); + head[1] = mydir[1]*InterceptsVec[i] + 0.5*(initpos[1] + exitpos[1]); + head[2] = InterceptsVec[i]; + InterceptsLengths[i] = sqrt(powf(head[0] - initpos[0], 2.0) + powf(head[1] - initpos[1], 2.0) + powf(head[2] - initpos[2], 2.0)); + tail[0] = head[0]; + tail[1] = head[1]; + tail[2] = head[2]; + lenZ_custom += InterceptsLengths[i]; + } + else if (i == count){ + InterceptsLengths[i] = sqrt(powf(exitpos[0] - tail[0], 2.0) + powf(exitpos[1] - tail[1], 2.0) + powf(exitpos[2] - tail[2], 2.0)); + lenZ_custom += InterceptsLengths[i]; + } + else{ + head[0] = mydir[0]*InterceptsVec[i] + 0.5*(initpos[0] + exitpos[0]); + head[1] = mydir[1]*InterceptsVec[i] + 0.5*(initpos[1] + exitpos[1]); + head[2] = InterceptsVec[i]; + InterceptsLengths[i] = sqrt(powf(head[0] - tail[0], 2.0) + powf(head[1] - tail[1], 2.0) + powf(head[2] - tail[2], 2.0)); + tail[0] = head[0]; + tail[1] = head[1]; + tail[2] = head[2]; + lenZ_custom += InterceptsLengths[i]; + } + }*/ + + float pix_start_x = abs(initpos[2] - *sourceDist) * 
(pix[0]/abs(exitpos[2] - *sourceDist)); + float pix_start_y = abs(initpos[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + + if (status) { + int indX, indY, linInd; + // exitpos[0] / (exitpos[2] - *sourceDir); + float tOld = initpos[2]; + if (count==0){ + indX = int(initpos[0]/pix_start_x + dimX/2.); + indY = int(initpos[1]/pix_start_y + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*1.0f); + } + + } else{ + for(int i= 0; i<=count; i++){ + lk = InterceptsVec[i] - tOld; + // lk = InterceptsLengths[i]; + if(i == 0){ + indX = int(initpos[0]/pix_start_x + dimX/2.); + indY = int(initpos[1]/pix_start_y + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + + }else if(i == count){ + lk = exitpos[2] - InterceptsVec[i-1]; + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + + }else{ + float curr_pix_x = abs((InterceptsVec[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + float curr_pix_y = abs((InterceptsVec[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + indX = int(((mydir[0]*(InterceptsVec[i]-eps) + (initpos[0] - mydir[0] * (initpos[2]))))/curr_pix_x+dimX/2.); + indY = int(((mydir[1]*(InterceptsVec[i]-eps) + (initpos[1] - mydir[1] * 
(initpos[2]))))/curr_pix_y+dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + } + + } //i + }//if - Intercepts + } + else{ + // *reject += 1; + atomicAdd(reject, 1.0); + } + /* ------------------------------ End SL only! ------ -------------------------- */ + } + } +} + +__global__ void sumHistCone(float* hist, float* histNorm){ + + unsigned int index = blockIdx.x*blockDim.x + threadIdx.x; + hist[index] = hist[index]/histNorm[index]; +} + +__host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, \ + float* p_wepl, int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, \ + float detectDistIn, float detectDistOut, float sourcePos, \ + float ein, float* ch_param){ + + /* + Detect Size = 400x400 + Prepare Input for GPU*/ + + const int sizeInputs = 2*numOfEntries*sizeof(float); + const int detectorMem = detectSizeX*detectSizeY*sizeof(float); + float reject = 0.0; + + float *dPosIn, *dPosOut, *ddirIn, *ddirOut, *dhist1, *dhist2, *d_wepl, *dHull; + int *dnumEntries, *ddetectorX, *ddetectorY; + float *dpixelSize, *dDetectDistIn, *dDetectDistOut, *dSourceDist, *dEin, *dReject; + + float *hist1, *hist2; + hist1 = new float[detectSizeX*detectSizeY]; + hist2 = new float[detectSizeX*detectSizeY]; + for(int i = 0; i>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ + dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject, dSourceDist); + cudaError_t _err = cudaGetLastError(); + mexPrintf("%s \n", cudaGetErrorString(_err)); + cudaCheckErrors("Kernel fail!"); + + //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); + //dim3 block_sum(64,1,1); + //sumHist<<>>(dhist1, 
dhist2); + + //Copy result from device to host + //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); + //cudaError_t _errcp = cudaGetLastError(); + //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + cudaCheckErrors("Device to host transport failed!"); + + for(int j = 0; j +#include + +float maxDistanceCubeXY(Geometry geo, float alpha,int i){ + /////////// + // Compute initial "t" so we access safely as less as out of bounds as possible. + ////////// + + float maxCubX,maxCubY; + // Forgetting Z, compute max distance: diagonal+offset + maxCubX=(geo.sVoxelX/2+ abs(geo.offOrigX[i]))/geo.dVoxelX; + maxCubY=(geo.sVoxelY/2+ abs(geo.offOrigY[i]))/geo.dVoxelY; + + return geo.DSO[i]/geo.dVoxelX-sqrt(maxCubX*maxCubX+maxCubY*maxCubY); +} + +void rollPitchYaw(Geometry geo,int i, Point3D* point){ + Point3D auxPoint; + auxPoint.x=point->x; + auxPoint.y=point->y; + auxPoint.z=point->z; + + point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->z=-sin(geo.dPitch[i])*auxPoint.x + +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y + +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; +} \ No newline at end of file diff --git a/Common/CUDA/projection.hpp.prehip b/Common/CUDA/projection.hpp.prehip new file mode 100644 index 00000000..54597d92 --- /dev/null +++ 
b/Common/CUDA/projection.hpp.prehip @@ -0,0 +1,9 @@ +#ifndef PROJECTION_HPP +#define PROJECTION_HPP + +#include "types_TIGRE.hpp" + +float maxDistanceCubeXY(Geometry geo, float alpha,int i); +void rollPitchYaw(Geometry geo,int i, Point3D* point); + +#endif diff --git a/Common/CUDA/ray_interpolated_projection.cu b/Common/CUDA/ray_interpolated_projection.cu index e71c5b59..8ab4a7e7 100644 --- a/Common/CUDA/ray_interpolated_projection.cu +++ b/Common/CUDA/ray_interpolated_projection.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for texture-memory interpolation based projection @@ -53,19 +54,19 @@ #include -#include -#include +#include +#include #include "ray_interpolated_projection.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",hipGetErrorString(__err));\ } \ } while (0) @@ -100,7 +101,7 @@ do { \ * * **/ - void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate); + void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool allocate); __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device @@ -119,7 +120,7 @@ template float* detector, const int currProjSetNumber, const int totalNoOfProjections, - cudaTextureObject_t tex){ + hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; unsigned long 
long v = blockIdx.y * blockDim.y + threadIdx.y; @@ -255,10 +256,10 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c if (!fits_in_memory){ dProjection_accum=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); - cudaMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); + hipMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); + hipMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); cudaCheckErrors("cudaMallocauxiliarty projections fail"); } } @@ -267,12 +268,12 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c // This is happening regarthless if the image fits on memory float** dProjection=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); - cudaMemset(dProjection[dev*2+i] ,0,num_bytes_proj); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); + hipMemset(dProjection[dev*2+i] ,0,num_bytes_proj); + cudaCheckErrors("hipMalloc projections fail"); } } @@ -284,34 +285,34 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & splits>1){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); #endif Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost = 0; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); float* projFloatsArrayHost = 0; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + hipHostMalloc((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); cudaCheckErrors("Error allocating auxiliary constant memory"); // Create Streams for overlapping memcopy and compute int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } @@ -324,8 +325,8 @@ int interpolation_projection(float * img, Geometry geo, float** 
result,float c - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + hipTextureObject_t *texImg = new hipTextureObject_t[deviceCount]; + hipArray **d_cuArrTex = new hipArray*[deviceCount]; for (unsigned int sp=0;sp=nangles) @@ -419,12 +420,12 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) else projection_this_block=PROJ_PER_BLOCK; - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + hipMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyHostToDevice,stream[dev*2+1]); } // 2) take the results from current compute call and add it to the code in execution. for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -436,7 +437,7 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) else projection_this_block=PROJ_PER_BLOCK; - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + hipStreamSynchronize(stream[dev*2+1]); // wait until copy is finished vecAddInPlaceInterp<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); } } // end accumulation case, where the image needs to be split @@ -446,7 +447,7 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c { for (dev = 0; 
dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on previous set on this GPU proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; if (dev+1==deviceCount) { //is it the last device? @@ -466,21 +467,21 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c else { projection_this_block=PROJ_PER_BLOCK; } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } } // Make sure Computation on kernels has finished before we launch the next batch. for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*2]); } } // End noOfKernelCalls (i) loop. // We still have the last set of projections to get out of GPUs for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -489,15 +490,15 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. + hipDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. 
cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } // Make sure everyone has done their bussiness before the next image split: for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // End image split loop. @@ -505,99 +506,99 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c /////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texImg[dev]); + hipFreeArray(d_cuArrTex[dev]); } delete[] texImg; texImg = 0; delete[] d_cuArrTex; d_cuArrTex = 0; // Freeing Stage for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection[dev*2]); + hipFree(dProjection[dev*2+1]); } free(dProjection); if(!fits_in_memory){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection_accum[dev*2]); + hipFree(dProjection_accum[dev*2+1]); } free(dProjection_accum); } freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); - cudaFreeHost(projFloatsArrayHost); + 
hipHostFree(projParamsArrayHost); + hipHostFree(projFloatsArrayHost); for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & splits>1){ - cudaHostUnregister(img); + hipHostUnregister(img); } #endif - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } -void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate) +void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool allocate) { const unsigned int num_devices = gpuids.GetLength(); //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); if(allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor + //hipArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); cudaCheckErrors("Texture memory allocation fail"); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaMemcpy3DParms copyParams = {0}; - cudaSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = 
extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params); //cudaCheckErrors("Texture memory data copy fail"); //Array creation End } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; if (geo.accuracy>1){ - texDescr.filterMode = cudaFilterModePoint; + texDescr.filterMode = hipFilterModePoint; geo.accuracy=1; } else{ - texDescr.filterMode = cudaFilterModeLinear; + texDescr.filterMode = hipFilterModeLinear; } - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); cudaCheckErrors("Texture object creation fail"); } } @@ -828,8 +829,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ size_t memtotal; int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree +#include 
+#include +#include "ray_interpolated_projection.hpp" +#include "TIGRE_common.hpp" +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + cudaDeviceReset();\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",cudaGetErrorString(__err));\ + } \ +} while (0) + + + +#define MAXTREADS 1024 +#define PROJ_PER_BLOCK 9 +#define PIXEL_SIZE_BLOCK 9 + /*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * --->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ + void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate); +__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device +__constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device + + +__global__ void vecAddInPlaceInterp(float *a, float *b, unsigned long n) +{ + int idx = blockIdx.x*blockDim.x+threadIdx.x; + // Make sure we do not go out of bounds + if (idx < n) + a[idx] = a[idx] + b[idx]; +} + + +template + __global__ void kernelPixelDetector( Geometry geo, + float* detector, + const int currProjSetNumber, + const int totalNoOfProjections, + cudaTextureObject_t tex){ + + unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; + unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long projNumber=threadIdx.z; + + if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) + return; + +#if IS_FOR_MATLAB_TIGRE + size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#else + size_t idx = (size_t)(v * (unsigned 
long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#endif + + unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array + + if(indAlpha>=totalNoOfProjections) + return; + + Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaU = projParamsArrayDev[4*projNumber+1]; + Point3D deltaV = projParamsArrayDev[4*projNumber+2]; + Point3D source = projParamsArrayDev[4*projNumber+3]; + + float DSO = projFloatsArrayDev[2*projNumber+0]; + float cropdist_init = projFloatsArrayDev[2*projNumber+1]; + + + + /////// Get coordinates XYZ of pixel UV + unsigned long pixelV = geo.nDetecV-v-1; + unsigned long pixelU = u; + + + float vectX,vectY,vectZ; + Point3D P; + P.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); + P.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); + P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); + + // Length is the ray length in normalized space + float length=__fsqrt_rd((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); + //now legth is an integer of Nsamples that are required on this line + length=ceilf(__fdividef(length,geo.accuracy));//Divide the directional vector by an integer + vectX=__fdividef(P.x -source.x,length); + vectY=__fdividef(P.y -source.y,length); + vectZ=__fdividef(P.z -source.z,length); + + +// //Integrate over the line + float tx,ty,tz; + float sum=0; + float i; + + + +// Because I have no idea how to efficiently cutoff the legth path in 3D, a very upper limit is computed (see maxdistanceCuboid) +// for the 3D case. However it would be bad to lose performance in the 3D case +// TODO: can ge really improve this? 
+ if (sphericalrotation){ + if ((2*DSO/fminf(fminf(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)+cropdist_init)/geo.accuracy < length) + length=ceilf((2*DSO/fminf(fminf(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)+cropdist_init)/geo.accuracy); + } + else{ + if ((2*DSO/fminf(geo.dVoxelX,geo.dVoxelY)+cropdist_init)/geo.accuracy < length) + length=ceilf((2*DSO/fminf(geo.dVoxelX,geo.dVoxelY)+cropdist_init)/geo.accuracy); + } + + + //Length is not actually a length, but the amount of memreads with given accuracy ("samples per voxel") + for (i=floorf(cropdist_init/geo.accuracy); i<=length; i=i+1){ + tx=vectX*i+source.x; + ty=vectY*i+source.y; + tz=vectZ*i+source.z; + + sum += tex3D(tex, tx+0.5f, ty+0.5f, tz+0.5f); // this line is 94% of time. + } + + float deltalength=sqrtf((vectX*geo.dVoxelX)*(vectX*geo.dVoxelX)+ + (vectY*geo.dVoxelY)*(vectY*geo.dVoxelY)+ + (vectZ*geo.dVoxelZ)*(vectZ*geo.dVoxelZ) ); + + detector[idx]=sum*deltalength; +} + + + +// legnth(angles)=3 x nagnles, as we have roll, pitch, yaw. +int interpolation_projection(float * img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ + + + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("Ax:Interpolated_projection:GPUselect","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("Ax:Interpolated_projection:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. 
If the code errors you might need to change the way GPU selection is performed."); + } + int dev; + + // Check free memory + size_t mem_GPU_global; + checkFreeMemory(gpuids,&mem_GPU_global); + + // printf("geo.nDetec (U, V) = %d, %d\n", geo.nDetecU, geo.nDetecV); + + size_t mem_image=(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); + size_t mem_proj =(unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV * sizeof(float); + + // Does everything fit in the GPUs? + const bool fits_in_memory = mem_image+2*PROJ_PER_BLOCK*mem_proj= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to + // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. + +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & splits>1){ + cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); +#endif + Point3D source, deltaU, deltaV, uvOrigin; + + Point3D* projParamsArrayHost = 0; + cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + float* projFloatsArrayHost = 0; + cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + cudaCheckErrors("Error allocating auxiliary constant memory"); + + // Create Streams for overlapping memcopy and compute + int nStream_device=2; + int nStreams=deviceCount*nStream_device; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < nStream_device; ++i){ + cudaStreamCreate(&stream[i+dev*nStream_device]); + + } + } + cudaCheckErrors("Stream creation 
fail"); + int nangles_device=(nangles+deviceCount-1)/deviceCount; + int nangles_last_device=(nangles-(deviceCount-1)*nangles_device); + unsigned int noOfKernelCalls = (nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK + unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. + int projection_this_block; + + + + cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; + cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + for (unsigned int sp=0;sp=nangles) + break; + if ((i*PROJ_PER_BLOCK+j)>=nangles_device) + break; + geoArray[sp].alpha=angles[proj_global*3]; + geoArray[sp].theta=angles[proj_global*3+1]; + geoArray[sp].psi =angles[proj_global*3+2]; + + is_spherical+=abs(geoArray[sp].theta)+abs(geoArray[sp].psi); + + //precomute distances for faster execution + maxdist=maxdistanceCuboid(geoArray[sp],proj_global); + //Precompute per angle constant stuff for speed + computeDeltas(geoArray[sp], proj_global, &uvOrigin, &deltaU, &deltaV, &source); + //Ray tracing! 
+ projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection + projParamsArrayHost[4*j+1]=deltaU; + projParamsArrayHost[4*j+2]=deltaV; + projParamsArrayHost[4*j+3]=source; + + projFloatsArrayHost[2*j]=geo.DSO[proj_global]; + projFloatsArrayHost[2*j+1]=floor(maxdist); + } + + cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*nStream_device]); + cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*nStream_device]); + cudaStreamSynchronize(stream[dev*nStream_device]); + + + //TODO: we could do this around X and Y axis too, but we would need to compute the new axis of rotation (not possible to know from jsut the angles) + if (!is_spherical){ + kernelPixelDetector<<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); + } + else{ + kernelPixelDetector <<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); + } + } + + + // Now that the computation is happening, we need to either prepare the memory for + // combining of the projections (splits>1) and start removing previous results. + + + // If our image does not fit in memory then we need to make sure we accumulate previous results too. + // This is done in 2 steps: + // 1)copy previous results back into GPU + // 2)accumulate with current results + // The code to take them out is the same as when there are no splits needed + if( !fits_in_memory&&sp>0) + { + // 1) grab previous results and put them in the auxiliary variable dProjection_accum + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + + // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... 
+ if(i+1==noOfKernelCalls) //is it the last block? + projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + else + projection_this_block=PROJ_PER_BLOCK; + cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + } + // 2) take the results from current compute call and add it to the code in execution. + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + + // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... + if(i+1==noOfKernelCalls) //is it the last block? + projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + else + projection_this_block=PROJ_PER_BLOCK; + cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + vecAddInPlaceInterp<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); + } + } // end accumulation case, where the image needs to be split + + // Now, lets get out the projections from the previous execution of the kernels. 
+ if (i>0) + { + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on previous set on this GPU + proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; + if (dev+1==deviceCount) { //is it the last device? + // projections assigned to this device is >=nangles_device-(deviceCount-1) and < nangles_device + if (i-1 < noOfKernelCallsLastDev) { + // The previous set(block) was not empty. + projection_this_block=min(PROJ_PER_BLOCK, nangles-proj_global); + } + else { + // The previous set was empty. + // This happens if deviceCount > PROJ_PER_BLOCK+1. + // e.g. PROJ_PER_BLOCK = 9, deviceCount = 11, nangles = 199. + // e.g. PROJ_PER_BLOCK = 1, deviceCount = 3, nangles = 7. + break; + } + } + else { + projection_this_block=PROJ_PER_BLOCK; + } + cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + } + } + // Make sure Computation on kernels has finished before we launch the next batch. + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + cudaStreamSynchronize(stream[dev*2]); + } + } // End noOfKernelCalls (i) loop. + + // We still have the last set of projections to get out of GPUs + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + // How many projections are left here? + projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + + cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. 
+ cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); + cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + } + // Make sure everyone has done their bussiness before the next image split: + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + } // End image split loop. + + cudaCheckErrors("Main loop fail"); + /////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////// + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDestroyTextureObject(texImg[dev]); + cudaFreeArray(d_cuArrTex[dev]); + } + delete[] texImg; texImg = 0; + delete[] d_cuArrTex; d_cuArrTex = 0; + // Freeing Stage + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dProjection[dev*2]); + cudaFree(dProjection[dev*2+1]); + + } + free(dProjection); + + if(!fits_in_memory){ + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dProjection_accum[dev*2]); + cudaFree(dProjection_accum[dev*2+1]); + + } + free(dProjection_accum); + } + freeGeoArray(splits,geoArray); + cudaFreeHost(projParamsArrayHost); + cudaFreeHost(projFloatsArrayHost); + + + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]) ; +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & splits>1){ + cudaHostUnregister(img); + } +#endif + cudaCheckErrors("cudaFree fail"); + +// cudaDeviceReset(); + return 0; +} +void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate) +{ + const unsigned int num_devices = gpuids.GetLength(); + //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; + const cudaExtent extent = 
make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + if(allocate){ + + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + + //cudaArray Descriptor + + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + cudaCheckErrors("Texture memory allocation fail"); + } + + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaMemcpy3DParms copyParams = {0}; + cudaSetDevice(gpuids[dev]); + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[dev]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params); + //cudaCheckErrors("Texture memory data copy fail"); + //Array creation End + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[dev]; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + if (geo.accuracy>1){ + texDescr.filterMode = cudaFilterModePoint; + geo.accuracy=1; + } + else{ + texDescr.filterMode = cudaFilterModeLinear; + } + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + cudaCheckErrors("Texture object creation fail"); + } +} + +/* This code generates the geometries needed to split the image properly in + * cases where the entire image does not fit in the memory of the GPU + **/ +void splitImageInterp(unsigned int splits,Geometry geo,Geometry* geoArray, unsigned int nangles){ + + unsigned long 
splitsize=(geo.nVoxelZ+splits-1)/splits;// ceil if not divisible + for(unsigned int sp=0;spx=Pfinalu0.x-Pfinal.x; + deltaU->y=Pfinalu0.y-Pfinal.y; + deltaU->z=Pfinalu0.z-Pfinal.z; + + deltaV->x=Pfinalv0.x-Pfinal.x; + deltaV->y=Pfinalv0.y-Pfinal.y; + deltaV->z=Pfinalv0.z-Pfinal.z; + + *source=S; +} + +float maxdistanceCuboid(Geometry geo,unsigned int i){ + /////////// + // Compute initial "t" so we access safely as less as out of bounds as possible. + ////////// + + + float maxCubX,maxCubY,maxCubZ; + // Forgetting Z, compute mas distance: diagonal+offset + maxCubX=(geo.nVoxelX/2+ abs(geo.offOrigX[i])/geo.dVoxelX); + maxCubY=(geo.nVoxelY/2+ abs(geo.offOrigY[i])/geo.dVoxelY); + maxCubZ=(geo.nVoxelZ/2+ abs(geo.offOrigZ[i])/geo.dVoxelZ); + + float a,b; + a=geo.DSO[i]/geo.dVoxelX; + b=geo.DSO[i]/geo.dVoxelY; + +// As the return of this value is in "voxel space", the source may have an elliptical curve. +// The distance returned is the safe distance that can be skipped for a given angle alpha, before we need to start sampling. + + if (geo.theta==0.0f & geo.psi==0.0f) // Special case, it will make the code faster + return max(a*b/sqrt(a*a*sin(geo.alpha)*sin(geo.alpha)+b*b*cos(geo.alpha)*cos(geo.alpha))- + sqrt(maxCubX*maxCubX+maxCubY*maxCubY),0.0f); + //TODO: think of more special cases? 
+ return max(geo.DSO[i]/max(max(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)-sqrt(maxCubX*maxCubX+maxCubY*maxCubY+maxCubZ*maxCubZ),0.0f); + +} +void rollPitchYaw(Geometry geo,unsigned int i, Point3D* point){ + Point3D auxPoint; + auxPoint.x=point->x; + auxPoint.y=point->y; + auxPoint.z=point->z; + + point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->z=-sin(geo.dPitch[i])*auxPoint.x + +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y + +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; + +} +void eulerZYZ(Geometry geo, Point3D* point){ + Point3D auxPoint; + auxPoint.x=point->x; + auxPoint.y=point->y; + auxPoint.z=point->z; + + point->x=(+cos(geo.alpha)*cos(geo.theta)*cos(geo.psi)-sin(geo.alpha)*sin(geo.psi))*auxPoint.x+ + (-cos(geo.alpha)*cos(geo.theta)*sin(geo.psi)-sin(geo.alpha)*cos(geo.psi))*auxPoint.y+ + cos(geo.alpha)*sin(geo.theta)*auxPoint.z; + + point->y=(+sin(geo.alpha)*cos(geo.theta)*cos(geo.psi)+cos(geo.alpha)*sin(geo.psi))*auxPoint.x+ + (-sin(geo.alpha)*cos(geo.theta)*sin(geo.psi)+cos(geo.alpha)*cos(geo.psi))*auxPoint.y+ + sin(geo.alpha)*sin(geo.theta)*auxPoint.z; + + point->z=-sin(geo.theta)*cos(geo.psi)*auxPoint.x+ + sin(geo.theta)*sin(geo.psi)*auxPoint.y+ + cos(geo.theta)*auxPoint.z; + + +} +//______________________________________________________________________________ +// +// Function: freeGeoArray +// +// Description: Frees the memory from the geometry array for multiGPU. 
+//______________________________________________________________________________ +void freeGeoArray(unsigned int splits,Geometry* geoArray){ + for(unsigned int sp=0;sp -#include -#include +#include +#include #include "ray_interpolated_projection_parallel.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",hipGetErrorString(__err));\ } \ } while (0) @@ -96,7 +97,7 @@ do { \ * * **/ -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); +void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream); __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device @@ -104,7 +105,7 @@ __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on __global__ void kernelPixelDetector_parallel_interpolated( Geometry geo, float* detector, - const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) + const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { // Point3D source , // Point3D deltaU, @@ -199,23 +200,23 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul size_t num_bytes = geo.nDetecU*geo.nDetecV *PROJ_PER_BLOCK* sizeof(float); float** dProjection=(float **)malloc(2*sizeof(float *)); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[i], num_bytes); + 
cudaCheckErrors("hipMalloc projections fail"); } // allocate streams for memory and compute int nStreams=2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + hipTextureObject_t *texImg = 0; + hipArray **d_cuArrTex = 0; + texImg =(hipTextureObject_t*)malloc(1*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(1*sizeof(hipArray*)); CreateTextureParallelInterp(img,geo,&d_cuArrTex[0], &texImg[0],stream); cudaCheckErrors("Texture allocation fail"); @@ -226,9 +227,9 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); float* projFloatsArrayHost; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + hipHostMalloc((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); // 16x16 gave the best performance empirically // Funnily that makes it compatible with most GPUs..... 
@@ -266,39 +267,39 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul projFloatsArrayHost[2*j+1]=floor(maxdist); } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDev), projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projFloatsArrayDev), projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelDetector_parallel_interpolated<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); // copy result to host if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, hipMemcpyDeviceToHost,stream[1]); } - cudaDeviceSynchronize(); + hipDeviceSynchronize(); int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[1]); - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); + hipDestroyTextureObject(texImg[0]); + hipFreeArray(d_cuArrTex[0]); free(texImg); texImg = 0; free(d_cuArrTex); d_cuArrTex = 0; cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); + hipFree(dProjection[0]); + hipFree(dProjection[1]); free(dProjection); - cudaFreeHost(projParamsArrayHost); - 
cudaFreeHost(projFloatsArrayHost); + hipHostFree(projParamsArrayHost); + hipHostFree(projFloatsArrayHost); - cudaCheckErrors("cudaFree d_imagedata fail"); + cudaCheckErrors("hipFree d_imagedata fail"); for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); } -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } @@ -410,40 +411,40 @@ void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* u *source=S; } -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; +void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[1]); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = 
cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); } \ No newline at end of file diff --git a/Common/CUDA/ray_interpolated_projection_parallel.cu.prehip b/Common/CUDA/ray_interpolated_projection_parallel.cu.prehip new file mode 100644 index 00000000..4aad5d6f --- /dev/null +++ b/Common/CUDA/ray_interpolated_projection_parallel.cu.prehip @@ -0,0 +1,449 @@ +/*------------------------------------------------------------------------- + * + * CUDA functions for texture-memory interpolation based projection + * + * This file has the necessary functions to perform X-ray parallel projection + * operation given a geaometry, angles and image. It uses the 3D texture + * memory linear interpolation to uniformily sample a path to integrate the + * X-rays. 
+ * + * CODE by Ander Biguri + * Sepideh Hatamikia (arbitrary rotation) + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + + + +#include +#include +#include +#include "ray_interpolated_projection_parallel.hpp" +#include "TIGRE_common.hpp" +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",cudaGetErrorString(__err));\ + } \ +} while (0) + + + +#define MAXTREADS 1024 +#define PROJ_PER_BLOCK 8 +#define PIXEL_SIZE_BLOCK 8 +/*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * --->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ +void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); +__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device +__constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device + + + +__global__ void kernelPixelDetector_parallel_interpolated( Geometry geo, + float* detector, + const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +{ +// Point3D source , +// Point3D deltaU, +// Point3D deltaV, +// Point3D uvOrigin, +// float DSO, +// float maxdist){ + + unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; + unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long projNumber=threadIdx.z; + + if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) + return; + + int indAlpha = 
currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array + + +#if IS_FOR_MATLAB_TIGRE + size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#else + size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#endif + + if(indAlpha>=totalNoOfProjections) + return; + + Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaU = projParamsArrayDev[4*projNumber+1]; + Point3D deltaV = projParamsArrayDev[4*projNumber+2]; + Point3D source = projParamsArrayDev[4*projNumber+3]; + + float DSO = projFloatsArrayDev[2*projNumber+0]; + float maxdist = projFloatsArrayDev[2*projNumber+1]; + + + /////// Get coordinates XYZ of pixel UV + unsigned long pixelV = geo.nDetecV-v-1; + unsigned long pixelU = u; + + + float vectX,vectY,vectZ; + Point3D P; + P.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); + P.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); + P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); + Point3D S; + S.x=(source.x+pixelU*deltaU.x+pixelV*deltaV.x); + S.y=(source.y+pixelU*deltaU.y+pixelV*deltaV.y); + S.z=(source.z+pixelU*deltaU.z+pixelV*deltaV.z); + + // Length is the ray length in normalized space + double length=sqrtf((S.x-P.x)*(S.x-P.x)+(S.y-P.y)*(S.y-P.y)+(S.z-P.z)*(S.z-P.z)); + //now legth is an integer of Nsamples that are required on this line + length=ceilf(length/geo.accuracy);//Divide the directional vector by an integer + vectX=(P.x -S.x)/(length); + vectY=(P.y -S.y)/(length); + vectZ=(P.z -S.z)/(length); + + +// //Integrate over the line + float tx,ty,tz; + float sum=0; + float i; + + + // limit the amount of mem access after the cube, but before the detector. 
+ if ((2*DSO/geo.dVoxelX+maxdist)/geo.accuracy < length) + length=ceilf((2*DSO/geo.dVoxelX+maxdist)/geo.accuracy); + //Length is not actually a length, but the amount of memreads with given accuracy ("samples per voxel") + + for (i=floorf(maxdist/geo.accuracy); i<=length; i=i+1){ + tx=vectX*i+S.x; + ty=vectY*i+S.y; + tz=vectZ*i+S.z; + + sum += tex3D(tex, tx+0.5f, ty+0.5f, tz+0.5f); // this line is 94% of time. + + } + float deltalength=sqrtf((vectX*geo.dVoxelX)*(vectX*geo.dVoxelX)+ + (vectY*geo.dVoxelY)*(vectY*geo.dVoxelY)+ + (vectZ*geo.dVoxelZ)*(vectZ*geo.dVoxelZ) ); + detector[idx]=sum*deltalength; +} + + + +int interpolation_projection_parallel(float * img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ + + + + size_t num_bytes = geo.nDetecU*geo.nDetecV *PROJ_PER_BLOCK* sizeof(float); + float** dProjection=(float **)malloc(2*sizeof(float *)); + for (int i = 0; i < 2; ++i){ + cudaMalloc((void**)&dProjection[i], num_bytes); + cudaCheckErrors("cudaMalloc projections fail"); + } + // allocate streams for memory and compute + int nStreams=2; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + + for (int i = 0; i < 2; ++i){ + cudaStreamCreate(&stream[i]); + } + + + // Texture object variables + cudaTextureObject_t *texImg = 0; + cudaArray **d_cuArrTex = 0; + texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); + d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + + CreateTextureParallelInterp(img,geo,&d_cuArrTex[0], &texImg[0],stream); + cudaCheckErrors("Texture allocation fail"); + //Done! Image put into texture memory. 
+ + + + Point3D source, deltaU, deltaV, uvOrigin; + + Point3D* projParamsArrayHost; + cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + float* projFloatsArrayHost; + cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + + // 16x16 gave the best performance empirically + // Funnily that makes it compatible with most GPUs..... + int divU,divV,divangle; + divU=PIXEL_SIZE_BLOCK; + divV=PIXEL_SIZE_BLOCK; + + dim3 numBlocks((geo.nDetecU+divU-1)/divU,(geo.nDetecV+divV-1)/divV,1); + dim3 threadsPerBlock(divU,divV,PROJ_PER_BLOCK); + unsigned int proj_global; + unsigned int noOfKernelCalls = (nangles+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK + unsigned int i; + + float maxdist; + for ( i=0; i=nangles) + break; + + geo.alpha=angles[proj_global*3]; + geo.theta=angles[proj_global*3+1]; + geo.psi =angles[proj_global*3+2]; + //precomute distances for faster execution + maxdist=maxdistanceCuboid(geo,proj_global); + //Precompute per angle constant stuff for speed + computeDeltas_parallel(geo,geo.alpha,proj_global, &uvOrigin, &deltaU, &deltaV, &source); + //Ray tracing! 
+ projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection + projParamsArrayHost[4*j+1]=deltaU; + projParamsArrayHost[4*j+2]=deltaV; + projParamsArrayHost[4*j+3]=source; + + projFloatsArrayHost[2*j]=geo.DSO[proj_global]; + projFloatsArrayHost[2*j+1]=floor(maxdist); + + } + cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); + cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); + cudaStreamSynchronize(stream[0]); + + kernelPixelDetector_parallel_interpolated<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); + // copy result to host + if (i>0) + cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + } + cudaDeviceSynchronize(); + + int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; + cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + + + cudaDestroyTextureObject(texImg[0]); + cudaFreeArray(d_cuArrTex[0]); + free(texImg); texImg = 0; + free(d_cuArrTex); d_cuArrTex = 0; + cudaCheckErrors("Unbind fail"); + cudaFree(dProjection[0]); + cudaFree(dProjection[1]); + free(dProjection); + cudaFreeHost(projParamsArrayHost); + cudaFreeHost(projFloatsArrayHost); + + cudaCheckErrors("cudaFree d_imagedata fail"); + + + for (int i = 0; i < 2; ++i){ + cudaStreamDestroy(stream[i]); + } +// cudaDeviceReset(); + + return 0; +} + + + + +/* This code precomputes The location of the source and the Delta U and delta V (in the warped space) + * to compute the locations of the x-rays. While it seems verbose and overly-optimized, + * it does saves about 30% of each of the kernel calls. Thats something! 
+ **/ +void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source){ + Point3D S; + S.x=geo.DSO[i]; + S.y=geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); + S.z=geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + + //End point + Point3D P,Pu0,Pv0; + + P.x =-(geo.DSD[i]-geo.DSO[i]); P.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); P.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + Pu0.x=-(geo.DSD[i]-geo.DSO[i]); Pu0.y= geo.dDetecU*(1-((float)geo.nDetecU/2)+0.5); Pu0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + Pv0.x=-(geo.DSD[i]-geo.DSO[i]); Pv0.y= geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); Pv0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-1); + // Geometric trasnformations: + P.x=0;Pu0.x=0;Pv0.x=0; + + // Roll pitch yaw + rollPitchYaw(geo,i,&P); + rollPitchYaw(geo,i,&Pu0); + rollPitchYaw(geo,i,&Pv0); + //Now lets translate the points where they should be: + P.x=P.x-(geo.DSD[i]-geo.DSO[i]); + Pu0.x=Pu0.x-(geo.DSD[i]-geo.DSO[i]); + Pv0.x=Pv0.x-(geo.DSD[i]-geo.DSO[i]); + + S.x=0; + // Roll pitch yaw + rollPitchYaw(geo,i,&S); + //Now lets translate the points where they should be: + S.x=S.x+geo.DSO[i]; + + + //1: Offset detector + + //P.x + P.y =P.y +geo.offDetecU[i]; P.z =P.z +geo.offDetecV[i]; + Pu0.y=Pu0.y+geo.offDetecU[i]; Pu0.z=Pu0.z+geo.offDetecV[i]; + Pv0.y=Pv0.y+geo.offDetecU[i]; Pv0.z=Pv0.z+geo.offDetecV[i]; + //S doesnt need to chagne + + + //3: Rotate (around z)! 
+ Point3D Pfinal, Pfinalu0, Pfinalv0; + Pfinal.x =P.x; + Pfinal.y =P.y +geo.offDetecU[i]; Pfinal.z =P.z +geo.offDetecV[i]; + Pfinalu0.x=Pu0.x; + Pfinalu0.y=Pu0.y +geo.offDetecU[i]; Pfinalu0.z =Pu0.z +geo.offDetecV[i]; + Pfinalv0.x=Pv0.x; + Pfinalv0.y=Pv0.y +geo.offDetecU[i]; Pfinalv0.z =Pv0.z +geo.offDetecV[i]; + + eulerZYZ(geo,&Pfinal); + eulerZYZ(geo,&Pfinalu0); + eulerZYZ(geo,&Pfinalv0); + eulerZYZ(geo,&S); + + + + //2: Offset image (instead of offseting image, -offset everything else) + + Pfinal.x =Pfinal.x-geo.offOrigX[i]; Pfinal.y =Pfinal.y-geo.offOrigY[i]; Pfinal.z =Pfinal.z-geo.offOrigZ[i]; + Pfinalu0.x=Pfinalu0.x-geo.offOrigX[i]; Pfinalu0.y=Pfinalu0.y-geo.offOrigY[i]; Pfinalu0.z=Pfinalu0.z-geo.offOrigZ[i]; + Pfinalv0.x=Pfinalv0.x-geo.offOrigX[i]; Pfinalv0.y=Pfinalv0.y-geo.offOrigY[i]; Pfinalv0.z=Pfinalv0.z-geo.offOrigZ[i]; + S.x=S.x-geo.offOrigX[i]; S.y=S.y-geo.offOrigY[i]; S.z=S.z-geo.offOrigZ[i]; + + // As we want the (0,0,0) to be in a corner of the image, we need to translate everything (after rotation); + Pfinal.x =Pfinal.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinal.y =Pfinal.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinal.z =Pfinal.z +geo.sVoxelZ/2-geo.dVoxelZ/2; + Pfinalu0.x=Pfinalu0.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinalu0.y=Pfinalu0.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinalu0.z=Pfinalu0.z+geo.sVoxelZ/2-geo.dVoxelZ/2; + Pfinalv0.x=Pfinalv0.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinalv0.y=Pfinalv0.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinalv0.z=Pfinalv0.z+geo.sVoxelZ/2-geo.dVoxelZ/2; + S.x =S.x+geo.sVoxelX/2-geo.dVoxelX/2; S.y =S.y+geo.sVoxelY/2-geo.dVoxelY/2; S.z =S.z +geo.sVoxelZ/2-geo.dVoxelZ/2; + + //4. 
Scale everything so dVoxel==1 + Pfinal.x =Pfinal.x/geo.dVoxelX; Pfinal.y =Pfinal.y/geo.dVoxelY; Pfinal.z =Pfinal.z/geo.dVoxelZ; + Pfinalu0.x=Pfinalu0.x/geo.dVoxelX; Pfinalu0.y=Pfinalu0.y/geo.dVoxelY; Pfinalu0.z=Pfinalu0.z/geo.dVoxelZ; + Pfinalv0.x=Pfinalv0.x/geo.dVoxelX; Pfinalv0.y=Pfinalv0.y/geo.dVoxelY; Pfinalv0.z=Pfinalv0.z/geo.dVoxelZ; + S.x =S.x/geo.dVoxelX; S.y =S.y/geo.dVoxelY; S.z =S.z/geo.dVoxelZ; + + + + //5. apply COR. Wherever everything was, now its offesetd by a bit + float CORx, CORy; + CORx=-geo.COR[i]*sin(geo.alpha)/geo.dVoxelX; + CORy= geo.COR[i]*cos(geo.alpha)/geo.dVoxelY; + Pfinal.x+=CORx; Pfinal.y+=CORy; + Pfinalu0.x+=CORx; Pfinalu0.y+=CORy; + Pfinalv0.x+=CORx; Pfinalv0.y+=CORy; + S.x+=CORx; S.y+=CORy; + + // return + + *uvorigin=Pfinal; + + deltaU->x=Pfinalu0.x-Pfinal.x; + deltaU->y=Pfinalu0.y-Pfinal.y; + deltaU->z=Pfinalu0.z-Pfinal.z; + + deltaV->x=Pfinalv0.x-Pfinal.x; + deltaV->y=Pfinalv0.y-Pfinal.y; + deltaV->z=Pfinalv0.z-Pfinal.z; + + *source=S; +} +void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; + + + const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + + //cudaArray Descriptor + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + + + cudaMemcpy3DParms copyParams = {0}; + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[0]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params,stream[1]); + + + //Array creation End + + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[0]; + cudaTextureDesc texDescr; + memset(&texDescr, 
0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + +} \ No newline at end of file diff --git a/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip b/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip new file mode 100644 index 00000000..1280b6ed --- /dev/null +++ b/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip @@ -0,0 +1,65 @@ +/*------------------------------------------------------------------------- + * + * Header CUDA functions for texture-memory interpolation based projection + * + * + * CODE by Ander Biguri + * Sepideh Hatamikia (arbitrary rotation) +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + +#include "ray_interpolated_projection.hpp" + +#include "types_TIGRE.hpp" +#include "GpuIds.hpp" + +#ifndef PROJECTION_PARALLEL_HPP +#define PROJECTION_PARALLEL_HPP + +int interpolation_projection_parallel(float* img, Geometry geo, float** result,float const * const alphas,int nalpha, const GpuIds& gpuids); +// float computeMaxLength(Geometry geo, float alpha); +void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source); + +// float maxDistanceCubeXY(Geometry geo, float alpha,int i); + +// below, not used +Geometry nomralizeGeometryImage(Geometry geo); +#endif \ No newline at end of file diff --git a/Common/CUDA/tv_proximal.cu b/Common/CUDA/tv_proximal.cu index 32ae99c2..87d5407f 100644 --- a/Common/CUDA/tv_proximal.cu +++ b/Common/CUDA/tv_proximal.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * MATLAB MEX 
functions for TV image denoising. Check inputs and parses @@ -57,17 +58,17 @@ #include "tv_proximal.hpp" #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - cudaDeviceReset();\ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ + hipDeviceReset();\ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising",hipGetErrorString(__err));\ } \ } while (0) void cpy_from_host(float* device_array,float* host_array, unsigned long long bytes_device,unsigned long long offset_device,unsigned long long offset_host, unsigned long long pixels_per_slice, unsigned int buffer_length, - cudaStream_t stream, bool is_first_chunk, bool is_last_chunk,const long* image_size); + hipStream_t stream, bool is_first_chunk, bool is_last_chunk,const long* image_size); __global__ void multiplyArrayScalar(float* vec,float scalar,const size_t n) @@ -263,11 +264,11 @@ void cpy_from_host(float* device_array,float* host_array, // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif if (isHostRegisterSupported & splits>1){ - cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -282,21 +283,21 @@ void cpy_from_host(float* device_array,float* host_array, if (buffer_length0){ // U - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer_u, d_u[dev-1] +slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer_u, d_u[dev-1] +slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); + hipMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); + 
hipMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); + hipMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+1]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+2]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+3]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+4]); - cudaMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+4]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+1]); + hipMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+1]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+2]); + hipMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+2]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+3]); + hipMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+3]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+4]); + hipMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+4]); } @@ -567,22 +568,22 @@ void cpy_from_host(float* device_array,float* host_array, }else{ // Vopy all the U variable into the host. 
for(dev=0; dev1 && buffer_length1){ - cudaHostUnregister(src); - cudaHostUnregister(dst); + hipHostUnregister(src); + hipHostUnregister(dst); } for(dev=0; dev= 0 ) { + _div += (pz[idx] - pz[(z-1)*size2d + y*cols + x]) / dz; + } else { + _div += pz[idx]; + } + + if ( y - 1 >= 0 ) { + _div += (py[idx] - py[z*size2d + (y-1)*cols + x]) / dy; + } else { + _div += py[idx]; + } + + if ( x - 1 >= 0 ) { + _div += (px[idx] - px[z*size2d + y*cols + (x-1)]) / dx; + } else { + _div += px[idx]; + } + + return _div; + } + + __device__ __inline__ + void gradient(const float* u, float* grad, + long z, long y, long x, + long depth, long rows, long cols, + float dz, float dy, float dx) + { + long size2d = rows*cols; + long idx = z * size2d + y * cols + x; + + float uidx = u[idx]; + + if ( z + 1 < depth ) { + grad[0] = (u[(z+1)*size2d + y*cols + x] - uidx) / dz; + } + + if ( y + 1 < rows ) { + grad[1] = (u[z*size2d + (y+1)*cols + x] - uidx) / dy; + } + + if ( x + 1 < cols ) { + grad[2] = (u[z*size2d + y*cols + (x+1)] - uidx) / dx; + } + } + + + __global__ + void update_u(const float* f, const float* pz, const float* py, const float* px, float* u, + float tau, float lambda, + long depth, long rows, long cols, + float dz, float dy, float dx) + { + long x = threadIdx.x + blockIdx.x * blockDim.x; + long y = threadIdx.y + blockIdx.y * blockDim.y; + long z = threadIdx.z + blockIdx.z * blockDim.z; + long idx = z * rows * cols + y * cols + x; + + if ( x >= cols || y >= rows || z >= depth ) + return; + + float _div = divergence(pz, py, px, z, y, x, depth, rows, cols, dz, dy, dx); + + u[idx] = u[idx] * (1.0f - tau) + tau * (f[idx] + (1.0f/lambda) * _div); + } + + + __global__ + void update_p(const float* u, float* pz, float* py, float* px, + float tau, long depth, long rows, long cols, + float dz, float dy, float dx) + { + long x = threadIdx.x + blockIdx.x * blockDim.x; + long y = threadIdx.y + blockIdx.y * blockDim.y; + long z = threadIdx.z + blockIdx.z * blockDim.z; + long idx = z * rows * 
cols + y * cols + x; + + if ( x >= cols || y >= rows || z >= depth ) + return; + + float grad[3] = {0,0,0}, q[3]; + gradient(u, grad, z, y, x, depth, rows, cols, dz, dy, dx); + + q[0] = pz[idx] + tau * grad[0]; + q[1] = py[idx] + tau * grad[1]; + q[2] = px[idx] + tau * grad[2]; + + float norm = fmaxf(1.0f, sqrtf(q[0] * q[0] + q[1] * q[1] + q[2] * q[2])); + + pz[idx] = q[0] / norm; + py[idx] = q[1] / norm; + px[idx] = q[2] / norm; + } + + +// Main function + void tvdenoising(float* src, float* dst, float lambda, + const float* spacing, const long* image_size, int maxIter, const GpuIds& gpuids) { + + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("tvDenoise:tvdenoising:GPUselect","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("tvDenoise:tvdenoising:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + int dev; + + // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. 
+ + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + + + // %5 of free memory should be enough, we have almost no variables in these kernels + size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; + const size_t pixels_per_slice = image_size[0] * image_size[1] ; + const size_t mem_slice_image = sizeof(float)* pixels_per_slice ; + const size_t mem_size_image = sizeof(float)* total_pixels; + + // Decide how are we handling the distribution of computation + size_t mem_img_each_GPU; + + unsigned int buffer_length=1; + //Does everything fit in the GPU? + unsigned int slices_per_split; + unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. + if(mem_GPU_global> 5*mem_size_image+5*mem_slice_image*buffer_length*2){ + // We only need to split if we have extra GPUs + slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; + mem_img_each_GPU=mem_slice_image*( (image_size[2]+deviceCount-1)/deviceCount + buffer_length*2); + }else{ + // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all + size_t mem_free=mem_GPU_global; + + splits=(unsigned int)(ceil(((float)(5*mem_size_image)/(float)(deviceCount))/mem_free)); + // Now, there is an overhead here, as each splits should have 2 slices more, to accoutn for overlap of images. + // lets make sure these 2 slices fit, if they do not, add 1 to splits. + slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + + // if the new stuff does not fit in the GPU, it measn we are in the edge case where adding that extra slice will overflow memory + if (mem_GPU_global< 5*mem_img_each_GPU){ + // one more split should do the job, as its an edge case. 
+ splits++; + //recompute for later + slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + } + + // How many EXTRA buffer slices should be able to fit in here??!?! + mem_free=mem_GPU_global-(5*mem_img_each_GPU); + unsigned int extra_buff=(mem_free/mem_slice_image); + buffer_length=(extra_buff/2)/5; // we need double whatever this results in, rounded down. + + buffer_length=min(MAX_BUFFER,buffer_length); + + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + + // Assert + if (mem_GPU_global< 5*mem_img_each_GPU){ + mexErrMsgIdAndTxt("tvDenoise:tvdenoising:GPU","Bad assert. Logic behind splitting flawed! Please tell: ander.biguri@gmail.com\n"); + } + } + + + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + if (isHostRegisterSupported & splits>1){ + cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); + + + + // Lets allocate auxiliary variables. + float* buffer_u, *buffer_px, *buffer_py, *buffer_pz; + float* h_px, *h_py, *h_pz, *h_u; + if(splits>1){ + + //These take A LOT of memory and A LOT of time to use. If we can avoid using them, better. 
+ if (buffer_length1 & i>0){ + + for (dev = 0; dev < deviceCount; dev++){ + is_last_chunk=!((sp*deviceCount+dev)>>(d_pz[dev], -1, pixels_per_slice*buffer_length); + } + if (is_last_chunk){ + multiplyArrayScalar<<<60,MAXTREADS,0,stream[dev*nStream_device+4]>>>(d_pz[dev]+bytes_device[dev],-1, pixels_per_slice*buffer_length); + } + } + for (dev = 0; dev < deviceCount; dev++){ + is_last_chunk=!((sp*deviceCount+dev)>>(d_src[dev], d_pz[dev], d_py[dev], d_px[dev], d_u[dev], tau1, lambda, + (long)(curr_slices+buffer_length*2), image_size[1],image_size[0], + spacing[2], spacing[1], spacing[0]); + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_u[dev], d_pz[dev], d_py[dev], d_px[dev], tau2, + (long)(curr_slices+buffer_length*2), image_size[1], image_size[0], + spacing[2], spacing[1], spacing[0]); + } + }// END internal iter + + // Synchronize mathematics, make sure bounding pixels are correct + for(dev=0; dev0){ + // U + cudaSetDevice(gpuids[dev-1]); + cudaMemcpyAsync(buffer_u, d_u[dev-1] +slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); + cudaMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); + cudaMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); + cudaMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); + + + cudaSetDevice(gpuids[dev]); + cudaStreamSynchronize(stream[(dev-1)*nStream_device+1]); + cudaMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+1]); + 
cudaStreamSynchronize(stream[(dev-1)*nStream_device+2]); + cudaMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+2]); + cudaStreamSynchronize(stream[(dev-1)*nStream_device+3]); + cudaMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+3]); + cudaStreamSynchronize(stream[(dev-1)*nStream_device+4]); + cudaMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+4]); + + + } + } + // This is the case when we can't solely use GPU memory, as the total size of the images+variables exceeds total amounf of memory among GPUs. + // This situation requires partial results and full memory allocation in the host. + }else{ + // Vopy all the U variable into the host. + for(dev=0; dev1 && buffer_length1){ + cudaHostUnregister(src); + cudaHostUnregister(dst); + } + for(dev=0; dev Origin is at (0,0,0). Image center is there +offOrig + // -> at angle 0, source + image centre (without the offset) + detector centre (without offset) + // are aligned in the Y_Z plane. + // -> detector is orthonormal to projection plane. + + //Parameters part of the image geometry + int nVoxelX, nVoxelY, nVoxelZ; + float sVoxelX, sVoxelY, sVoxelZ; + float dVoxelX, dVoxelY, dVoxelZ; + float *offOrigX,*offOrigY,*offOrigZ; + float* DSO; + // Parameters of the Detector. + int nDetecU, nDetecV; + float sDetecU, sDetecV; + float dDetecU, dDetecV; + float *offDetecU, *offDetecV; + float* DSD; + float* dRoll; + float* dPitch; + float* dYaw; + // The base unit we are working with in mm. + float unitX; + float unitY; + float unitZ; + + //rotation angle for e uler (ZYZ) + float alpha; + float theta; + float psi; + // Centre of Rotation correction. 
+ float* COR; + //Maximum length of cube + float maxLength; + //User option + float accuracy; +}; + + struct Point3D{ + float x; + float y; + float z; +}; + +struct Point3Ddouble{ + double x; + double y; + double z; + + // cast to float member function for "copying" Point3Ddouble to Point3D + Point3D to_float() + { + Point3D castToFloat; + castToFloat.x = (float)x; + castToFloat.y = (float)y; + castToFloat.z = (float)z; + return(castToFloat); + } +}; + +#endif \ No newline at end of file diff --git a/Common/CUDA/voxel_backprojection.cu b/Common/CUDA/voxel_backprojection.cu index bec4d909..8fb9df3c 100644 --- a/Common/CUDA/voxel_backprojection.cu +++ b/Common/CUDA/voxel_backprojection.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA function for backrpojection using FDK weigts for CBCT @@ -45,8 +46,8 @@ #define PI_2 1.57079632679489661923 #include -#include -#include +#include +#include #include "voxel_backprojection.hpp" #include "TIGRE_common.hpp" #include @@ -55,10 +56,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -91,7 +92,7 @@ do { \ * **/ - void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, int nStreamDevice,bool allocate); + void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, int nStreamDevice,bool allocate); 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -134,7 +135,7 @@ __constant__ float projSinCosArrayDev[5*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel //______________________________________________________________________________ -__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { // Old kernel call signature: @@ -323,16 +324,16 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
#ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } #endif cudaCheckErrors("Error pinning memory"); @@ -348,20 +349,20 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); float** dimage=(float**)malloc(deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); + hipSetDevice(gpuids[dev]); + hipMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("hipMalloc fail"); } //If it is the first time, lets make sure our image is zeroed. 
int nStreamDevice=2; int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); + hipStreamCreate(&stream[i+dev*nStreamDevice]); } } @@ -371,16 +372,16 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // Kernel auxiliary variables Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArrayHost; - cudaMallocHost((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + texProj =(hipTextureObject_t*)malloc(deviceCount*2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(deviceCount*2*sizeof(hipArray*)); // Auxiliary Host page-locked memory for fast and asycnornous memcpy. 
@@ -401,8 +402,8 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa for(unsigned int img_slice=0;img_slice>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); } // END for @@ -551,8 +552,8 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa } // END sub-split of current projection chunk for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // END projection splits @@ -560,15 +561,15 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // Now we need to take the image out of the GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); // We do not need to sycnronize because the array dealocators already do. num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + hipMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, hipMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); cudaCheckErrors("Main loop fail"); } @@ -582,38 +583,38 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa if (!two_buffers_used && i==1) break; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - 
cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texProj[i*deviceCount+dev]); + hipFreeArray(d_cuArrTex[i*deviceCount+dev]); } } cudaCheckErrors("cudadestroy textures result fail"); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); + hipSetDevice(gpuids[dev]); + hipFree(dimage[dev]); } - cudaFreeHost(projSinCosArrayHost); - cudaFreeHost(projParamsArrayHost); + hipHostFree(projSinCosArrayHost); + hipHostFree(projParamsArrayHost); free(partial_projection); free(proj_split_size); freeGeoArray(split_image*deviceCount,geoArray); #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostUnregister(result); + hipHostUnregister(result); } if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } #endif for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); - //cudaDeviceReset(); // For the Nvidia Visual Profiler + //hipDeviceReset(); // For the Nvidia Visual Profiler return 0; } // END voxel_backprojection @@ -664,52 +665,52 @@ void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsign } -void CreateTexture(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ +void CreateTexture(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent 
=make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif const unsigned int num_devices = gpuids.GetLength(); if (allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); } //Array creation End for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - 
texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } } @@ -903,8 +904,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree +#include +#include +#include "voxel_backprojection.hpp" +#include "TIGRE_common.hpp" +#include +#include "GpuIds.hpp" + +// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + } \ +} while (0) + + +#define MAXTREADS 1024 + /*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * *--->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ + + void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, int nStreamDevice,bool allocate); + + 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// The optimal values of two constants obtained by RB on NVIDIA Quadro K2200 (4 GB RAM, 640 CUDA cores) for 512^3 volume and 512^3 projections (512 proj, each 512 x 512) were: +// PROJ_PER_KERNEL = 32 or 16 (very similar times) +// VOXELS_PER_THREAD = 8 +// Speedup of the entire FDK backprojection (not only kernel run, also memcpy etc.) was nearly 4x relative to the original (single projection, single voxel per thread) code. +// (e.g. 16.2 s vs. ~62 s). + +const int PROJ_PER_KERNEL = 32; // Number of 2D projections to be analyzed by a single thread. This can be tweaked to see what works best. 32 was the optimal value in the paper by Zinsser and Keck. +const int VOXELS_PER_THREAD = 8; // Number of voxels to be computed by s single thread. Can be tweaked to see what works best. 4 was the optimal value in the paper by Zinsser and Keck. + +// We have PROJ_PER_KERNEL projections and we need 6 parameters for each projection: +// deltaX, deltaY, deltaZ, xyzOrigin, offOrig, offDetec +// So we need to keep PROJ_PER_KERNEL*6 values in our deltas array FOR EACH CALL to our main kernel +// (they will be updated in the main loop before each kernel call). 
+ +__constant__ Point3D projParamsArrayDev[6*PROJ_PER_KERNEL]; // Dev means it is on device + +// We also need a corresponding array on the host side to be filled before each kernel call, then copied to the device (array in constant memory above) +// Point3D projParamsArrayHost[6*PROJ_PER_KERNEL]; // Host means it is host memory + +// Now we also need to store sinAlpha and cosAlpha for each projection (two floats per projection) +__constant__ float projSinCosArrayDev[5*PROJ_PER_KERNEL]; + + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// END RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + +//______________________________________________________________________________ +// +// Function: kernelPixelBackprojectionFDK +// +// Description: Main FDK backprojection kernel +//______________________________________________________________________________ + +__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +{ + + // Old kernel call signature: + // kernelPixelBackprojectionFDK<<>>(geo,dimage,i,deltaX,deltaY,deltaZ,xyzOrigin,offOrig,offDetec,sinalpha,cosalpha); + // We just read in most of the params from the constant memory instead of getting them from the param list. + // This is because we now have MANY params, since single kernel processes more than one projection! 
+ /* __global__ void kernelPixelBackprojectionFDK(const Geometry geo, + * float* image, + * const int indAlpha, + * const Point3D deltaX , + * const Point3D deltaY, + * const Point3D deltaZ, + * const Point3D xyzOrigin, + * const Point3D xyzOffset, + * const Point3D uv0Offset, + * const float sinalpha, + * const float cosalpha){ + */ + unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long indX = blockIdx.x * blockDim.x + threadIdx.x; + // unsigned long startIndZ = blockIdx.z * blockDim.z + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle + unsigned long long startIndZ = blockIdx.z * VOXELS_PER_THREAD + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle + //Make sure we don't go out of bounds + if (indX>=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) + return; + + // We'll keep a local auxiliary array of values of a column of voxels that this thread will update + float voxelColumn[VOXELS_PER_THREAD]; + + // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then + // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes + + unsigned long colIdx; +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. 
+ } // END copy 3D volume voxels to local array + + // Now iterate through projections +#pragma unroll + for(unsigned long projNumber=0; projNumber=totalNoOfProjections) + break; + + Point3D deltaX = projParamsArrayDev[6*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaY = projParamsArrayDev[6*projNumber+1]; + Point3D deltaZ = projParamsArrayDev[6*projNumber+2]; + Point3D xyzOrigin = projParamsArrayDev[6*projNumber+3]; + Point3D xyzOffset = projParamsArrayDev[6*projNumber+4]; + Point3D S = projParamsArrayDev[6*projNumber+5]; + + float sinalpha = projSinCosArrayDev[5*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection + float cosalpha = projSinCosArrayDev[5*projNumber+1]; + float COR = projSinCosArrayDev[5*projNumber+2]; + float DSD = projSinCosArrayDev[5*projNumber+3]; + float DSO = projSinCosArrayDev[5*projNumber+4]; + + float auxCOR=COR/geo.dDetecU; + // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. + Point3D P; + P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); + P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-auxCOR; + P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); + + // This is the vector defining the line from the source to the Voxel + float vectX,vectY,vectZ; + vectX=(P.x -S.x); + vectY=(P.y -S.y); + vectZ=(P.z -S.z); + + // Get the coordinates in the detector UV where the mid point of the voxel is projected. 
+ float t=__fdividef(DSO-DSD-S.x,vectX); + float y,z; + y=vectY*t+S.y; + z=vectZ*t+S.z; + float u,v; + u=y+(float)geo.nDetecU*0.5f; + v=z+(float)geo.nDetecV*0.5f; + + float weight; + float realx,realy; + realx=-(geo.sVoxelX-geo.dVoxelX)*0.5f +indX*geo.dVoxelX +xyzOffset.x; + realy=-(geo.sVoxelY-geo.dVoxelY)*0.5f +indY*geo.dVoxelY +xyzOffset.y+COR; + + weight=__fdividef(DSO+realy*sinalpha-realx*cosalpha,DSO); + + weight=__frcp_rd(weight*weight); + + // Get Value in the computed (U,V) and multiply by the corresponding weight. + // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) + +#if IS_FOR_MATLAB_TIGRE + voxelColumn[colIdx]+=tex3D(tex, v, u ,indAlpha+0.5f)*weight; +#else + voxelColumn[colIdx]+=tex3D(tex, u, v ,indAlpha+0.5f)*weight; +#endif + } // END iterating through column of voxels + + } // END iterating through multiple projections + + // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. + // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. + // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is + // better for avoiding memory congestion. 
+ } // END copy updated voxels from local array to our 3D volume + +} // END kernelPixelBackprojectionFDK + + + + +//______________________________________________________________________________ +// +// Function: voxel_backprojection +// +// Description: Main host function for FDK backprojection (invokes the kernel) +//______________________________________________________________________________ + +int voxel_backprojection(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) +{ + // printf("voxel_backprojection(geo.nDetector = %d, %d)\n", geo.nDetecU, geo.nDetecV); + // printf("geo.nVoxel = %d, %d, %d\n", geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","There are no available device(s) that support CUDA\n"); + } + + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + + int dev; + // Split the CT problem + unsigned int split_image; + unsigned int split_projections; + splitCTbackprojection(gpuids,geo,nalpha,&split_image,&split_projections); + + + cudaCheckErrors("Error"); + //Pagelock memory for synchronous copy. + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to + // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ + cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + } + if (isHostRegisterSupported ){ + cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + } +#endif + cudaCheckErrors("Error pinning memory"); + + + // Create the arrays for the geometry. The main difference is that geo.offZ has been tuned for the + // image slices. The rest of the Geometry is the same + Geometry* geoArray=(Geometry*)malloc(split_image*deviceCount*sizeof(Geometry)); + createGeoArray(split_image*deviceCount,geo,geoArray,nalpha); + + // Now lest allocate all the image memory on the GPU, so we can use it later. If we have made our numbers correctly + // in the previous section this should leave enough space for the textures. + size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); + float** dimage=(float**)malloc(deviceCount*sizeof(float*)); + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("cudaMalloc fail"); + } + + //If it is the first time, lets make sure our image is zeroed. 
+ int nStreamDevice=2; + int nStreams=deviceCount*nStreamDevice; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < nStreamDevice; ++i){ + cudaStreamCreate(&stream[i+dev*nStreamDevice]); + + } + } + + + + + // Kernel auxiliary variables + Point3D* projParamsArrayHost; + cudaMallocHost((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); + float* projSinCosArrayHost; + cudaMallocHost((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); + + + // Texture object variables + cudaTextureObject_t *texProj; + cudaArray **d_cuArrTex; + texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); + d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + + // Auxiliary Host page-locked memory for fast and asycnornous memcpy. + + // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop + // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution + // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, + // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. + unsigned long long proj_linear_idx_start; + unsigned int proj_split_overlap_number; + unsigned int current_proj_split_size,current_proj_overlap_split_size; + size_t num_bytes_img_curr; + size_t img_linear_idx_start; + float** partial_projection; + size_t* proj_split_size; + + + + for(unsigned int img_slice=0;img_slice=proj_split_size[proj_block_split]) + break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. + if(currProjNumber_global>=nalpha) + break; // Exit the loop. 
Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. + + Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, /*offDetec,*/source; + float sinalpha,cosalpha; + + geoArray[img_slice*deviceCount+dev].alpha=-alphas[currProjNumber_global*3];//we got 3 angles now. + geoArray[img_slice*deviceCount+dev].theta=-alphas[currProjNumber_global*3+1]; + geoArray[img_slice*deviceCount+dev].psi =-alphas[currProjNumber_global*3+2]; + +// mexPrintf("%u %f \n",i,geoArray[img_slice*deviceCount+dev].alpha); +// mexPrintf("%u \n",currProjNumber_global); + + sinalpha=sin(geoArray[img_slice*deviceCount+dev].alpha); + cosalpha=cos(geoArray[img_slice*deviceCount+dev].alpha); + + projSinCosArrayHost[5*j]=sinalpha; // 2*j because we have 2 float (sin or cos angle) values per projection + projSinCosArrayHost[5*j+1]=cosalpha; + projSinCosArrayHost[5*j+2]=geo.COR[currProjNumber_global]; + projSinCosArrayHost[5*j+3]=geo.DSD[currProjNumber_global]; + projSinCosArrayHost[5*j+4]=geo.DSO[currProjNumber_global]; + + computeDeltasCube(geoArray[img_slice*deviceCount+dev],currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); + + offOrig.x=geo.offOrigX[currProjNumber_global]; + offOrig.y=geo.offOrigY[currProjNumber_global]; + offOrig.z=geoArray[img_slice*deviceCount+dev].offOrigZ[currProjNumber_global]; + + projParamsArrayHost[6*j]=deltaX; // 6*j because we have 6 Point3D values per projection + projParamsArrayHost[6*j+1]=deltaY; + projParamsArrayHost[6*j+2]=deltaZ; + projParamsArrayHost[6*j+3]=xyzOrigin; + projParamsArrayHost[6*j+4]=offOrig; + projParamsArrayHost[6*j+5]=source; + } // END for (preparing params for kernel call) + + // Copy the prepared parameter arrays to constant memory to make it available for the kernel + cudaMemcpyToSymbolAsync(projSinCosArrayDev, projSinCosArrayHost, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); + cudaMemcpyToSymbolAsync(projParamsArrayDev, 
projParamsArrayHost, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); + cudaStreamSynchronize(stream[dev*nStreamDevice]); + + kernelPixelBackprojectionFDK<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); + } // END for + ////////////////////////////////////////////////////////////////////////////////////// + // END RB code, Main reconstruction loop: go through projections (rotation angles) and backproject + ////////////////////////////////////////////////////////////////////////////////////// + }// END for deviceCount + } // END sub-split of current projection chunk + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + + } // END projection splits + + + // Now we need to take the image out of the GPU + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + // We do not need to sycnronize because the array dealocators already do. 
+ num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); + img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); + cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + cudaCheckErrors("Main loop fail"); + } + + } // end image splits + + ///////// Cleaning: + + + bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; + for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) + if (!two_buffers_used && i==1) + break; + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDestroyTextureObject(texProj[i*deviceCount+dev]); + cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + } + } + cudaCheckErrors("cudadestroy textures result fail"); + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dimage[dev]); + } + cudaFreeHost(projSinCosArrayHost); + cudaFreeHost(projParamsArrayHost); + free(partial_projection); + free(proj_split_size); + + freeGeoArray(split_image*deviceCount,geoArray); +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ + cudaHostUnregister(result); + } + if (isHostRegisterSupported){ + cudaHostUnregister(projections); + } +#endif + + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]); + + cudaCheckErrors("cudaFree fail"); + + //cudaDeviceReset(); // For the Nvidia Visual Profiler + return 0; + +} // END voxel_backprojection +// + +void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsigned int* split_image, unsigned int * split_projections){ + + + // We don't know if the 
devices are being used. lets check that. and only use the amount of memory we need. + + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + + const int deviceCount = gpuids.GetLength(); + + // Compute how much memory each of the relevant memory pieces need + size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); + size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); + + + + + // Does everything fit in the GPU? + + if(mem_image/deviceCount+mem_proj*PROJ_PER_KERNEL*2(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + + } + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemcpy3DParms copyParams = {0}; + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[dev]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); + } + + //Array creation End + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[dev]; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + } +} + +//______________________________________________________________________________ +// +// Function: createGeoArray +// +// Description: This code 
generates the geometries needed to split the image properly in +// cases where the entire image does not fit in the memory of the GPU +//______________________________________________________________________________ + +void createGeoArray(unsigned int image_splits, Geometry geo,Geometry* geoArray, unsigned int nangles){ + + + unsigned int splitsize=(geo.nVoxelZ+image_splits-1)/image_splits; + + for(unsigned int sp=0;spx; + auxPoint.y=point->y; + auxPoint.z=point->z; + + // calculate sin and cos of 3 angles (used multiple times) + double sin_alpha, cos_alpha, sin_theta, cos_theta, sin_psi, cos_psi; + sin_alpha = sin((double)geo.alpha); + cos_alpha = cos((double)geo.alpha); + sin_theta = sin((double)geo.theta); + cos_theta = cos((double)geo.theta); + sin_psi = sin((double)geo.psi); + cos_psi = cos((double)geo.psi); + + point->x = auxPoint.x*(cos_psi*cos_theta*cos_alpha-sin_psi*sin_alpha) + +auxPoint.y*(-cos_psi*cos_theta*sin_alpha-sin_psi*cos_alpha) + +auxPoint.z*cos_psi*sin_theta; + point->y = auxPoint.x*(sin_psi*cos_theta*cos_alpha+cos_psi*sin_alpha) + +auxPoint.y*(-sin_psi*cos_theta*sin_alpha+cos_psi*cos_alpha) + +auxPoint.z*sin_psi*sin_theta; + point->z =-auxPoint.x*sin_theta*cos_alpha + +auxPoint.y*sin_theta*sin_alpha + +auxPoint.z*cos_theta; +} + +void rollPitchYawT(Geometry geo,int i, Point3Ddouble* point){ + + Point3Ddouble auxPoint; + auxPoint.x=point->x; + auxPoint.y=point->y; + auxPoint.z=point->z; + + // calculate sin and cos of 3 angles (used multiple times) + double sin_dRoll, cos_dRoll, sin_dPitch, cos_dPitch, sin_dYaw, cos_dYaw; + sin_dRoll = sin((double)geo.dRoll[i]); + cos_dRoll = cos((double)geo.dRoll[i]); + sin_dPitch = sin((double)geo.dPitch[i]); + cos_dPitch = cos((double)geo.dPitch[i]); + sin_dYaw = sin((double)geo.dYaw[i]); + cos_dYaw = cos((double)geo.dYaw[i]); + + point->x=cos_dRoll*cos_dPitch*auxPoint.x + +sin_dRoll*cos_dPitch*auxPoint.y + -sin_dPitch*auxPoint.z; + + point->y=(cos_dRoll*sin_dPitch*sin_dYaw - sin_dRoll*cos_dYaw)*auxPoint.x + 
+(sin_dRoll*sin_dPitch*sin_dYaw + cos_dRoll*cos_dYaw)*auxPoint.y + +cos_dPitch*sin_dYaw*auxPoint.z; + + point->z=(cos_dRoll*sin_dPitch*cos_dYaw + sin_dRoll*sin_dYaw)*auxPoint.x + +(sin_dRoll*sin_dPitch*cos_dYaw - cos_dRoll*sin_dYaw)*auxPoint.y + +cos_dPitch*cos_dYaw*auxPoint.z; +} + +//______________________________________________________________________________ +// +// Function: computeDeltasCube +// +// Description: Computes relative increments for each projection (volume rotation). +// Increments get passed to the backprojection kernel. +//______________________________________________________________________________ + +void computeDeltasCube(Geometry geo,int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D* S) +{ + + // initialize points with double precision + Point3Ddouble P, Px,Py,Pz; + + // Get coords of Img(0,0,0) + P.x=-(geo.sVoxelX/2-geo.dVoxelX/2)+geo.offOrigX[i]; + P.y=-(geo.sVoxelY/2-geo.dVoxelY/2)+geo.offOrigY[i]; + P.z=-(geo.sVoxelZ/2-geo.dVoxelZ/2)+geo.offOrigZ[i]; + + // Get coords from next voxel in each direction + Px.x=P.x+geo.dVoxelX; Py.x=P.x; Pz.x=P.x; + Px.y=P.y; Py.y=P.y+geo.dVoxelY; Pz.y=P.y; + Px.z=P.z; Py.z=P.z; Pz.z=P.z+geo.dVoxelZ; + + // Rotate image around X axis (this is equivalent of rotating the source and detector) RZ RY RZ + eulerZYZT(geo,&P); + eulerZYZT(geo,&Px); + eulerZYZT(geo,&Py); + eulerZYZT(geo,&Pz); + + //detector offset + P.z =P.z-geo.offDetecV[i]; P.y =P.y-geo.offDetecU[i]; + Px.z =Px.z-geo.offDetecV[i]; Px.y =Px.y-geo.offDetecU[i]; + Py.z =Py.z-geo.offDetecV[i]; Py.y =Py.y-geo.offDetecU[i]; + Pz.z =Pz.z-geo.offDetecV[i]; Pz.y =Pz.y-geo.offDetecU[i]; + + //Detector Roll pitch Yaw + // + // first, we need to offset everything so (0,0,0) is the center of the detector + // Only X is required for that + P.x=P.x+(geo.DSD[i]-geo.DSO[i]); + Px.x=Px.x+(geo.DSD[i]-geo.DSO[i]); + Py.x=Py.x+(geo.DSD[i]-geo.DSO[i]); + Pz.x=Pz.x+(geo.DSD[i]-geo.DSO[i]); + rollPitchYawT(geo,i,&P); + 
rollPitchYawT(geo,i,&Px); + rollPitchYawT(geo,i,&Py); + rollPitchYawT(geo,i,&Pz); + + P.x=P.x-(geo.DSD[i]-geo.DSO[i]); + Px.x=Px.x-(geo.DSD[i]-geo.DSO[i]); + Py.x=Py.x-(geo.DSD[i]-geo.DSO[i]); + Pz.x=Pz.x-(geo.DSD[i]-geo.DSO[i]); + //Done for P, now source + Point3Ddouble source; + source.x=geo.DSD[i]; //already offseted for rotation + source.y=-geo.offDetecU[i]; + source.z=-geo.offDetecV[i]; + rollPitchYawT(geo,i,&source); + + source.x=source.x-(geo.DSD[i]-geo.DSO[i]);// source.y=source.y-auxOff.y; source.z=source.z-auxOff.z; + +// mexPrintf("%f,%f,%f\n",source.x,source.y,source.z); + // Scale coords so detector pixels are 1x1 + + P.z =P.z /geo.dDetecV; P.y =P.y/geo.dDetecU; + Px.z=Px.z/geo.dDetecV; Px.y=Px.y/geo.dDetecU; + Py.z=Py.z/geo.dDetecV; Py.y=Py.y/geo.dDetecU; + Pz.z=Pz.z/geo.dDetecV; Pz.y=Pz.y/geo.dDetecU; + + source.z=source.z/geo.dDetecV; source.y=source.y/geo.dDetecU; + + // get deltas of the changes in voxels + deltaX->x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; + deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; + deltaZ->x=Pz.x-P.x; deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; + + // cast the results from the double precision calculations back to float + *xyzorigin=P.to_float(); + *S=source.to_float(); +} + +void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ + size_t memfree; + size_t memtotal; + const int deviceCount = gpuids.GetLength(); + + for (int dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemGetInfo(&memfree,&memtotal); + if(dev==0) *mem_GPU_global=memfree; + if(memfree -#include -#include +#include +#include #include "voxel_backprojection2.hpp" #include "TIGRE_common.hpp" #include @@ -55,10 +56,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s 
\n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -92,7 +93,7 @@ do { \ **/ // this definitionmust go here. -void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate); +void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate); __global__ void matrixConstantMultiply(const Geometry geo,float* image,float constant){ size_t idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -139,7 +140,7 @@ __constant__ float projSinCosArray2Dev[5*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel //______________________________________________________________________________ -__global__ void kernelPixelBackprojection(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +__global__ void kernelPixelBackprojection(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; @@ -355,9 +356,9 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); float** dimage=(float**)malloc(deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); + hipSetDevice(gpuids[dev]); + hipMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("hipMalloc fail"); } @@ -366,15 +367,15 @@ int voxel_backprojection2(float * projections, Geometry geo, 
float* result,float // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & split_image>1){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -385,27 +386,27 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float //If it is the first time, lets make sure our image is zeroed. 
int nStreamDevice=2; int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); + hipStreamCreate(&stream[i+dev*nStreamDevice]); } } // Kernel auxiliary variables Point3D* projParamsArray2Host; - cudaMallocHost((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArray2Host; - cudaMallocHost((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + texProj =(hipTextureObject_t*)malloc(deviceCount*2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(deviceCount*2*sizeof(hipArray*)); @@ -425,8 +426,8 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float // // Initialize the memory if its the first time. 
for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemset(dimage[dev],0,num_bytes_img); + hipSetDevice(gpuids[dev]); + hipMemset(dimage[dev],0,num_bytes_img); cudaCheckErrors("memset fail"); } @@ -478,8 +479,8 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float (proj_block_split<2)&!proj&!img_slice);// Only allocate if its the first 2 calls for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*nStreamDevice+1]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*nStreamDevice+1]); } for (dev = 0; dev < deviceCount; dev++){ @@ -489,7 +490,7 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float if(geoArray[img_slice*deviceCount+dev].nVoxelZ==0) break; - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); @@ -566,9 +567,9 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float } // END for (preparing params for kernel call) // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArray2Dev, projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaMemcpyToSymbolAsync(projParamsArray2Dev, projParamsArray2Host, sizeof(Point3D)*7*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projSinCosArray2Dev), projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[dev*nStreamDevice]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArray2Dev), projParamsArray2Host, sizeof(Point3D)*7*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[dev*nStreamDevice]); + hipStreamSynchronize(stream[dev*nStreamDevice]); 
kernelPixelBackprojection<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); } // END for @@ -581,24 +582,24 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float } // END projection splits for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); matrixConstantMultiply<<<60,MAXTREADS,0,stream[dev*nStreamDevice]>>>( geoArray[img_slice*deviceCount+dev],dimage[dev],geo.dVoxelX*geo.dVoxelY*geo.dVoxelZ/(geo.dDetecU*geo.dDetecV)); } // Now we need to take the image out of the GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*nStreamDevice]); num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + hipMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, hipMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); } } // end image splits for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } @@ -607,40 +608,40 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) if (!two_buffers_used && i==1) break; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + 
hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texProj[i*deviceCount+dev]); + hipFreeArray(d_cuArrTex[i*deviceCount+dev]); } } free(d_cuArrTex); free(texProj); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); + hipSetDevice(gpuids[dev]); + hipFree(dimage[dev]); } free(dimage); - cudaFreeHost(projSinCosArray2Host); - cudaFreeHost(projParamsArray2Host); + hipHostFree(projSinCosArray2Host); + hipHostFree(projParamsArray2Host); free(partial_projection); free(proj_split_size); freeGeoArray(split_image*deviceCount,geoArray); #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & split_image>1){ - cudaHostUnregister(result); + hipHostUnregister(result); } if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } #endif for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); -// cudaDeviceReset(); // For the Nvidia Visual Profiler +// hipDeviceReset(); // For the Nvidia Visual Profiler return 0; } // END voxel_backprojection @@ -649,52 +650,52 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float -void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ +void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; int num_devices = gpuids.GetLength(); #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, 
geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif if (allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); } //Array creation End for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = 
cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } } #ifndef BACKPROJECTION_HPP @@ -826,8 +827,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ const int gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree +#include +#include +#include "voxel_backprojection2.hpp" +#include "TIGRE_common.hpp" +#include +#include "GpuIds.hpp" + +// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + } \ +} while (0) + + +#define MAXTREADS 1024 + /*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * *--->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ + +// this definitionmust go here. 
+void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate); + +__global__ void matrixConstantMultiply(const Geometry geo,float* image,float constant){ + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + for(; idx=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) + return; + + // We'll keep a local auxiliary array of values of a column of voxels that this thread will update + float voxelColumn[VOXELS_PER_THREAD]; + + // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then + // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes + + unsigned long colIdx; +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. 
+ } // END copy 3D volume voxels to local array + + // Now iterate through projections +#pragma unroll + for(unsigned long projNumber=0; projNumber=totalNoOfProjections) + break; + + Point3D deltaX = projParamsArray2Dev[7*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaY = projParamsArray2Dev[7*projNumber+1]; + Point3D deltaZ = projParamsArray2Dev[7*projNumber+2]; + Point3D xyzOrigin = projParamsArray2Dev[7*projNumber+3]; + Point3D xyzOffset = projParamsArray2Dev[7*projNumber+4]; + Point3D uv0Offset = projParamsArray2Dev[7*projNumber+5]; + Point3D S = projParamsArray2Dev[7*projNumber+6]; + + float sinalpha = projSinCosArray2Dev[5*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection + float cosalpha = projSinCosArray2Dev[5*projNumber+1]; + float COR = projSinCosArray2Dev[5*projNumber+2]; + float DSD = projSinCosArray2Dev[5*projNumber+3]; + float DSO = projSinCosArray2Dev[5*projNumber+4]; + // Precomputations for the weights: + //Real coords of Source + // We already have S.x (geo.DSO), and S.y and S.z are always zero. we just need to rotate + Point3D realS; + realS.x= DSO*cosalpha; + realS.y=-DSO*sinalpha; + realS.z=0; + + + Point3D realvoxel_init; + realvoxel_init.x=-geo.sVoxelX/2+geo.dVoxelX/2+xyzOffset.x; + realvoxel_init.y=-geo.sVoxelY/2+geo.dVoxelY/2+xyzOffset.y; + realvoxel_init.z=-geo.sVoxelZ/2+geo.dVoxelZ/2+xyzOffset.z; + // Real XYZ coordinates of Detector. + Point3D realD, realDaux; + // We know the index of the detector (u,v). Start from there. + realDaux.x=-(DSD-DSO); + + // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. 
+ Point3D P; + P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); + P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-COR/geo.dDetecU; + P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); + + // This is the vector defining the line from the source to the Voxel + float vectX,vectY,vectZ; + vectX=(P.x -S.x); + vectY=(P.y -S.y); + vectZ=(P.z -S.z); + + // Get the coordinates in the detector UV where the mid point of the voxel is projected. + float t=__fdividef(DSO-DSD-S.x,vectX); + float y,z; + y=vectY*t+S.y; + z=vectZ*t+S.z; + float u,v; + u=y+(float)geo.nDetecU*0.5f; + v=z+(float)geo.nDetecV*0.5f; +#if IS_FOR_MATLAB_TIGRE + float sample=tex3D(tex, v, u ,indAlpha+0.5f); +#else + float sample=tex3D(tex, u, v ,indAlpha+0.5f); +#endif + float weight=0; + // + // + // + // IMPORTANT: The weights are almost 50% of the computational time. Is there a way of speeding this up?? + // + //Real coordinates of Voxel. Instead of reverting the transformation, its less math (faster) to compute it from the indexes. + Point3D realvoxel; + + realvoxel.x=realvoxel_init.x+indX*geo.dVoxelX; + realvoxel.y=realvoxel_init.y+indY*geo.dVoxelY; + realvoxel.z=realvoxel_init.z+indZ*geo.dVoxelZ; + + + + realDaux.y=(-geo.sDetecU+geo.dDetecU)*0.5f + u*geo.dDetecU +uv0Offset.x; + realD.z =(-geo.sDetecV+geo.dDetecV)*0.5f + v*geo.dDetecV +uv0Offset.y; + //rotate the detector + realD.x= realDaux.x*cosalpha + realDaux.y*sinalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) + realD.y=-realDaux.x*sinalpha + realDaux.y*cosalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) + float L,lsq; + + L = __fsqrt_rd( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. + lsq = (realS.x-realvoxel.x)*(realS.x-realvoxel.x) + + (realS.y-realvoxel.y)*(realS.y-realvoxel.y) + + (realS.z-realvoxel.z)*(realS.z-realvoxel.z); + + weight=__fdividef(L*L*L,(DSD*lsq)); +// weight=1; + // Get Value in the computed (U,V) and multiply by the corresponding weight. 
+ // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) + voxelColumn[colIdx]+=sample* weight; + } // END iterating through column of voxels + + } // END iterating through multiple projections + + // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. + // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. + // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is + // better for avoiding memory congestion. 
+ } // END copy updated voxels from local array to our 3D volume + +} // END kernelPixelBackprojectionFDK + + + + +//______________________________________________________________________________ +// +// Function: voxel_backprojection +// +// Description: Main host function for FDK backprojection (invokes the kernel) +//______________________________________________________________________________ + +int voxel_backprojection2(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids){ + + + + + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","There are no available device(s) that support CUDA\n"); + } + + + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("Atb:Voxel_backprojection2:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + + int dev; + + + // Split the CT problem + unsigned int split_image; + unsigned int split_projections; + splitCTbackprojection(gpuids,geo,nalpha,&split_image,&split_projections); + + + // Create the arrays for the geometry. The main difference is that geo.offZ has been tuned for the + // image slices. The rest of the Geometry is the same + Geometry* geoArray=(Geometry*)malloc(split_image*deviceCount*sizeof(Geometry)); + createGeoArray(split_image*deviceCount,geo,geoArray,nalpha); + + // Now lest allocate all the image memory on the GPU, so we can use it later. 
If we have made our numbers correctly + // in the previous section this should leave enough space for the textures. + size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); + float** dimage=(float**)malloc(deviceCount*sizeof(float*)); + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("cudaMalloc fail"); + } + + + //Pagelock memory for synchronous copy. + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to + // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. + if (isHostRegisterSupported & split_image>1){ + cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + } + if (isHostRegisterSupported ){ + cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); + + + + + + //If it is the first time, lets make sure our image is zeroed. 
+ int nStreamDevice=2; + int nStreams=deviceCount*nStreamDevice; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < nStreamDevice; ++i){ + cudaStreamCreate(&stream[i+dev*nStreamDevice]); + + } + } + + // Kernel auxiliary variables + Point3D* projParamsArray2Host; + cudaMallocHost((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); + float* projSinCosArray2Host; + cudaMallocHost((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); + + // Texture object variables + cudaTextureObject_t *texProj; + cudaArray **d_cuArrTex; + texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); + d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + + + + unsigned int proj_split_overlap_number; + // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop + // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution + // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, + // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. + unsigned long long proj_linear_idx_start; + unsigned int current_proj_split_size,current_proj_overlap_split_size; + size_t num_bytes_img_curr; + size_t img_linear_idx_start; + float** partial_projection; + size_t* proj_split_size; + + for(unsigned int img_slice=0;img_slice=proj_split_size[proj_block_split]) + break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. + if(currProjNumber_global>=nalpha) + break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. 
+ + Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, offDetec,source; + float sinalpha,cosalpha; + + geoArray[img_slice*deviceCount+dev].alpha=-alphas[currProjNumber_global*3];//we got 3 angles now. + geoArray[img_slice*deviceCount+dev].theta=-alphas[currProjNumber_global*3+1]; + geoArray[img_slice*deviceCount+dev].psi =-alphas[currProjNumber_global*3+2]; + + sinalpha=sin(geoArray[img_slice*deviceCount+dev].alpha); + cosalpha=cos(geoArray[img_slice*deviceCount+dev].alpha); + + projSinCosArray2Host[5*j]=sinalpha; // 2*j because we have 2 float (sin or cos angle) values per projection + projSinCosArray2Host[5*j+1]=cosalpha; + projSinCosArray2Host[5*j+2]=geo.COR[currProjNumber_global]; + projSinCosArray2Host[5*j+3]=geo.DSD[currProjNumber_global]; + projSinCosArray2Host[5*j+4]=geo.DSO[currProjNumber_global]; + + computeDeltasCube(geoArray[img_slice*deviceCount+dev],currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); + + offOrig.x=geo.offOrigX[currProjNumber_global]; + offOrig.y=geo.offOrigY[currProjNumber_global]; + offOrig.z=geoArray[img_slice*deviceCount+dev].offOrigZ[currProjNumber_global]; + + offDetec.x=geo.offDetecU[currProjNumber_global]; + offDetec.y=geo.offDetecV[currProjNumber_global]; + offDetec.z=0;//unused + + projParamsArray2Host[7*j] =deltaX; // 7*j because we have 7 Point3D values per projection + projParamsArray2Host[7*j+1]=deltaY; + projParamsArray2Host[7*j+2]=deltaZ; + projParamsArray2Host[7*j+3]=xyzOrigin; + projParamsArray2Host[7*j+4]=offOrig; + projParamsArray2Host[7*j+5]=offDetec; + projParamsArray2Host[7*j+6]=source; + + } // END for (preparing params for kernel call) + + // Copy the prepared parameter arrays to constant memory to make it available for the kernel + cudaMemcpyToSymbolAsync(projSinCosArray2Dev, projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); + cudaMemcpyToSymbolAsync(projParamsArray2Dev, projParamsArray2Host, 
sizeof(Point3D)*7*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); + cudaStreamSynchronize(stream[dev*nStreamDevice]); + kernelPixelBackprojection<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); + + } // END for + ////////////////////////////////////////////////////////////////////////////////////// + // END RB code, Main reconstruction loop: go through projections (rotation angles) and backproject + ////////////////////////////////////////////////////////////////////////////////////// + } + } // END sub-split of current projection chunk + + } // END projection splits + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + matrixConstantMultiply<<<60,MAXTREADS,0,stream[dev*nStreamDevice]>>>( geoArray[img_slice*deviceCount+dev],dimage[dev],geo.dVoxelX*geo.dVoxelY*geo.dVoxelZ/(geo.dDetecU*geo.dDetecV)); + } + + // Now we need to take the image out of the GPU + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaStreamSynchronize(stream[dev*nStreamDevice]); + + num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); + img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); + cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + } + } // end image splits + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + + + // Clean the GPU + bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; + for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) + if (!two_buffers_used && i==1) + break; for (dev = 0; dev < deviceCount; 
dev++){ + cudaSetDevice(gpuids[dev]); + cudaDestroyTextureObject(texProj[i*deviceCount+dev]); + cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + } + } + free(d_cuArrTex); + free(texProj); + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dimage[dev]); + } + free(dimage); + + cudaFreeHost(projSinCosArray2Host); + cudaFreeHost(projParamsArray2Host); + free(partial_projection); + free(proj_split_size); + + freeGeoArray(split_image*deviceCount,geoArray); +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & split_image>1){ + cudaHostUnregister(result); + } + if (isHostRegisterSupported){ + cudaHostUnregister(projections); + } +#endif + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]); + + cudaCheckErrors("cudaFree fail"); + +// cudaDeviceReset(); // For the Nvidia Visual Profiler + return 0; + +} // END voxel_backprojection + + + + + +void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ + //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; + int num_devices = gpuids.GetLength(); +#if IS_FOR_MATLAB_TIGRE + const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); +#else + const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); +#endif + if (allocate){ + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + + //cudaArray Descriptor + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + + } + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemcpy3DParms copyParams = {0}; + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[dev]; + 
copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); + } + + //Array creation End + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[dev]; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + } +} +#ifndef BACKPROJECTION_HPP +void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsigned int* split_image, unsigned int * split_projections){ + + + // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. + + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + const int deviceCount = gpuids.GetLength(); + + // Compute how much memory each of the relevant memory pieces need + size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); + size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); + + + + + // Does everything fit in the GPU? 
+ + if(mem_image/deviceCount+mem_proj*PROJ_PER_KERNEL*2x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; + deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; + deltaZ->x=Pz.x-P.x; deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; + + + *xyzorigin=P.to_float(); + *S=source.to_float(); +} // END computeDeltasCube + +void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ + size_t memfree; + size_t memtotal; + const int gpuids.GetLength(); + + for (int dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemGetInfo(&memfree,&memtotal); + if(dev==0) *mem_GPU_global=memfree; + if(memfree -#include -#include +#include +#include #include "voxel_backprojection.hpp" #include "voxel_backprojection_parallel.hpp" @@ -57,10 +58,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -92,7 +93,7 @@ do { \ * * **/ -void CreateTextureParallel( float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool allocate); +void CreateTextureParallel( float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, bool allocate); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call @@ -135,7 +136,7 @@ __constant__ float projSinCosArrayDevParallel[3*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel 
//______________________________________________________________________________ -__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,cudaTextureObject_t tex) +__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,hipTextureObject_t tex) { // Old kernel call signature: @@ -286,9 +287,9 @@ __global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* im int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) { if (gpuids.GetLength() == 0) { - cudaSetDevice(0); + hipSetDevice(0); } else { - cudaSetDevice(gpuids[0]); + hipSetDevice(gpuids[0]); } /* @@ -298,10 +299,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re //If it is the first time, lets make sure our image is zeroed. int nStreamDevice=2; int nStreams=nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } @@ -310,10 +311,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif if (isHostRegisterSupported){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -321,22 +322,22 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // Allocate result image memory size_t num_bytes = geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ * sizeof(float); float* dimage; - cudaMalloc((void**)&dimage, num_bytes); - cudaMemset(dimage,0,num_bytes); - cudaCheckErrors("cudaMalloc fail"); + hipMalloc((void**)&dimage, num_bytes); + hipMemset(dimage,0,num_bytes); + cudaCheckErrors("hipMalloc fail"); Point3D* projParamsArrayHostParallel; - cudaMallocHost((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArrayHostParallel; - cudaMallocHost((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); // Texture buffer objects - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + texProj =(hipTextureObject_t*)malloc(2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(2*sizeof(hipArray*)); @@ -389,7 +390,7 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re (proj_block_split<2));// 
Only allocate if its the first 2 calls - cudaStreamSynchronize(stream[0+1]); + hipStreamSynchronize(stream[0+1]); @@ -464,9 +465,9 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArrayDevParallel, projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projParamsArrayDevParallel, projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projSinCosArrayDevParallel), projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDevParallel), projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelBackprojection_parallel<<>>(geo,dimage,i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)]); } // END for @@ -475,9 +476,9 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // END Main reconstruction loop: go through projections (rotation angles) and backproject ////////////////////////////////////////////////////////////////////////////////////// } - cudaDeviceSynchronize(); - cudaMemcpy(result, dimage, num_bytes, cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy result fail"); + hipDeviceSynchronize(); + hipMemcpy(result, dimage, num_bytes, hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy result fail"); free(partial_projection); free(proj_split_size); @@ -486,23 +487,23 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) if (!two_buffers_used && i==1) break; - cudaDestroyTextureObject(texProj[i]); - 
cudaFreeArray(d_cuArrTex[i]); + hipDestroyTextureObject(texProj[i]); + hipFreeArray(d_cuArrTex[i]); } free(texProj); free(d_cuArrTex); - cudaFreeHost(projSinCosArrayHostParallel); - cudaFreeHost(projParamsArrayHostParallel); + hipHostFree(projSinCosArrayHostParallel); + hipHostFree(projParamsArrayHostParallel); - cudaFree(dimage); + hipFree(dimage); if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } // END voxel_backprojection @@ -583,45 +584,45 @@ void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* } // END computeDeltasCube -void CreateTextureParallel(float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool alloc) +void CreateTextureParallel(float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, bool alloc) { - //cudaArray Descriptor + //hipArray Descriptor #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array if (alloc){ - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); cudaCheckErrors("Texture memory allocation fail"); } - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, 
extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[0+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[0+1]); cudaCheckErrors("Texture memory data copy fail"); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); cudaCheckErrors("Texture object creation fail"); } \ No newline at end of file diff --git a/Common/CUDA/voxel_backprojection_parallel.cu.prehip b/Common/CUDA/voxel_backprojection_parallel.cu.prehip new file mode 100644 index 00000000..03703576 --- /dev/null +++ b/Common/CUDA/voxel_backprojection_parallel.cu.prehip @@ -0,0 +1,627 @@ +/*------------------------------------------------------------------------- + * + * CUDA function 
for backrpojection for parallel beam + * + * + * CODE by Ander Biguri + * Optimized and modified by RB + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + + +#define PI_2 1.57079632679489661923 +#include +#include +#include +#include "voxel_backprojection.hpp" +#include "voxel_backprojection_parallel.hpp" + +#include "TIGRE_common.hpp" +#include + +// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + } \ +} while (0) + + +#define MAXTREADS 1024 + /*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * *--->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ +void CreateTextureParallel( float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool allocate); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// The optimal values of two constants obtained by RB on NVIDIA Quadro K2200 (4 GB RAM, 640 CUDA cores) for 512^3 volume and 512^3 projections (512 proj, each 512 x 512) were: +// PROJ_PER_KERNEL = 32 or 16 (very similar times) +// VOXELS_PER_THREAD = 8 +// Speedup of the 
entire FDK backprojection (not only kernel run, also memcpy etc.) was nearly 4x relative to the original (single projection, single voxel per thread) code. +// (e.g. 16.2 s vs. ~62 s). + +const int PROJ_PER_KERNEL = 32; // Number of 2D projections to be analyzed by a single thread. This can be tweaked to see what works best. 32 was the optimal value in the paper by Zinsser and Keck. +const int VOXELS_PER_THREAD = 8; // Number of voxels to be computed by s single thread. Can be tweaked to see what works best. 4 was the optimal value in the paper by Zinsser and Keck. + +// We have PROJ_PER_KERNEL projections and we need 6 parameters for each projection: +// deltaX, deltaY, deltaZ, xyzOrigin, offOrig, offDetec +// So we need to keep PROJ_PER_KERNEL*6 values in our deltas array FOR EACH CALL to our main kernel +// (they will be updated in the main loop before each kernel call). + +__constant__ Point3D projParamsArrayDevParallel[6*PROJ_PER_KERNEL]; // Dev means it is on device + +// We also need a corresponding array on the host side to be filled before each kernel call, then copied to the device (array in constant memory above) +// Point3D projParamsArrayHostParallel[6*PROJ_PER_KERNEL]; // Host means it is host memory + +// Now we also need to store sinAlpha and cosAlpha for each projection (two floats per projection) +__constant__ float projSinCosArrayDevParallel[3*PROJ_PER_KERNEL]; + +// float projSinCosArrayHostParallel[3*PROJ_PER_KERNEL]; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// END RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + +//______________________________________________________________________________ +// +// Function: kernelPixelBackprojectionFDK +// 
+// Description: Main FDK backprojection kernel +//______________________________________________________________________________ + +__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,cudaTextureObject_t tex) +{ + + // Old kernel call signature: + // kernelPixelBackprojectionFDK<<>>(geo,dimage,i,deltaX,deltaY,deltaZ,xyzOrigin,offOrig,offDetec,sinalpha,cosalpha); + // We just read in most of the params from the constant memory instead of getting them from the param list. + // This is because we now have MANY params, since single kernel processes more than one projection! + /* __global__ void kernelPixelBackprojectionFDK(const Geometry geo, + * float* image, + * const int indAlpha, + * const Point3D deltaX , + * const Point3D deltaY, + * const Point3D deltaZ, + * const Point3D xyzOrigin, + * const Point3D xyzOffset, + * const Point3D uv0Offset, + * const float sinalpha, + * const float cosalpha){ + */ + unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long indX = blockIdx.x * blockDim.x + threadIdx.x; + // unsigned long startIndZ = blockIdx.z * blockDim.z + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle + unsigned long long startIndZ = blockIdx.z * VOXELS_PER_THREAD + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle + //Make sure we don't go out of bounds + if (indX>=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) + return; + + // We'll keep a local auxiliary array of values of a column of voxels that this thread will update + float voxelColumn[VOXELS_PER_THREAD]; + + // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then + // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes + + unsigned long colIdx; + + 
for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. + } // END copy 3D volume voxels to local array + + // Now iterate through projections + for(unsigned long projNumber=0; projNumber=totalNoOfProjections) + break; + + Point3D deltaX = projParamsArrayDevParallel[6*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaY = projParamsArrayDevParallel[6*projNumber+1]; + Point3D deltaZ = projParamsArrayDevParallel[6*projNumber+2]; + Point3D xyzOrigin = projParamsArrayDevParallel[6*projNumber+3]; + Point3D xyzOffset = projParamsArrayDevParallel[6*projNumber+4]; + Point3D S = projParamsArrayDevParallel[6*projNumber+5]; + + float DSD = projSinCosArrayDevParallel[3*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection + float DSO = projSinCosArrayDevParallel[3*projNumber+1]; + float COR = projSinCosArrayDevParallel[3*projNumber+2]; + + // Geometric trasnformations: + //Source, scaled XYZ coordinates + + // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. 
+ Point3D P; + S.x=DSO; + P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); + P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-COR/geo.dDetecU; + P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); + S.y=P.y;S.z=P.z; + + // This is the vector defining the line from the source to the Voxel + float vectX,vectY,vectZ; + vectX=(P.x -S.x); + vectY=(P.y -S.y); + vectZ=(P.z -S.z); + + // Get the coordinates in the detector UV where the mid point of the voxel is projected. + float t=(DSO-DSD /*-DOD*/ - S.x)/vectX; + float y,z; + y=vectY*t+S.y; + z=vectZ*t+S.z; + float u,v; + u=y+geo.nDetecU/2.0f-0.5f; + v=z+geo.nDetecV/2.0f-0.5f; + + + + // Get Value in the computed (U,V) and multiply by the corresponding weight. + // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) +#if IS_FOR_MATLAB_TIGRE + voxelColumn[colIdx]+=tex3D(tex, v+0.5f, u+0.5f ,indAlpha+0.5f); +#else + voxelColumn[colIdx]+=tex3D(tex, u+0.5f, v+0.5f ,indAlpha+0.5f); +#endif + + } // END iterating through column of voxels + + } // END iterating through multiple projections + + // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. + // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. + // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is + // better for avoiding memory congestion. 
+ } // END copy updated voxels from local array to our 3D volume + +} // END kernelPixelBackprojectionFDK + + + + +//______________________________________________________________________________ +// +// Function: voxel_backprojection_parallel +// +// Description: Main host function for FDK backprojection (invokes the kernel) +//______________________________________________________________________________ + +int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) +{ + if (gpuids.GetLength() == 0) { + cudaSetDevice(0); + } else { + cudaSetDevice(gpuids[0]); + } + + /* + * Allocate texture memory on the device + */ + // copy data to CUDA memory + //If it is the first time, lets make sure our image is zeroed. + int nStreamDevice=2; + int nStreams=nStreamDevice; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + + for (int i = 0; i < nStreamDevice; ++i){ + cudaStreamCreate(&stream[i]); + + + } + //Pagelock memory for synchronous copy. + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + if (isHostRegisterSupported){ + cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); + + + // Allocate result image memory + size_t num_bytes = geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ * sizeof(float); + float* dimage; + cudaMalloc((void**)&dimage, num_bytes); + cudaMemset(dimage,0,num_bytes); + cudaCheckErrors("cudaMalloc fail"); + + + Point3D* projParamsArrayHostParallel; + cudaMallocHost((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); + float* projSinCosArrayHostParallel; + cudaMallocHost((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); + + + // Texture buffer objects + cudaTextureObject_t *texProj; + cudaArray **d_cuArrTex; + texProj =(cudaTextureObject_t*)malloc(2*sizeof(cudaTextureObject_t)); + d_cuArrTex =(cudaArray**)malloc(2*sizeof(cudaArray*)); + + + + unsigned int proj_split_overlap_number; + unsigned int split_projections=1; + // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop + // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution + // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, + // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. + unsigned long long proj_linear_idx_start; + unsigned int current_proj_split_size,current_proj_overlap_split_size; + size_t num_bytes_img_curr; + size_t img_linear_idx_start; + + + current_proj_split_size=nalpha; + // We are going to split it in the same amount of kernels we need to execute. 
+ proj_split_overlap_number=(current_proj_split_size+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL; + + + // Create pointer to pointers of projections and precompute their location and size. + + float ** partial_projection=(float**)malloc(current_proj_split_size*sizeof(float*)); + size_t * proj_split_size=(size_t*)malloc(current_proj_split_size*sizeof(size_t*)); + + for(unsigned int proj_block_split=0; proj_block_split=proj_split_size[proj_block_split]) + break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. + + if(currProjNumber_global>=nalpha) + break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. + + Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, /*offDetec,*/source; + float sinalpha,cosalpha; + + geo.alpha=-alphas[currProjNumber_global*3]; + geo.theta=-alphas[currProjNumber_global*3+1]; + geo.psi =-alphas[currProjNumber_global*3+2]; + + //sinalpha=sin(geo.alpha); +// cosalpha=cos(geo.alpha); + + projSinCosArrayHostParallel[3*j]=geo.DSD[currProjNumber_global]; // 3*j because we have 3 float (sin or cos angle) values per projection + projSinCosArrayHostParallel[3*j+1]=geo.DSO[currProjNumber_global]; + projSinCosArrayHostParallel[3*j+2]=geo.COR[currProjNumber_global]; + + //computeDeltasCubeParallel(geo,geo.alpha,currProjNumber,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); + computeDeltasCubeParallel(geo,currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); + + offOrig.x=geo.offOrigX[currProjNumber_global]; + offOrig.y=geo.offOrigY[currProjNumber_global]; + + + projParamsArrayHostParallel[6*j]=deltaX; // 6*j because we have 6 Point3D values per projection + projParamsArrayHostParallel[6*j+1]=deltaY; + projParamsArrayHostParallel[6*j+2]=deltaZ; + projParamsArrayHostParallel[6*j+3]=xyzOrigin; + projParamsArrayHostParallel[6*j+4]=offOrig; + projParamsArrayHostParallel[6*j+5]=source; + } // END 
for (preparing params for kernel call) + + // Copy the prepared parameter arrays to constant memory to make it available for the kernel + + cudaMemcpyToSymbolAsync(projSinCosArrayDevParallel, projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); + cudaMemcpyToSymbolAsync(projParamsArrayDevParallel, projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); + cudaStreamSynchronize(stream[0]); + + kernelPixelBackprojection_parallel<<>>(geo,dimage,i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)]); + } // END for + + ////////////////////////////////////////////////////////////////////////////////////// + // END Main reconstruction loop: go through projections (rotation angles) and backproject + ////////////////////////////////////////////////////////////////////////////////////// + } + cudaDeviceSynchronize(); + cudaMemcpy(result, dimage, num_bytes, cudaMemcpyDeviceToHost); + cudaCheckErrors("cudaMemcpy result fail"); + + free(partial_projection); + free(proj_split_size); + + bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; + for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) + if (!two_buffers_used && i==1) + break; + cudaDestroyTextureObject(texProj[i]); + cudaFreeArray(d_cuArrTex[i]); + } + free(texProj); + + free(d_cuArrTex); + cudaFreeHost(projSinCosArrayHostParallel); + cudaFreeHost(projParamsArrayHostParallel); + + cudaFree(dimage); + if (isHostRegisterSupported){ + cudaHostUnregister(projections); + } + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]); + +// cudaDeviceReset(); + return 0; + +} // END voxel_backprojection + +void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D *S) +{ + + Point3Ddouble P, Px,Py,Pz; + // Get coords of Img(0,0,0) + 
P.x=-(geo.sVoxelX/2-geo.dVoxelX/2)+geo.offOrigX[i]; + P.y=-(geo.sVoxelY/2-geo.dVoxelY/2)+geo.offOrigY[i]; + P.z=-(geo.sVoxelZ/2-geo.dVoxelZ/2)+geo.offOrigZ[i]; + + // Get coors from next voxel in each direction + Px.x=P.x+geo.dVoxelX; Py.x=P.x; Pz.x=P.x; + Px.y=P.y; Py.y=P.y+geo.dVoxelY; Pz.y=P.y; + Px.z=P.z; Py.z=P.z; Pz.z=P.z+geo.dVoxelZ; + + + + // Rotate image around X axis (this is equivalent of rotating the source and detector) RZ RY RZ + eulerZYZT(geo,&P); + eulerZYZT(geo,&Px); + eulerZYZT(geo,&Py); + eulerZYZT(geo,&Pz); + + //detector offset + P.z =P.z-geo.offDetecV[i]; P.y =P.y-geo.offDetecU[i]; + Px.z =Px.z-geo.offDetecV[i]; Px.y =Px.y-geo.offDetecU[i]; + Py.z =Py.z-geo.offDetecV[i]; Py.y =Py.y-geo.offDetecU[i]; + Pz.z =Pz.z-geo.offDetecV[i]; Pz.y =Pz.y-geo.offDetecU[i]; + + //Detector Roll pitch Yaw + // + // + // first, we need to offset everything so (0,0,0) is the center of the detector + // Only X is required for that + P.x=P.x+(geo.DSD[i]-geo.DSO[i]); + Px.x=Px.x+(geo.DSD[i]-geo.DSO[i]); + Py.x=Py.x+(geo.DSD[i]-geo.DSO[i]); + Pz.x=Pz.x+(geo.DSD[i]-geo.DSO[i]); + + rollPitchYawT(geo,i,&P); + rollPitchYawT(geo,i,&Px); + rollPitchYawT(geo,i,&Py); + rollPitchYawT(geo,i,&Pz); + + P.x=P.x-(geo.DSD[i]-geo.DSO[i]); + Px.x=Px.x-(geo.DSD[i]-geo.DSO[i]); + Py.x=Py.x-(geo.DSD[i]-geo.DSO[i]); + Pz.x=Pz.x-(geo.DSD[i]-geo.DSO[i]); + + + Point3Ddouble source; + source.x=0; + source.y=-geo.offDetecU[i]; + source.z=-geo.offDetecV[i]; + + rollPitchYawT(geo,i,&source); + source.x=source.x-(geo.DSD[i]-geo.DSO[i]); + + P.z =P.z /geo.dDetecV; P.y =P.y/geo.dDetecU; + Px.z=Px.z/geo.dDetecV; Px.y=Px.y/geo.dDetecU; + Py.z=Py.z/geo.dDetecV; Py.y=Py.y/geo.dDetecU; + Pz.z=Pz.z/geo.dDetecV; Pz.y=Pz.y/geo.dDetecU; + + source.z=source.z/geo.dDetecV; source.y=source.y/geo.dDetecU; + + // get deltas of the changes in voxels + deltaX->x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; + deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; + deltaZ->x=Pz.x-P.x; 
deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; + + + // cast the results from the double precision calculations back to float + *xyzorigin=P.to_float(); + *S=source.to_float(); + + +} // END computeDeltasCube +void CreateTextureParallel(float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool alloc) +{ + //cudaArray Descriptor +#if IS_FOR_MATLAB_TIGRE + const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); +#else + const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); +#endif + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + if (alloc){ + cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + cudaCheckErrors("Texture memory allocation fail"); + } + cudaMemcpy3DParms copyParams = {0}; + + + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[0]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params,stream[0+1]); + cudaCheckErrors("Texture memory data copy fail"); + //Array creation End + + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[0]; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + cudaCheckErrors("Texture object creation fail"); + +} \ No newline at end of file diff --git a/Common/CUDA/voxel_backprojection_parallel.hpp.prehip 
b/Common/CUDA/voxel_backprojection_parallel.hpp.prehip new file mode 100644 index 00000000..92b72023 --- /dev/null +++ b/Common/CUDA/voxel_backprojection_parallel.hpp.prehip @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * Header CUDA function for backrpojection for parallel beam + * + * + * CODE by Ander Biguri + * Optimized and modified by RB + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ +#include "types_TIGRE.hpp" +#include "GpuIds.hpp" + + +#ifndef BACKPROJECTION_PARALLEL_HPP +#define BACKPROJECTION_PARALLEL_HPP + +int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas,int nalpha, const GpuIds& gpuids); +void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D *S); +void createGeoArrayParallel(unsigned int image_splits, Geometry geo,Geometry* geoArray, unsigned int nangles); +// void computeDeltasCube(Geometry geo, float alpha,int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ); +#endif \ No newline at end of file diff --git a/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip b/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip new file mode 100644 index 00000000..650a9815 --- /dev/null +++ b/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip @@ -0,0 +1,21 @@ +#include +#include +#include +#include + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + if (nrhs != 0) { + mexErrMsgIdAndTxt("MATLAB:getGpuCount_mex", "No input requred."); + return; + } + if (nlhs != 1) { + mexErrMsgIdAndTxt("MATLAB:getGpuCount_mex", "Too many 
output arguments. Returns one integer."); + return; + } + int iCount = GetGpuCount(); + size_t dims[2] = {1,1}; + plhs[0] = mxCreateNumericArray(2, dims, mxUINT32_CLASS, mxREAL); + *((int*)mxGetData(plhs[0])) = iCount; +} diff --git a/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip b/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip new file mode 100644 index 00000000..c56ca29b --- /dev/null +++ b/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip @@ -0,0 +1,29 @@ +#include +#include + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + // Usage: name = getGpuName_mex(int iId) + if (nrhs != 1) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:invalidNumInputs", "One input required."); + return; + } else if(nlhs > 1) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:maxlhs", "Too many output arguments."); + return; + } + + int iId = 0; + if (mxIsDouble(prhs[0])) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:inputNotInt", "Input must be an integer."); + return; + } else { + iId = *((int*)mxGetData(prhs[0])); + } + int iCount = GetGpuCount(); + char* pcName = (char*)mxCalloc(128, sizeof(char)); + if (iId < iCount) { + GetGpuName(iId, pcName); + } + plhs[0] = mxCreateString(pcName); +} diff --git a/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip b/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip new file mode 100644 index 00000000..670c2d3e --- /dev/null +++ b/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip @@ -0,0 +1,28 @@ +#define _CRT_SECURE_NO_WARNINGS + +#include +#include + +// Purpose: To fast read .xim files +// Method: based on ReadXim.m by Fredrik Nordström 2015 +// Date: 2017.07 +// Author: Yi Du, yi.du@hotmail.com + +#ifndef STR_XIM +#define STR_XIM +//struct XimPara +typedef struct XimPara +{ + char FileName[256]; + int ImgWidth; // Image Width + int ImgHeight; // Image Height + int PixelNO; + + int BytesPerPixel; // Determine how to read the data + int Compression_Indicator; // Data number in Rec Image Matrix + + double GantryRtn; 
// Gantry rotation angle + int KVNormChamber; // KV norm chamber reading, date: 2022-05-23 +}XimPara; +#endif + diff --git a/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip b/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip new file mode 100644 index 00000000..453c4278 --- /dev/null +++ b/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip @@ -0,0 +1,357 @@ +#define _CRT_SECURE_NO_WARNINGS + +#include "io64.h" +#include +#include +#include +#include +#include +#include +//**** C data types are defined in tmwtypes.h +#include +#include "mex.h" +#include +#include "matrix.h" +#include "XimPara.hpp" + +#define GET_BIT(x,bit) ((x & (1 << bit)) >>bit) + +// Purpose: To fast read .xim files +// Method: based on ReadXim.m by Fredrik Nordström 2015 +// Date: 2017.07 +// Author: Yi Du, yi.du@hotmail.com + + +int cReadXim(char *XimFullFile, XimPara *XimStr, int *XimImg); + +void mexFunction( + int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + //check input variable + if (mxIsChar(prhs[0]) != 1) + mexErrMsgIdAndTxt( "MATLAB:revord:inputNotString", + "Input must be a string."); + + // .xim filename + char *filename; + filename = mxArrayToString(prhs[0]); + //mexPrintf("%s\n", filename); + + // file open + FILE *fid = fopen(filename, "rb"); + if(fid == NULL) + { + mexErrMsgIdAndTxt("%s fopen failed.\n", filename); + //getchar(); + //exit(1); + } + + // Parameter structure + XimPara *para = new XimPara[1]; + + // file pointer position + //fpos_t position = {0}; + + // Skip useless information + // 8 * sizeof(char) + sizeof(int32_t); + long int position = 8*sizeof(char) + sizeof(int32_T); + fseek ( fid , position , SEEK_SET ); +// setFilePos(fid, (fpos_t*) &position); + // Read ImgWidth & ImgHeight (int32) + fread(&(para->ImgWidth), sizeof(int32_T), 1, fid); + fread(&(para->ImgHeight), sizeof(int32_T), 1, fid); + fclose(fid); + + para->PixelNO = para->ImgWidth * para->ImgHeight; + + int *frame; + plhs[0] = mxCreateNumericMatrix(para->ImgWidth, 
para->ImgHeight, mxINT32_CLASS, mxREAL); + frame = (int*)mxGetPr(plhs[0]); + + // empty file return + if (para->PixelNO == 0) + { + plhs[1] = mxCreateDoubleScalar(10000); + mexPrintf("%s is an empty file\n", filename); + return; + } + + /******* Kernel Function *********/ + cReadXim(filename, para, frame); + + /**** KVSourceRtn is the only parameter-of-interest to return ****/ + // KVSourceRtn = GantryRtn + 90 deg; + double KVSourceRtn = para->GantryRtn + 90; + plhs[1] = mxCreateDoubleScalar(KVSourceRtn); + + double NormChamberReading = para->KVNormChamber * 1.0; + plhs[2] = mxCreateDoubleScalar(NormChamberReading); + +} + +/************* Kernel Funtion to read .xim ***************/ +// Kernel function +int cReadXim(char *XimFullFile, + XimPara *XimStr, + int *XimImg) +{ + // Read the .xim file name + +// char *ptr = strrchr(XimFullFile, '\\'); +// sprintf(XimStr->FileName, "%s", ptr + 1); + + // ****** Open .xim File Pointer ***********// + FILE *fid = fopen(XimFullFile, "rb"); + + // Syntax Parsing + if (fid == NULL) + { + mexErrMsgIdAndTxt("Error: file %s doesn't exist, at all\n", XimFullFile); + //getchar(); + //exit(1); + } + + // ******* Stage 1: Portal Image Data ****// + // Skip useless information + fseek(fid, 8 * sizeof(char) + sizeof(int32_T), SEEK_CUR); + + // Read ImgWidth & ImgHeight + fread(&(XimStr->ImgWidth), sizeof(int32_T), 1, fid); + fread(&(XimStr->ImgHeight), sizeof(int32_T), 1, fid); + XimStr->PixelNO = (XimStr->ImgWidth)*(XimStr->ImgHeight); + + // Skip the useless information: bits_per_pixel + fseek(fid, sizeof(int32_T), SEEK_CUR); + + // Load .xim file compression parameters + fread(&(XimStr->BytesPerPixel), sizeof(int32_T), 1, fid); + fread(&(XimStr->Compression_Indicator), sizeof(int32_T), 1, fid); + + // Load .xim Pixel Data + if (1 == XimStr->Compression_Indicator) + { + int LookUpTableSize = 0; + fread(&LookUpTableSize, sizeof(int), 1, fid); + + int *LookUpTable = new int[XimStr->ImgHeight * XimStr->ImgWidth]; + memset(LookUpTable, 0, 
XimStr->ImgHeight * XimStr->ImgWidth * sizeof(int)); + + // Load the LookUpTable data + for (int ii = 0; ii < LookUpTableSize; ii++) + { + // Load in the 8-bit date + // Updated: 2021-11-05, Yi Du + uint8_T tmp =0; + fread(&tmp, 1, 1, fid); + int Bit2[4] = { 0 }; + Bit2[0] = GET_BIT(tmp,0) + GET_BIT(tmp,1) *2; + Bit2[1] = GET_BIT(tmp,2) + GET_BIT(tmp,3) *2; + Bit2[2] = GET_BIT(tmp,4) + GET_BIT(tmp,5) *2; + Bit2[3] = GET_BIT(tmp,6) + GET_BIT(tmp,7) *2; + + // extract the lookup_table data + for (int jj = 0; jj < 4; jj++) + { + LookUpTable[ii * 4 + jj] = Bit2[jj]; + } + + /** Old Code with bug + int Bit2[4] = { 0 }; + + // extract the lookup_table data + for (int jj = 0; jj < 8; jj = jj +2) + { + Bit2[jj/2] = ((tmp & 1 << jj) != 0); + // It's 4, because 1 unsigned __int8 in tmp is represented by 4 ints in LookUpTable. + LookUpTable[ii * 4 + jj / 2] = Bit2[jj / 2]; + + //printf("Index = %d, LookUpTable = %d\n", ii * 4 + jj / 2, LookUpTable[ii * 4 + jj / 2]); + } + **/ + } + + // Skip compressed_pixel_buffer_size: passed + fseek(fid, sizeof(int32_T), SEEK_CUR); + + // Allocate memory for XimImg + fread(XimImg, sizeof(int32_T), (XimStr->ImgWidth) + 1, fid); + + // load the compressed pixel data + int delta = 0; + int LUT_Pos = 0; + + // Be very careful with all data types!!! 
+ int8_T tmp8 = 0; + int16_T tmp16 = 0; + int32_T tmp32 = 0; + + for (int ImgTag = XimStr->ImgWidth + 1; + ImgTag < (XimStr->ImgHeight) * (XimStr->ImgWidth); + ImgTag++) + { + if (0 == LookUpTable[LUT_Pos]) + { + fread(&tmp8, sizeof(int8_T), 1, fid); + delta = int(tmp8); + } + else if (1 == LookUpTable[LUT_Pos]) + { + fread(&tmp16, sizeof(int16_T), 1, fid); + delta = int(tmp16); + } + else + { + fread(&tmp32, sizeof(int32_T), 1, fid); + delta = int(tmp32); + } + + XimImg[ImgTag] = delta + XimImg[ImgTag - 1] + + XimImg[ImgTag - XimStr->ImgWidth] + - XimImg[ImgTag - XimStr->ImgWidth - 1]; + + LUT_Pos = LUT_Pos + 1; + } + + // Skip uncompressed_pixel_buffer_size + fseek(fid, sizeof(int32_T), SEEK_CUR); + + } + else + { + // Be careful: the code block for uncompressed pixel data readout hasn't been tested yet. + // Date: 2017-09-12 + int BufferSize = 0; + fread(&BufferSize, sizeof(int), 1, fid); + + switch (XimStr->BytesPerPixel) + { + case 1: + { + uint8_t *buffer8 = new uint8_t[XimStr->ImgWidth * XimStr->ImgHeight]; + memset(buffer8, 0, sizeof(uint8_t)* XimStr->ImgWidth * XimStr->ImgHeight); + fread(buffer8, sizeof(uint8_t), BufferSize, fid); + for (int ii = 0; ii < XimStr->ImgWidth * XimStr->ImgHeight;ii++) + { + XimImg[ii] = int(buffer8[ii]); + } + break; + } + case 2: + { + uint16_t *buffer16 = new uint16_t[XimStr->ImgWidth * XimStr->ImgHeight]; + memset(buffer16, 0, sizeof(uint16_t)* XimStr->ImgWidth * XimStr->ImgHeight); + fread(buffer16, sizeof(uint16_t), BufferSize / 2, fid); + for (int ii = 0; ii < XimStr->ImgWidth * XimStr->ImgHeight; ii++) + { + XimImg[ii] = int(buffer16[ii]); + } + break; + } + default: + { + fread(XimImg, sizeof(int), BufferSize / 4, fid); + break; + } + } + } + + + // ******* Stage 2: load the gantry angle from the residual property data ****// + // Skip histogram + int tmp = 0; + fread(&tmp, sizeof(int), 1, fid); + if (tmp > 0) + { + fseek(fid, tmp* sizeof(int), SEEK_CUR); + } + + // Decode .xim properties + int nProperties = 0; + 
fread(&nProperties, sizeof(int), 1, fid); + // Property structure is not NULL + if (nProperties > 0) + { + int pName_len = 0; + // Only load the property name rather than the content + char pName[128] = { 0 }; + int pType = 0; + for (int ii = 0; ii < nProperties; ii++) + { + // load property name length + fread(&pName_len, sizeof(int), 1, fid); + // load property name + fread(pName, sizeof(char)* pName_len, 1, fid); + // load property data type + fread(&pType, sizeof(int), 1, fid); + + //printf("%s\n", pName); + + // extract the Gantry Rotation Angle + if (!strcmp(pName, "GantryRtn")) + { + fread(&(XimStr->GantryRtn), sizeof(double), 1, fid); +// continue; + } + else if(!strcmp(pName, "KVNormChamber")) + { + //printf("KVNormChamber"); + fread(&(XimStr->KVNormChamber), sizeof(int), 1, fid); + break; + } + else + { + switch (pType) + { + case 0: + { + fseek(fid, sizeof(int), SEEK_CUR); + break; + } + case 1: + { + fseek(fid, sizeof(double), SEEK_CUR); + break; + } + case 2: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(char) * skiplen, SEEK_CUR); + break; + } + case 4: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(double) * skiplen /8, SEEK_CUR); + break; + } + case 5: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(int) * skiplen /4, SEEK_CUR); + break; + } + break; + } + } + // reset all the temporary variables + pName_len = 0; + memset(pName, 0, 128*sizeof(char)); + pType = 0; + } + + } + + // ********* END of XIM Reading: Close the File Pointer******* // + if (fclose(fid)) + { + printf("The file `crt_fopen.c' was not closed\n"); + getchar(); + exit(1); + } + + return 1; +} diff --git a/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip b/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip new file mode 100644 index 00000000..e38db7d9 --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip @@ -0,0 +1,126 @@ 
+/*------------------------------------------------------------------------- + * + * MATLAB MEX functions for Random Number Generator. Check inputs and parses + * MATLAB data to C++ data. + * + * + * CODE by Tomoyuki SADAKANE + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include +/** + * MEX gateway + * AddNoise(Im, mu, sigma, "gpuids", gpuids); + * poissrnd(Im)+randn(size(Im)).*sigma + mu; + */ + +void mexFunction(int nlhs, mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + size_t uiLen = 0; + float fGaussMu = 0; + float fGaussSigma = 0; + + GpuIds gpuids; + if (nrhs==5) { + size_t iM = mxGetM(prhs[4]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:RNG:unknown","5th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[4]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "CBCT:MEX:RNG:unknown","5th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[4]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } else { + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs < 3) { + mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "At least three input argumet required."); + } else if (nrhs==3 || nrhs==5){ + size_t mrows = mxGetM(prhs[1]); + size_t ncols = mxGetN(prhs[1]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "2nd parameter should be 1x1"); + } + mrows = mxGetM(prhs[2]); + ncols = mxGetN(prhs[2]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "3rd parameter should be 1x1"); + } + fGaussMu = (float)mxGetScalar(prhs[1]); + fGaussSigma = (float)mxGetScalar(prhs[2]); + } else if (nrhs>4) { + mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "Too many input arguments"); + } + /////////////// First input argumet. + // First input should be an array, whose elements are lambda. 
+ mxArray const * const image = prhs[0]; + float* pfLambdas = static_cast(mxGetData(image)); + mwSize const numDims = mxGetNumberOfDimensions(image); // get dim of image + const mwSize *size_img= mxGetDimensions(image); //get size of image + uiLen = size_img[0]; // calculate the total length + for (int iI = 1; iI < numDims; ++iI) { + uiLen *= size_img[iI]; + } + ////////////// + //prepare outputs + // Allocte output image + plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); + float *imgout =(float*) mxGetPr(plhs[0]); + // call CUDA rng + poisson_gaussian_1d(pfLambdas, uiLen, fGaussMu, fGaussSigma, imgout, gpuids); +} diff --git a/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip new file mode 100644 index 00000000..da78bfce --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip @@ -0,0 +1,367 @@ + +/*------------------------------------------------------------------------- + * + * MATLAB MEX gateway for backprojection + * + * This file gets the data from MATLAB, checks it for errors and then + * parses it to C and calls the relevant C/CUDA functions. + * + * CODE by Ander Biguri + * + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + + + +/** + * MEX gateway + * + * This function takes data from MATLAB and passes it to the MEX code. + * It checks and casts the inputs and prepares teh outputs for MATLAB. + * + * + */ + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]){ + + //Check amount of inputs + if (nrhs != 5) { + mexErrMsgIdAndTxt("CBCT:MEX:Atb:InvalidInput", "Wrong number of inputs provided"); + } + //////////////////////////// + // 5th argument is array of GPU-IDs. 
+ GpuIds gpuids; + { + size_t iM = mxGetM(prhs[4]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:Atb:unknown","5th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[4]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "CBCT:MEX:Atb:unknown","5th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[4]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } + + /* + ** 4th argument is matched or un matched. + */ + bool pseudo_matched=false; // Caled krylov, because I designed it for krylov case.... + /* copy the string data from prhs[0] into a C string input_ buf. */ + char *krylov = mxArrayToString(prhs[3]); + if (!strcmp(krylov,"matched")) // if its 0, they are the same + pseudo_matched=true; + + /* + ** Third argument: angle of projection. + */ + size_t mrows,nangles; + + mrows = mxGetM(prhs[2]); + nangles = mxGetN(prhs[2]); + + + mxArray const * const ptrangles=prhs[2]; + + + double const * const anglesM= static_cast(mxGetData(ptrangles)); + // just copy paste the data to a float array + float * angles= (float*)malloc(nangles*mrows*sizeof(float)); + for (int i=0;i1) && !(numDims==2 && nangles==1) ){ + mexErrMsgIdAndTxt("CBCT:MEX:Atb:InvalidInput", "Projection data is not the right size"); + } + if( !mxIsSingle(prhs[0])) { + mexErrMsgIdAndTxt("CBCT:MEX:Ax:InvalidInput", + "Input image must be a single noncomplex array."); + } + // Now that input is ok, parse it to C data types. + // NOTE: while Number of dimensions is the size of the matrix in Matlab, the data is 1D row-wise mayor. 
+ + // We need a float image, and, unfortunately, the only way of casting it is by value +// const mwSize *size_proj= mxGetDimensions(image); //get size of image +// mrows = mxGetM(image); +// nangles = mxGetN(image); +// size_t size_proj2; +// if (nangles==1) +// size_proj2=1; +// else +// size_proj2=size_proj[2]; + + + float * projections= static_cast(mxGetData(image)); + + + + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /** + * Second input: Geometry structure + */ + mxArray * geometryMex=(mxArray*)prhs[1]; + + // IMPORTANT-> Make sure Matlab creates the struct in this order. + const char *fieldnames[14]; + fieldnames[0] = "nVoxel"; + fieldnames[1] = "sVoxel"; + fieldnames[2] = "dVoxel"; + fieldnames[3] = "nDetector"; + fieldnames[4] = "sDetector"; + fieldnames[5] = "dDetector"; + fieldnames[6] = "DSD"; + fieldnames[7] = "DSO"; + fieldnames[8] = "offOrigin"; + fieldnames[9] = "offDetector"; + fieldnames[10]= "accuracy"; + fieldnames[11]= "mode"; + fieldnames[12]= "COR"; + fieldnames[13]= "rotDetector"; + // Make sure input is structure + + mxArray *tmp; + + // Now we know that all the input struct is good! Parse it from mxArrays to + // C structures that MEX can understand. 
+ + double * nVoxel, *nDetec; //we need to cast these to int + double * sVoxel, *dVoxel,*sDetec,*dDetec, *DSO, *DSD,*offOrig,*offDetec; + double *acc, *COR,*rotDetector; + const char* mode; + bool coneBeam=true; + Geometry geo; + int c; + geo.unitX=1;geo.unitY=1;geo.unitZ=1; + for(int ifield=0; ifield<14; ifield++) { + tmp=mxGetField(geometryMex,0,fieldnames[ifield]); + if(tmp==NULL){ + //tofix + continue; + } + switch(ifield){ + case 0: + nVoxel=(double *)mxGetData(tmp); + // copy data to MEX memory + geo.nVoxelX=(int)nVoxel[0]; + geo.nVoxelY=(int)nVoxel[1]; + geo.nVoxelZ=(int)nVoxel[2]; + break; + case 1: + sVoxel=(double *)mxGetData(tmp); + geo.sVoxelX=(float)sVoxel[0]; + geo.sVoxelY=(float)sVoxel[1]; + geo.sVoxelZ=(float)sVoxel[2]; + break; + case 2: + dVoxel=(double *)mxGetData(tmp); + geo.dVoxelX=(float)dVoxel[0]; + geo.dVoxelY=(float)dVoxel[1]; + geo.dVoxelZ=(float)dVoxel[2]; + break; + case 3: + nDetec=(double *)mxGetData(tmp); + geo.nDetecU=(int)nDetec[0]; + geo.nDetecV=(int)nDetec[1]; + break; + case 4: + sDetec=(double *)mxGetData(tmp); + geo.sDetecU=(float)sDetec[0]; + geo.sDetecV=(float)sDetec[1]; + break; + case 5: + dDetec=(double *)mxGetData(tmp); + geo.dDetecU=(float)dDetec[0]; + geo.dDetecV=(float)dDetec[1]; + break; + case 6: + geo.DSD=(float*)malloc(nangles * sizeof(float)); + DSD=(double *)mxGetData(tmp); + for (int i=0;i +#include +#include +#include +#include +#include +#include +#include +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ +///////// First check if the amount of inputs is right. 
+ int maxIter; + float alpha; + GpuIds gpuids; + if (nrhs==5) { + size_t iM = mxGetM(prhs[4]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[4]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[4]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } else { + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs==1){ + maxIter=100; + alpha=15.0f; + } else if (nrhs==2){ + mexErrMsgIdAndTxt("err", "Only 1 POCS hyperparameter inputted"); + } else if (nrhs==4 || nrhs==5){ + size_t mrows = mxGetM(prhs[1]); + size_t ncols = mxGetN(prhs[1]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1"); + } + mrows = mxGetM(prhs[2]); + ncols = mxGetN(prhs[2]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1"); + } + alpha= (float)(mxGetScalar(prhs[1])); + maxIter=(int)floor(mxGetScalar(prhs[2])+0.5); + } else { + mexErrMsgIdAndTxt("err", "Too many input arguments"); + } + float delta=(float)(mxGetScalar(prhs[3])); +////////////////////////// First input. + // First input should be x from (Ax=b), or the image. + mxArray const * const image = prhs[0]; + mwSize const numDims = mxGetNumberOfDimensions(image); + mwSize third_dim = 1; + + // Now that input is ok, parse it to C data types. 
+ float * img = static_cast(mxGetData(image)); + const mwSize *size_img= mxGetDimensions(image); //get size of image + + // Image should be dim 3 + if (numDims==3){ + third_dim = size_img[2]; + } + + // Allocte output image + plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); + float *imgout =(float*) mxGetPr(plhs[0]); + // call C function with the CUDA denoising + + const long imageSize[3]={size_img[0], size_img[1], third_dim }; + + aw_pocs_tv(img,imgout, alpha, imageSize, maxIter, delta, gpuids); + + //prepareotputs +} diff --git a/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip new file mode 100644 index 00000000..3c6f3670 --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip @@ -0,0 +1,338 @@ +/*------------------------------------------------------------------------- + * + * MATLAB MEX gateway for projection + * + * This file gets the data from MATLAB, checks it for errors and then + * parses it to C and calls the relevant C/CUDA functions. + * + * CODE by Ander Biguri + * + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * MEX gateway + */ + + + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ +// clock_t begin, end; +// begin = clock(); + + + //Check amount of inputs + if (nrhs != 5) { + mexErrMsgIdAndTxt("CBCT:MEX:Ax:InvalidInput", "Invalid number of inputs to MEX file."); + } + //////////////////////////// + // 5th argument is array of GPU-IDs. 
+ GpuIds gpuids; + { + size_t iM = mxGetM(prhs[4]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","5th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[4]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","5th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[4]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } + //////////////////////////// + // 4th argument is interpolated or ray-voxel/Siddon + bool rayvoxel=false; + if ( mxIsChar(prhs[3]) != 1) + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:InvalidInput","4rd input should be a string"); + + /* copy the string data from prhs[0] into a C string input_ buf. */ + char *krylov = mxArrayToString(prhs[3]); + if (strcmp(krylov,"interpolated") && strcmp(krylov,"Siddon") && strcmp(krylov,"ray-voxel")) + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:InvalidInput","4rd input should be either 'interpolated' or 'Siddon'"); + else + // If its not ray-voxel, its "interpolated" + if (strcmp(krylov,"Siddon") == 0 || strcmp(krylov,"ray-voxel") == 0) //strcmp returs 0 if they are equal + rayvoxel=true; + ///////////////////////// 3rd argument: angle of projection. + + size_t mrows = mxGetM(prhs[2]); + size_t nangles = mxGetN(prhs[2]); + + mxArray const * const ptrangles=prhs[2]; + + + double const * const anglesM= static_cast(mxGetData(ptrangles)); + // just copy paste the data to a float array + float * angles= (float*)malloc(nangles*mrows*sizeof(float)); + for (int i=0;i(mxGetData(image)); + // We need a float image, and, unfortunately, the only way of casting it is by value + const mwSize *size_img= mxGetDimensions(image); //get size of image + + + + ///////////////////// Second input argument, + // Geometry structure that has all the needed geometric data. + + + mxArray * geometryMex=(mxArray*)prhs[1]; + + // IMPORTANT-> Make sure Matlab creates the struct in this order. 
+ const char *fieldnames[14]; + fieldnames[0] = "nVoxel"; + fieldnames[1] = "sVoxel"; + fieldnames[2] = "dVoxel"; + fieldnames[3] = "nDetector"; + fieldnames[4] = "sDetector"; + fieldnames[5] = "dDetector"; + fieldnames[6] = "DSD"; + fieldnames[7] = "DSO"; + fieldnames[8] = "offOrigin"; + fieldnames[9] = "offDetector"; + fieldnames[10]= "accuracy"; + fieldnames[11]= "mode"; + fieldnames[12]= "COR"; + fieldnames[13]= "rotDetector"; + + // Now we know that all the input struct is good! Parse it from mxArrays to + // C structures that MEX can understand. + double * nVoxel, *nDetec; //we need to cast these to int + double * sVoxel, *dVoxel,*sDetec,*dDetec, *DSO, *DSD; + double *offOrig,*offDetec,*rotDetector; + double * acc, *COR; + const char* mode; + int c; + mxArray *tmp; + Geometry geo; + geo.unitX=1;geo.unitY=1;geo.unitZ=1; + bool coneBeam=true; +// mexPrintf("%d \n",nfields); + for(int ifield=0; ifield<14; ifield++) { + tmp=mxGetField(geometryMex,0,fieldnames[ifield]); + if(tmp==NULL){ + //tofix + continue; + } + switch(ifield){ + case 0: + nVoxel=(double *)mxGetData(tmp); + // copy data to MEX memory + geo.nVoxelX=(int)nVoxel[0]; + geo.nVoxelY=(int)nVoxel[1]; + geo.nVoxelZ=(int)nVoxel[2]; + break; + case 1: + sVoxel=(double *)mxGetData(tmp); + geo.sVoxelX=(float)sVoxel[0]; + geo.sVoxelY=(float)sVoxel[1]; + geo.sVoxelZ=(float)sVoxel[2]; + break; + case 2: + dVoxel=(double *)mxGetData(tmp); + geo.dVoxelX=(float)dVoxel[0]; + geo.dVoxelY=(float)dVoxel[1]; + geo.dVoxelZ=(float)dVoxel[2]; + break; + case 3: + nDetec=(double *)mxGetData(tmp); + geo.nDetecU=(int)nDetec[0]; + geo.nDetecV=(int)nDetec[1]; + break; + case 4: + sDetec=(double *)mxGetData(tmp); + geo.sDetecU=(float)sDetec[0]; + geo.sDetecV=(float)sDetec[1]; + break; + case 5: + dDetec=(double *)mxGetData(tmp); + geo.dDetecU=(float)dDetec[0]; + geo.dDetecV=(float)dDetec[1]; + break; + case 6: + geo.DSD=(float*)malloc(nangles * sizeof(float)); + DSD=(double *)mxGetData(tmp); + for (int i=0;i +#include +#include 
+#include +#include +#include +#include +#include +// #include +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ +///////// First check if the amount of imputs is rigth. + int maxIter; + float alpha; + float ratio; + GpuIds gpuids; + if (nrhs<5) + mexErrMsgIdAndTxt("TIGRE:minPICCS", "At least 2 inputs needed: Image and prior image"); + if (nrhs>6){ + mexErrMsgIdAndTxt("TIGRE:minPICCS", "Too many imput argumets"); + } + if (nrhs==6){ + size_t mrows = mxGetM(prhs[2]); + size_t ncols = mxGetN(prhs[2]); + if (mrows!=1 || ncols !=1) + mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); + mrows = mxGetM(prhs[3]); + ncols = mxGetN(prhs[3]); + if (mrows!=1 || ncols !=1) + mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); + mrows = mxGetM(prhs[4]); + ncols = mxGetN(prhs[4]); + if (mrows!=1 || ncols !=1) + mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); + alpha= (float)(mxGetScalar(prhs[2])); + maxIter=(int)floor(mxGetScalar(prhs[3])+0.5); + ratio= (float)(mxGetScalar(prhs[4])); + + size_t uiGpuCount = mxGetN(prhs[5]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "TIGRE:minPICCS","6th parameter must be a row vector"); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[5]); + gpuids.SetIds(uiGpuCount, piGpuIds); + }else{ + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs==2){ + maxIter=100; + alpha=15.0f; + ratio=0.5; + } + + +////////////////////////// First input. + // First input should be x from (Ax=b), or the image. 
+ mxArray const * const image = prhs[0]; + mwSize const numDims = mxGetNumberOfDimensions(image); + if (numDims!=3){ + mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image is not 3D"); + } + mxArray const * const prior_mex = prhs[1]; + mwSize const numDims_prior = mxGetNumberOfDimensions(image); + if (numDims_prior!=3){ + mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image is not 3D"); + } + if(numDims_prior!=numDims) + mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image and prior are not the same size"); + // Image should be dim 3 + + // Now that input is ok, parse it to C data types. + float const * const img = static_cast(mxGetData(image)); + float const * const prior = static_cast(mxGetData(prior_mex)); + const mwSize *size_img= mxGetDimensions(image); //get size of image + + + // Allocte output image + const long imageSize[3]={size_img[0] ,size_img[1],size_img[2] }; + plhs[0] = mxCreateNumericArray(3,size_img, mxSINGLE_CLASS, mxREAL); + float *imgout =(float*) mxGetPr(plhs[0]); + + + piccs_tv(img,prior,imgout, alpha,ratio, imageSize, maxIter,gpuids); + + + +} \ No newline at end of file diff --git a/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip b/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip new file mode 100644 index 00000000..da60446c --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip @@ -0,0 +1,132 @@ +/* +/*------------------------------------------------------------------------- + * + * MATLAB MEX gateway for Total variation minimization via Steepest descend + * + * This file gets the data from MATLAB, checks it for errors and then + * parses it to C and calls the relevant C/CUDA functions. + * + * CODE by Ander Biguri + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + + +#include +#include +#include +#include +#include +#include +#include +#include +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ +///////// First check if the amount of inputs is right. 
+ int maxIter; + float alpha; + GpuIds gpuids; + if (nrhs==4) { + size_t iM = mxGetM(prhs[3]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[3]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "TIGRE:minTV","4th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[3]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } else { + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs==1){ + maxIter=100; + alpha=15.0f; + } else if (nrhs==2){ + mexErrMsgIdAndTxt("minTV:mex", "Only 1 POCS hyperparameter inputted"); + } else if (nrhs==3 || nrhs==4){ + size_t mrows = mxGetM(prhs[1]); + size_t ncols = mxGetN(prhs[1]); + if (mrows!=1 || ncols !=1) + mexErrMsgIdAndTxt("minTV:mex", "POCS parameters should be 1x1"); + mrows = mxGetM(prhs[2]); + ncols = mxGetN(prhs[2]); + if (mrows!=1 || ncols !=1) + mexErrMsgIdAndTxt("minTV:mex", "POCS parameters should be 1x1"); + alpha= (float)(mxGetScalar(prhs[1])); + maxIter=(int)floor(mxGetScalar(prhs[2])+0.5); + } else { + mexErrMsgIdAndTxt("minTV:mex", "Too many input arguments"); + } + +////////////////////////// First input. + // First input should be x from (Ax=b), or the image. + mxArray const * const image = prhs[0]; + mwSize const numDims = mxGetNumberOfDimensions(image); + mwSize third_dim = 1; + + + // Now that input is ok, parse it to C data types. 
+ float * img = static_cast(mxGetData(image)); + const mwSize *size_img = mxGetDimensions(image); //get size of image + + // Image should be dim 3 + if (numDims==3){ + third_dim = size_img[2]; + } + + // Allocte output image + const long imageSize[3]={size_img[0] ,size_img[1], third_dim }; + plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); + float *imgout =(float*) mxGetPr(plhs[0]); + + pocs_tv(img,imgout, alpha, imageSize, maxIter, gpuids); +} diff --git a/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip new file mode 100644 index 00000000..1142a5f7 --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip @@ -0,0 +1,124 @@ +/*-------------------------------------------------------------------------- +-------------------------------------------------------------------------- + This file is part of the TIGRE Toolbox + + Copyright (c) 2015, University of Bath and + CERN-European Organization for Nuclear Research + All rights reserved. + + License: Open Source under BSD. + See the full license at + https://github.com/CERN/TIGRE/blob/master/LICENSE + + Contact: tigre.toolbox@gmail.com + Codes: https://github.com/CERN/TIGRE/ + Coded by: Stefanie Kaser, Benjamin Kirchmayer +--------------------------------------------------------------------------*/ + +#include "mex.h" +#include "CUDA/improvedForwardProjections.hpp" +#include +#include +#include + + +void mexFunction(int nlhs, mxArray *plhs[], int nrhs,const mxArray *prhs[]){ + + if (nrhs =! 
7){ + mexErrMsgIdAndTxt("CS Projections:", "Check Number of Input arguments!"); + } + + float *posIn, *posOut, *dirIn, *dirOut; + float *Wepl, *pixelSize, *detectorDistanceIn, *detectorDistanceOut, *initEnergy; + + //Load parameters + posIn = (float *)(mxGetPr(prhs[0])); + posOut = (float *)mxGetPr(prhs[1]); + dirIn = (float *)mxGetPr(prhs[2]); + dirOut = (float *)mxGetPr(prhs[3]); + Wepl = (float*) mxGetPr(prhs[4]); + initEnergy = (float*) mxGetPr(prhs[5]); + + //Get Number of Protons contained in the root files + int numOfProtons = (int) mxGetM(prhs[4]); + + mxArray * geometryMex=(mxArray*)prhs[6]; + + const char *fieldnames_geo[7]; + fieldnames_geo[0] = "dDetector"; + fieldnames_geo[1] = "DSD"; + fieldnames_geo[2] = "DSID"; + fieldnames_geo[3] = "DSO"; + fieldnames_geo[4] = "hull"; + fieldnames_geo[5] = "sDetector"; + fieldnames_geo[6] = "mode"; + + double * pix0, *dsd0, *dsid0, *hull0, *det0, *dso0; + float pix[2], dsd, dsid, dso, hull[4], det[2]; + const char* mode; + bool coneBeam = true; + mxArray *tmp; + for (int ifield=0; ifield<7; ifield++){ + tmp=mxGetField(geometryMex,0,fieldnames_geo[ifield]); + switch(ifield){ + case 0: + pix0 =(double *)mxGetData(tmp); + pix[0] = (float)pix0[0]; + pix[1] = (float)pix0[1]; + break; + case 1: + dsd0 =(double *)mxGetData(tmp); + dsd = (float)dsd0[0]; + break; + case 2: + dsid0 =(double *)mxGetData(tmp); + dsid = (float)dsid0[0]; + break; + case 3: + dso0 =(double *)mxGetData(tmp); + dso = (float)dso0[0]; + break; + case 4: + hull0 =(double *)mxGetData(tmp); + hull[0] = (float)hull0[0]; + hull[1] = (float)hull0[1]; + hull[2] = (float)hull0[2]; + hull[3] = (float)hull0[3]; + break; + case 5: + det0 =(double *)mxGetData(tmp); + det[0] = (float)det0[0]; + det[1] = (float)det0[1]; + break; + case 6: + mode=""; + mode=mxArrayToString(tmp); + if (!strcmp(mode,"parallel")) + coneBeam=false; + break; + } + } + + + if (hull[3] == 0){std::cout << "Info: Calculation of optimized proton radiographies will be performed without object 
hull!" << std::endl;} + + if (hull[2] > 6.28318530717958648){std::cout << "Info: Hull rotation angle exceeds 2 Pi. Please check the input! Continuing with calculation..." << std::endl;} + + mwSize outSize[2]; + outSize[0] = int(det[1]/pix[1]); + outSize[1] = int(det[0]/pix[0]); + plhs[0] = mxCreateNumericArray(2, outSize, mxSINGLE_CLASS, mxREAL); + float *outProjections = (float*)mxGetPr(plhs[0]); + + //For Calculation 2 historgrams are needed + // + if(coneBeam == false){ + std::cout << "Info: Parallel geometry selected..." << std::endl; + ParticleProjections(outProjections, posIn, posOut, dirIn, dirOut, Wepl, numOfProtons, int(det[0]/pix[0]), int(det[1]/pix[1]), pix, dsid-dso, dsd-dso, *initEnergy, hull); + } + else{ + std::cout << "Info: Cone beam geometry selected..." << std::endl; + ParticleProjectionsCone(outProjections, posIn, posOut, dirIn, dirOut, Wepl, numOfProtons, int(det[0]/pix[0]), int(det[1]/pix[1]), pix, dsid-dso, dsd-dso, -1*dso, *initEnergy, hull); + } + +} diff --git a/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip b/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip new file mode 100644 index 00000000..f905bcbd --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip @@ -0,0 +1,147 @@ +/*------------------------------------------------------------------------- + * + * MATLAB MEX functions for TV image denoising. Check inputs and parses + * MATLAB data to C++ data. + * + * + * CODE by Ander Biguri + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + + + +#include +#include +#include +#include +#include +#include +#include +#include +/** + * MEX gateway + */ +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + int maxIter; + float lambda; + GpuIds gpuids; + if (nrhs==4) { + size_t iM = mxGetM(prhs[3]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[3]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[3]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } else { + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs == 0) { + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "At least one input argumet required."); + } else if (nrhs==1){ + maxIter=100; + lambda=15.0f; + } else if (nrhs==2){ + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Only 1 TV hyperparameter inputted"); + } else if (nrhs==3 || nrhs==4){ + size_t mrows = mxGetM(prhs[1]); + size_t ncols = mxGetN(prhs[1]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "TV parameters should be 1x1"); + } + mrows = mxGetM(prhs[2]); + ncols = mxGetN(prhs[2]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "TV parameters should be 1x1"); + } + lambda= (float)(mxGetScalar(prhs[1])); + maxIter=(int)round(mxGetScalar(prhs[2])); + } else if (nrhs>4) { + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Too many input arguments"); + } + ////////////////////////// First input. 
+ // First input should be x from (Ax=b), or the image. + mxArray const * const image = prhs[0]; + mwSize const numDims = mxGetNumberOfDimensions(image); + + // Image should be dim 3 + if (numDims!=3){ + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Image is not 3D"); + } + // Now that input is ok, parse it to C data types. + float * img = static_cast(mxGetData(image)); + // We need a float image, and, unfortunately, the only way of casting it is by value + const mwSize *size_img= mxGetDimensions(image); //get size of image + + ////////////// + //prepareotputs + plhs[0] = mxCreateNumericArray(3,size_img, mxSINGLE_CLASS, mxREAL); + float *imgout =(float*) mxGetPr(plhs[0]); + // Allocte output image + // call C function with the CUDA denoising + const float spacing[3]={1,1,1}; + const long imageSize[3]={size_img[0] ,size_img[1],size_img[2] }; + + tvdenoising(img,imgout, lambda, spacing, imageSize, maxIter, gpuids); + + + +// memcpy(mxImgout,imgout,size_img[0] *size_img[1] *size_img[2]*sizeof(float)); + //free memory +// free(img); +// free(imgout); + + +} From 29e4f6a3b29860cf2a130081075870dfdbc3ebcb Mon Sep 17 00:00:00 2001 From: purepani Date: Wed, 19 Mar 2025 19:38:13 -0500 Subject: [PATCH 2/3] Remove prehip files --- Common/CUDA/GD_AwTV.cu.prehip | 713 ---------- Common/CUDA/GD_AwTV.hpp.prehip | 62 - Common/CUDA/GD_TV.cu.prehip | 702 ---------- Common/CUDA/GD_TV.hpp.prehip | 61 - Common/CUDA/GpuIds.cpp.prehip | 70 - Common/CUDA/GpuIds.hpp.prehip | 17 - Common/CUDA/PICCS.cu.prehip | 398 ------ Common/CUDA/PICCS.hpp.prehip | 61 - Common/CUDA/RandomNumberGenerator.cu.prehip | 193 --- Common/CUDA/RandomNumberGenerator.hpp.prehip | 49 - Common/CUDA/Siddon_projection.cu.prehip | 859 ------------ Common/CUDA/Siddon_projection.hpp.prehip | 66 - .../CUDA/Siddon_projection_parallel.cu.prehip | 540 -------- .../Siddon_projection_parallel.hpp.prehip | 65 - Common/CUDA/TIGRE_common.cpp.prehip | 20 - Common/CUDA/TIGRE_common.hpp.prehip | 24 - Common/CUDA/errors.hpp.prehip | 10 - 
Common/CUDA/gpuUtils.cu.prehip | 70 - Common/CUDA/gpuUtils.hpp.prehip | 18 - .../CUDA/improvedForwardProjections.cu.prehip | 1032 -------------- .../improvedForwardProjections.hpp.prehip | 263 ---- .../improvedForwardProjections_cone.cu.prehip | 1230 ----------------- Common/CUDA/projection.cpp.prehip | 35 - Common/CUDA/projection.hpp.prehip | 9 - .../ray_interpolated_projection.cu.prehip | 843 ----------- .../ray_interpolated_projection.hpp.prehip | 66 - ...interpolated_projection_parallel.cu.prehip | 449 ------ ...nterpolated_projection_parallel.hpp.prehip | 65 - Common/CUDA/tv_proximal.cu.prehip | 693 ---------- Common/CUDA/tv_proximal.hpp.prehip | 57 - Common/CUDA/types_TIGRE.hpp.prehip | 109 -- Common/CUDA/voxel_backprojection.cu.prehip | 920 ------------ Common/CUDA/voxel_backprojection.hpp.prehip | 59 - Common/CUDA/voxel_backprojection2.cu.prehip | 844 ----------- Common/CUDA/voxel_backprojection2.hpp.prehip | 64 - .../voxel_backprojection_parallel.cu.prehip | 627 --------- .../voxel_backprojection_parallel.hpp.prehip | 57 - .../cuda_interface/AddNoise.cpp.prehip | 126 -- .../cuda_interface/Atb_mex.cpp.prehip | 367 ----- .../cuda_interface/AwminTV.cpp.prehip | 137 -- .../cuda_interface/Ax_mex.cpp.prehip | 338 ----- .../cuda_interface/minPICCS.cpp.prehip | 147 -- .../Utilities/cuda_interface/minTV.cpp.prehip | 132 -- .../pCTCubicSpline_mex.cpp.prehip | 124 -- .../cuda_interface/tvDenoise.cpp.prehip | 147 -- 45 files changed, 12938 deletions(-) delete mode 100644 Common/CUDA/GD_AwTV.cu.prehip delete mode 100644 Common/CUDA/GD_AwTV.hpp.prehip delete mode 100644 Common/CUDA/GD_TV.cu.prehip delete mode 100644 Common/CUDA/GD_TV.hpp.prehip delete mode 100644 Common/CUDA/GpuIds.cpp.prehip delete mode 100644 Common/CUDA/GpuIds.hpp.prehip delete mode 100644 Common/CUDA/PICCS.cu.prehip delete mode 100644 Common/CUDA/PICCS.hpp.prehip delete mode 100644 Common/CUDA/RandomNumberGenerator.cu.prehip delete mode 100644 Common/CUDA/RandomNumberGenerator.hpp.prehip delete mode 
100644 Common/CUDA/Siddon_projection.cu.prehip delete mode 100644 Common/CUDA/Siddon_projection.hpp.prehip delete mode 100644 Common/CUDA/Siddon_projection_parallel.cu.prehip delete mode 100644 Common/CUDA/Siddon_projection_parallel.hpp.prehip delete mode 100644 Common/CUDA/TIGRE_common.cpp.prehip delete mode 100644 Common/CUDA/TIGRE_common.hpp.prehip delete mode 100644 Common/CUDA/errors.hpp.prehip delete mode 100644 Common/CUDA/gpuUtils.cu.prehip delete mode 100644 Common/CUDA/gpuUtils.hpp.prehip delete mode 100644 Common/CUDA/improvedForwardProjections.cu.prehip delete mode 100644 Common/CUDA/improvedForwardProjections.hpp.prehip delete mode 100644 Common/CUDA/improvedForwardProjections_cone.cu.prehip delete mode 100644 Common/CUDA/projection.cpp.prehip delete mode 100644 Common/CUDA/projection.hpp.prehip delete mode 100644 Common/CUDA/ray_interpolated_projection.cu.prehip delete mode 100644 Common/CUDA/ray_interpolated_projection.hpp.prehip delete mode 100644 Common/CUDA/ray_interpolated_projection_parallel.cu.prehip delete mode 100644 Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip delete mode 100644 Common/CUDA/tv_proximal.cu.prehip delete mode 100644 Common/CUDA/tv_proximal.hpp.prehip delete mode 100644 Common/CUDA/types_TIGRE.hpp.prehip delete mode 100644 Common/CUDA/voxel_backprojection.cu.prehip delete mode 100644 Common/CUDA/voxel_backprojection.hpp.prehip delete mode 100644 Common/CUDA/voxel_backprojection2.cu.prehip delete mode 100644 Common/CUDA/voxel_backprojection2.hpp.prehip delete mode 100644 Common/CUDA/voxel_backprojection_parallel.cu.prehip delete mode 100644 Common/CUDA/voxel_backprojection_parallel.hpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/AwminTV.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip delete mode 100644 
MATLAB/Utilities/cuda_interface/minPICCS.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/minTV.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip diff --git a/Common/CUDA/GD_AwTV.cu.prehip b/Common/CUDA/GD_AwTV.cu.prehip deleted file mode 100644 index d98c13c1..00000000 --- a/Common/CUDA/GD_AwTV.cu.prehip +++ /dev/null @@ -1,713 +0,0 @@ -/*------------------------------------------------------------------------- - * - * CUDA functions for Steepest descend in POCS-type algorithms. - * - * This file will iteratively minimize by steepest descend the total variation - * of the input image, with the parameters given, using GPUs. - * - * CODE by Ander Biguri - * - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - - - - - - - -#define MAXTHREADS 1024 -#define MAX_BUFFER 60 - -#include "GD_AwTV.hpp" - - - - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("CBCT:CUDA:GD_TV",cudaGetErrorString(__err));\ - } \ -} while (0) - -// CUDA kernels -//https://stackoverflow.com/questions/21332040/simple-cuda-kernel-optimization/21340927#21340927 - __global__ void divideArrayScalar(float* vec,float scalar,const size_t n){ - unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x; - for(; i= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) - return; - - - float df[3] ={0.f,0.f,0.f}; - float dfi[3]={0.f,0.f,0.f}; // dfi== \partial f_{i+1,j,k} - float dfj[3]={0.f,0.f,0.f}; - float dfk[3]={0.f,0.f,0.f}; - gradient(f,df ,z ,y ,x , depth,rows,cols); - gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); - 
gradient(f,dfj ,z ,y+1,x , depth,rows,cols); - gradient(f,dfk ,z+1,y ,x , depth,rows,cols); - float eps=0.00000001; //% avoid division by zero - - float wx=__expf(-(df[0]/delta)*(df[0]/delta)); - float wy=__expf(-(df[1]/delta)*(df[1]/delta)); - float wz=__expf(-(df[2]/delta)*(df[2]/delta)); - - float wxi=__expf(-(dfi[0]/delta)*(dfi[0]/delta)); - float wyi=__expf(-(dfi[1]/delta)*(dfi[1]/delta)); - float wzi=__expf(-(dfi[2]/delta)*(dfi[2]/delta)); - - float wxj=__expf(-(dfj[0]/delta)*(dfj[0]/delta)); - float wyj=__expf(-(dfj[1]/delta)*(dfj[1]/delta)); - float wzj=__expf(-(dfj[2]/delta)*(dfj[2]/delta)); - - float wxk=__expf(-(dfk[0]/delta)*(dfk[0]/delta)); - float wyk=__expf(-(dfk[1]/delta)*(dfk[1]/delta)); - float wzk=__expf(-(dfk[2]/delta)*(dfk[2]/delta)); - - - // this hsould do the trick I think - - dftv[idx]=(wx*df[0]+wy*df[1]+wz*df[2])/(sqrt(wx*df[0] *df[0] +wy*df[1] *df[1] +wz*df[2] *df[2])+eps) - -wzi*dfi[2]/(sqrt(wxi*dfi[0]*dfi[0]+wyi*dfi[1]*dfi[1]+wzi*dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. 
- -wyj*dfj[1]/(sqrt(wxj*dfj[0]*dfj[0]+wyj*dfj[1]*dfj[1]+wzj*dfj[2]*dfj[2]) +eps) - -wxk*dfk[0]/(sqrt(wxk*dfk[0]*dfk[0]+wyk*dfk[1]*dfk[1]+wzk*dfk[2]*dfk[2]) +eps); - - - return; - - } - - __device__ void warpReduce(volatile float *sdata, size_t tid) { - sdata[tid] += sdata[tid + 32]; - sdata[tid] += sdata[tid + 16]; - sdata[tid] += sdata[tid + 8]; - sdata[tid] += sdata[tid + 4]; - sdata[tid] += sdata[tid + 2]; - sdata[tid] += sdata[tid + 1]; - } - - __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - float value=0; - while (i < n) { - value=g_idata[i]; //avoid reading twice - mySum += value*value; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if (tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - - __global__ void reduceSum(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - // float value=0; - while (i < n) { - mySum += g_idata[i]; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - 
sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if (tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - - - - -// main function -void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int maxIter,const float delta, const GpuIds& gpuids){ - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPUselect","There are no available device(s) that support CUDA\n"); - } - // - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); - } - int dev; - - // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. 
- // check free memory - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - - - - // %5 of free memory should be enough, we have almost no variables in these kernels - size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; - size_t mem_slice_image = sizeof(float)* image_size[0] * image_size[1] ; - size_t mem_size_image = sizeof(float)* total_pixels; - size_t mem_auxiliary = sizeof(float)* (total_pixels + MAXTHREADS - 1) / MAXTHREADS; - - // Decide how are we handling the distribution of computation - size_t mem_img_each_GPU; - - unsigned int buffer_length=2; - //Does everything fit in the GPU? - unsigned int slices_per_split; - - // if it is a thin problem (no need to split), just use one GPU - if (image_size[2]<4){deviceCount=1;} - - unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. - if(mem_GPU_global> 3*mem_size_image+3*(deviceCount-1)*mem_slice_image*buffer_length+mem_auxiliary) { - // We only need to split if we have extra GPUs - slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; - mem_img_each_GPU=mem_slice_image*((slices_per_split+buffer_length*2)); - }else{ - // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all - size_t mem_free=mem_GPU_global-mem_auxiliary; - - splits=(unsigned int)(ceil(((float)(3*mem_size_image)/(float)(deviceCount))/mem_free)); - // Now, there is an overhead here, as each splits should have 2 slices more, to account for overlap of images. - // lets make sure these 2 slices fit, if they do not, add 1 to splits. 
- slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - - // if the new stuff does not fit in the GPU, it means we are in the edge case where adding that extra slice will overflow memory - if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ - // one more split should do the job, as its an edge case. - splits++; - //recompute for later - slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - } - - - // How many EXTRA buffer slices should be able to fit in here??!?! - // Only do it if there are splits needed. - if(splits>1){ - mem_free=mem_GPU_global-(3*mem_img_each_GPU+mem_auxiliary); - unsigned int extra_buff=(mem_free/mem_slice_image); - buffer_length=(extra_buff/2)/3; // we need double whatever this results in, rounded down. - buffer_length=max(buffer_length,2);// minimum 2 - buffer_length=min(MAX_BUFFER,buffer_length); - - mem_img_each_GPU=mem_slice_image*(slices_per_split+buffer_length*2); - - }else{ - buffer_length=2; - } - - // Assert - if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ - mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPU","Assertion Failed. Logic behind splitting flawed! Please tell: ander.biguri@gmail.com\n"); - } - } - - - // Assert - - if ((slices_per_split+buffer_length*2)*image_size[0]*image_size[1]* sizeof(float)!= mem_img_each_GPU){ - mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPU","Assertion Failed. Memory needed calculation broken! 
Please tell: ander.biguri@gmail.com\n"); - } - - - - - - - float** d_image= (float**)malloc(deviceCount*sizeof(float*)); - float** d_dimgTV= (float**)malloc(deviceCount*sizeof(float*)); - float** d_norm2aux= (float**)malloc(deviceCount*sizeof(float*)); - float** d_norm2= (float**)malloc(deviceCount*sizeof(float*)); - - // allocate memory in each GPU - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - - cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); - cudaMemset( d_image[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); - cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); - cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); - cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); - cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); - cudaCheckErrors("Malloc error"); - - - } - unsigned long long buffer_pixels=buffer_length*image_size[0]*image_size[1]; - float* buffer; - if(splits>1){ - mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); - }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); - } - - - - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
- int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // splits>2 is completely empirical observation - if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); - - - - // Create streams - int nStream_device=2; - int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); - } - } - cudaCheckErrors("Stream creation fail"); - - - // For the reduction - - double totalsum_prev; - double totalsum; - float sum_curr_spl; - float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); - - unsigned int curr_slices; - unsigned long long curr_pixels; - size_t linear_idx_start; - unsigned long long* offset_device=(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - unsigned long long* offset_host =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - unsigned long long* bytes_device =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - bool is_first_chunk; - bool is_last_chunk; - for(unsigned int i=0;i1){ - totalsum_prev=0; - } - for(unsigned int sp=0;sp1 & i>0){ - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); - - - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - } - 
cudaCheckErrors("Memcpy failure on multi split"); - - for(unsigned int ib=0; (ib<(buffer_length-1)) && ((i+ib)>>(d_image[dev],d_dimgTV[dev],(long)(curr_slices+buffer_length*2-1), image_size[1],image_size[0],delta); - - } - - - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); - - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); - } - else { - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); - } - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Reduction error"); - - - // Accumulate the norm accross devices - sum_curr_spl=0; - // this is CPU code - for (dev = 0; dev < deviceCount; dev++){ - sum_curr_spl+=sumnorm2[dev]; - } - sum_curr_spl+=0.0000001f; // avoid division by zero - - // If we have more than one splits, lets use the result from prior calls - if(i>0 && splits>1){ - // this is already stored: - //totalsum=totalsum_prev; - }else{ - totalsum=sum_curr_spl; - } - - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,(float)sqrt(totalsum),total_pixels); - //MULTIPLY HYPERPARAMETER - multiplyArrayScalar<<<60,MAXTHREADS,0,stream[dev*nStream_device]>>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); - } - for (dev = 0; dev < deviceCount; dev++){ - 
cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Scalar operations error"); - - //SUBSTRACT GRADIENT - ////////////////////////////////////////////// - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_image[dev]+buffer_pixels,d_dimgTV[dev]+buffer_pixels, total_pixels); - } - } - - // Synchronize mathematics, make sure bounding pixels are correct - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - - if(splits==1){ - for(dev=0; dev0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); - } - } - }else{ - - // We need to take it out :( - for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); - } - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; - cudaCheckErrors("Memory free"); -// cudaDeviceReset(); - } - -void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ - size_t memfree; - size_t memtotal; - const int deviceCount = gpuids.GetLength(); - for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); - if(dev==0) *mem_GPU_global=memfree; - if(memfree= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) - return; - - - float df[3] ={0.f,0.f,0.f}; - float dfi[3]={0.f,0.f,0.f}; // dfi== \partial f_{i+1,j,k} - float dfj[3]={0.f,0.f,0.f}; - float dfk[3]={0.f,0.f,0.f}; - gradient(f,df ,z ,y ,x , depth,rows,cols); - gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); - gradient(f,dfj ,z ,y+1,x , depth,rows,cols); - gradient(f,dfk ,z+1,y ,x , depth,rows,cols); - float eps=0.00000001; //% avoid division by zero - - dftv[idx]=(df[0]+df[1]+df[2])/(sqrt(df[0] *df[0] +df[1] *df[1] +df[2] 
*df[2])+eps) - -dfi[2]/(sqrt(dfi[0]*dfi[0]+dfi[1]*dfi[1]+dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. - -dfj[1]/(sqrt(dfj[0]*dfj[0]+dfj[1]*dfj[1]+dfj[2]*dfj[2]) +eps) - -dfk[0]/(sqrt(dfk[0]*dfk[0]+dfk[1]*dfk[1]+dfk[2]*dfk[2]) +eps); - return; - - } - - __device__ void warpReduce(volatile float *sdata, size_t tid) { - sdata[tid] += sdata[tid + 32]; - sdata[tid] += sdata[tid + 16]; - sdata[tid] += sdata[tid + 8]; - sdata[tid] += sdata[tid + 4]; - sdata[tid] += sdata[tid + 2]; - sdata[tid] += sdata[tid + 1]; - } - - __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - float value=0; - while (i < n) { - value=g_idata[i]; //avoid reading twice - mySum += value*value; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if (tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - - __global__ void reduceSum(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - // float value=0; - 
while (i < n) { - mySum += g_idata[i]; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if (tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - - - - -// main function - void pocs_tv(float* img,float* dst,float alpha,const long* image_size, int maxIter, const GpuIds& gpuids){ - - - - - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("GD_TV:GPU","There are no available device(s) that support CUDA\n"); - } - // - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("minimizeTV:GD_TV:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); - } - - int dev; - - // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. 
- - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - - - - // %5 of free memory should be enough, we have almost no variables in these kernels - size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; - size_t mem_slice_image = sizeof(float)* image_size[0] * image_size[1] ; - size_t mem_size_image = sizeof(float)* total_pixels; - size_t mem_auxiliary = sizeof(float)* (total_pixels + MAXTHREADS - 1) / MAXTHREADS; - - // Decide how are we handling the distribution of computation - size_t mem_img_each_GPU; - - unsigned int buffer_length=2; - //Does everything fit in the GPU? - unsigned int slices_per_split; - - // if it is a thin problem (no need to split), just use one GPU - if (image_size[2]<4){deviceCount=1;} - - unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. - if(mem_GPU_global> 3*mem_size_image+3*(deviceCount-1)*mem_slice_image*buffer_length+mem_auxiliary){ - // We only need to split if we have extra GPUs - slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; - mem_img_each_GPU=mem_slice_image*((slices_per_split+buffer_length*2)); - }else{ - // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all - size_t mem_free=mem_GPU_global-mem_auxiliary; - - splits=(unsigned int)(ceil(((float)(3*mem_size_image)/(float)(deviceCount))/mem_free)); - // Now, there is an overhead here, as each splits should have 2 slices more, to accoutn for overlap of images. - // lets make sure these 2 slices fit, if they do not, add 1 to splits. 
- slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - - // if the new stuff does not fit in the GPU, it measn we are in the edge case where adding that extra slice will overflow memory - if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ - // one more split should do the job, as its an edge case. - splits++; - //recompute for later - slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - } - - - // How many EXTRA buffer slices should be able to fit in here??!?! - // Only do it if there are splits needed. - if(splits>1){ - mem_free=mem_GPU_global-(3*mem_img_each_GPU+mem_auxiliary); - unsigned int extra_buff=(mem_free/mem_slice_image); - buffer_length=(extra_buff/2)/3; // we need double whatever this results in, rounded down. - buffer_length=max(buffer_length,2);// minimum 2 - buffer_length=min(MAX_BUFFER,buffer_length); - - mem_img_each_GPU=mem_slice_image*(slices_per_split+buffer_length*2); - - }else{ - buffer_length=2; - } - - // Assert - if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ - mexErrMsgIdAndTxt("GD_TV:GPU","Assertion Failed. Logic behind splitting flawed! Please tell: ander.biguri@gmail.com\n"); - } - } - - - // Assert - - if ((slices_per_split+buffer_length*2)*image_size[0]*image_size[1]* sizeof(float)!= mem_img_each_GPU){ - mexErrMsgIdAndTxt("GD_TV:GPU","Assertion Failed. Memory needed calculation broken! 
Please tell: ander.biguri@gmail.com\n"); - } - - - - - - - float** d_image= (float**)malloc(deviceCount*sizeof(float*)); - float** d_dimgTV= (float**)malloc(deviceCount*sizeof(float*)); - float** d_norm2aux= (float**)malloc(deviceCount*sizeof(float*)); - float** d_norm2= (float**)malloc(deviceCount*sizeof(float*)); - - // allocate memory in each GPU - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - - cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); - cudaMemset( d_image[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); - cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); - cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); - cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); - cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); - cudaCheckErrors("Malloc error"); - - - } - unsigned long long buffer_pixels=buffer_length*image_size[0]*image_size[1]; - float* buffer; - if(splits>1){ - mexWarnMsgIdAndTxt("minimizeTV:GD_TV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); - }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); - } - - - - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
- int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // splits>2 is completely empirical observation - if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); - - - - // Create streams - int nStream_device=2; - int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); - } - } - cudaCheckErrors("Stream creation fail"); - - - // For the reduction - - double totalsum_prev; - double totalsum; - float sum_curr_spl; - float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); - - unsigned int curr_slices; - unsigned long long curr_pixels; - size_t linear_idx_start; - unsigned long long* offset_device=(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - unsigned long long* offset_host =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - unsigned long long* bytes_device =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - bool is_first_chunk; - bool is_last_chunk; - for(unsigned int i=0;i1){ - totalsum_prev=0; - } - for(unsigned int sp=0;sp1 & i>0){ - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); - - - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - } - 
cudaCheckErrors("Memcpy failure on multi split"); - - for(unsigned int ib=0; (ib<(buffer_length-1)) && ((i+ib)>>(d_image[dev],d_dimgTV[dev],(long)(curr_slices+buffer_length*2-1), image_size[1],image_size[0]); - - } - - - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); - - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); - } - else { - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); - } - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Reduction error"); - - - // Accumulate the norm accross devices - sum_curr_spl=0; - // this is CPU code - for (dev = 0; dev < deviceCount; dev++){ - sum_curr_spl+=sumnorm2[dev]; - } - sum_curr_spl+=0.0000001f; // avoid division by zero - - // If we have more than one splits, lets use the result from prior calls - if(i>0 && splits>1){ - // this is already stored: - //totalsum=totalsum_prev; - }else{ - totalsum=sum_curr_spl; - } - - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,(float)sqrt(totalsum),total_pixels); - //MULTIPLY HYPERPARAMETER - multiplyArrayScalar<<<60,MAXTHREADS,0,stream[dev*nStream_device]>>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); - } - for (dev = 0; dev < deviceCount; dev++){ - 
cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Scalar operations error"); - - //SUBSTRACT GRADIENT - ////////////////////////////////////////////// - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_image[dev]+buffer_pixels,d_dimgTV[dev]+buffer_pixels, total_pixels); - } - } - - // Synchronize mathematics, make sure bounding pixels are correct - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - - if(splits==1){ - for(dev=0; dev0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); - } - } - }else{ - - // We need to take it out :( - for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); - } - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Memory free"); - cudaDeviceReset(); - } - -void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ - size_t memfree; - size_t memtotal; - int deviceCount = gpuids.GetLength(); - for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); - if(dev==0) *mem_GPU_global=memfree; - if(memfree -#include -#include - -GpuIds::~GpuIds() { - free(m_piDeviceIds); m_piDeviceIds = nullptr; - m_iCount = 0; -} -GpuIds::GpuIds() : m_piDeviceIds (nullptr), m_iCount(0) { - -} -void GpuIds::SetIds(int iCount, int* piDeviceIds) { - if (iCount > 0 && piDeviceIds != 0) { - if (m_piDeviceIds) { - free(m_piDeviceIds); m_piDeviceIds = nullptr; - m_iCount = 0; - } - m_piDeviceIds = (int*)malloc(iCount * sizeof(int)); - if (m_piDeviceIds) { - for 
(int iI = 0; iI < iCount; ++iI) { - m_piDeviceIds[iI] = piDeviceIds[iI]; - } - m_iCount = iCount; - } - } -} - -int GpuIds::GetLength() const { - return m_iCount; -} -int& GpuIds::operator[](int iIndex){ - return m_piDeviceIds[iIndex]; -} -int GpuIds::operator[](int iIndex) const { - return m_piDeviceIds[iIndex]; -} - -void GpuIds::SetAllGpus(int iTotalDeviceCount) { - // Set all GPUs for compatibility - // Makeup valid GpuIds. - int* aiIds = nullptr; - if (iTotalDeviceCount == 0) { - (int*)malloc(iTotalDeviceCount*sizeof(int)); - for (int iI = 0; iI < iTotalDeviceCount; ++iI) { - aiIds[iI] = iI; - } - } - SetIds(iTotalDeviceCount, aiIds); - free(aiIds); aiIds = 0; -} - -bool GpuIds::AreEqualDevices() const { - int deviceCount = this->GetLength(); - const int devicenamelength = 256; // The length 256 is fixed by spec of cudaDeviceProp::name - char devicename[devicenamelength]; - cudaDeviceProp deviceProp; - for (int dev = 0; dev < deviceCount; dev++) { - // cudaSetDevice(m_piDeviceIds[dev]); - cudaGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); - if (dev>0) { - if (strcmp(devicename, deviceProp.name) != 0) { - return false; - } - } - memset(devicename, 0, devicenamelength); - strcpy(devicename, deviceProp.name); - } - return true; -} diff --git a/Common/CUDA/GpuIds.hpp.prehip b/Common/CUDA/GpuIds.hpp.prehip deleted file mode 100644 index e0223f86..00000000 --- a/Common/CUDA/GpuIds.hpp.prehip +++ /dev/null @@ -1,17 +0,0 @@ - -#ifndef GPUIDS_H -#define GPUIDS_H -struct GpuIds { - int* m_piDeviceIds; - int m_iCount; - ~GpuIds(); - GpuIds(); - void SetIds(int iCount, int* piDeviceIds); - int GetLength() const; - void SetAllGpus(int iTotalDeviceCount); - int& operator[](int iIndex); - int operator[](int iIndex) const; - bool AreEqualDevices() const; -}; -#endif - diff --git a/Common/CUDA/PICCS.cu.prehip b/Common/CUDA/PICCS.cu.prehip deleted file mode 100644 index 481ede08..00000000 --- a/Common/CUDA/PICCS.cu.prehip +++ /dev/null @@ -1,398 +0,0 @@ 
-/*------------------------------------------------------------------------- - * - * CUDA functions for Steepest descend in POCS-type algorithms. - * - * This file will iteratively minimize by stepest descend the total variation - * of the input image, with the parameters given, using GPUs. - * - * CODE by Ander Biguri - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - - - - -#define MAXTHREADS 1024 - -#include "PICCS.hpp" - - - - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("ERROR in: %s \n",msg);\ - mexErrMsgIdAndTxt("err",cudaGetErrorString(__err));\ - } \ -} while (0) - -// CUDA kernels -//https://stackoverflow.com/questions/21332040/simple-cuda-kernel-optimization/21340927#21340927 - __global__ void divideArrayScalar(float* vec,float scalar,const size_t n) - { - unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x; - for(; i= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) - return; - - float df[3] ={0,0,0}; - float dfi[3]={0,0,0}; // dfi== \partial f_{i+1,j,k} - float dfj[3]={0,0,0}; - float dfk[3]={0,0,0}; - gradient(f,df ,z ,y ,x , depth,rows,cols); - gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); - gradient(f,dfj ,z ,y+1,x , depth,rows,cols); - gradient(f,dfk ,z+1,y ,x , depth,rows,cols); - float eps=0.000001; //% avoid division by zero - dftv[idx]=(df[0]+df[1]+df[2])/(sqrt(df[0] *df[0] +df[1] *df[1] +df[2] *df[2])+eps) - -dfi[2]/(sqrt(dfi[0]*dfi[0]+dfi[1]*dfi[1]+dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. 
- -dfj[1]/(sqrt(dfj[0]*dfj[0]+dfj[1]*dfj[1]+dfj[2]*dfj[2]) +eps) - -dfk[0]/(sqrt(dfk[0]*dfk[0]+dfk[1]*dfk[1]+dfk[2]*dfk[2]) +eps); - - } - - __device__ void warpReduce(volatile float *sdata, size_t tid) { - sdata[tid] += sdata[tid + 32]; - sdata[tid] += sdata[tid + 16]; - sdata[tid] += sdata[tid + 8]; - sdata[tid] += sdata[tid + 4]; - sdata[tid] += sdata[tid + 2]; - sdata[tid] += sdata[tid + 1]; - } - - __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - float value=0; - while (i < n) { - value=g_idata[i]; //avoid reading twice - mySum += value*value; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if (tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - __global__ void reduceSum(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - // float value=0; - while (i < n) { - mySum += g_idata[i]; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if 
(tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - - -bool isnan_cuda(float* vec, size_t size){ - bool*d_nan; - bool h_nan; - cudaMalloc((void **)&d_nan, sizeof (bool)); - isnan_device<<<60,MAXTHREADS>>>(vec,size,d_nan); - cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost); - return h_nan; - -} - -// main function - void piccs_tv(const float* img,const float* prior, float* dst,float alpha,float ratio, const long* image_size, int maxIter, const GpuIds& gpuids){ - - - - - size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; - size_t mem_size = sizeof(float) * total_pixels; - - float *d_image,*d_prior,*d_dpiccsTV, *d_dimgTV,*d_aux_small,*d_aux_image, *d_norm2; - // memory for image - cudaMalloc(&d_image, mem_size); - cudaMalloc(&d_prior, mem_size); - - cudaCheckErrors("Malloc Image error"); - cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice); - cudaMemcpy(d_prior, prior, mem_size, cudaMemcpyHostToDevice); - cudaCheckErrors("Memory Malloc and Memset: SRC"); - // memory for df - cudaMalloc(&d_dimgTV, mem_size); - cudaMalloc(&d_dpiccsTV, mem_size); - cudaCheckErrors("Memory Malloc and Memset: TV"); - cudaMalloc(&d_norm2, mem_size); - cudaCheckErrors("Memory Malloc and Memset: TV"); - cudaMalloc(&d_aux_image, mem_size); - cudaCheckErrors("Memory Malloc and Memset: TV"); - - // memory for L2norm auxiliar - cudaMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); - cudaCheckErrors("Memory Malloc and 
Memset: NORMAux"); - - - - // For the gradient - dim3 blockGrad(10, 10, 10); - dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (image_size[2]+blockGrad.z-1)/blockGrad.z); - - // For the reduction - float sumnorm2; - size_t dimblockRed = MAXTHREADS; - size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; - - - for(unsigned int i=0;i>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); - cudaCheckErrors("Gradient"); -// mexPrintf("Gradient is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); - - - multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,(1-ratio), total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("Multiplication error"); - - substractArrays<<<60,MAXTHREADS>>>(d_aux_image,d_prior, total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("Substraction error"); - - gradientTV<<>>(d_aux_image,d_dpiccsTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); - cudaCheckErrors("Gradient"); -// mexPrintf("Gradient piccs is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); - - multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dpiccsTV,ratio, total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("Multiplication error"); -// mexPrintf("Multiplication is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); - - - addArrays<<<60,MAXTHREADS>>>(d_dimgTV,d_dpiccsTV,total_pixels); - cudaDeviceSynchronize(); - //NOMRALIZE via reduction - //mexPrintf("Pre-norm2 is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? 
"true" : "false"); - cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice); - cudaCheckErrors("Copy from gradient call error"); - reduceNorm2 << > >(d_norm2, d_aux_small, total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("reduce1"); - if (dimgridRed > 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_aux_small, d_norm2, dimgridRed); - cudaDeviceSynchronize(); - cudaCheckErrors("reduce2"); - cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); - - } - else { - cudaMemcpy(&sumnorm2, d_aux_small, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); - } -// mexPrintf("alpha/sqrt(sumnorm2): %f\n",alpha/sqrt(sumnorm2)); - //MULTIPLY HYPERPARAMETER sqrt(sumnorm2) - multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha/sqrt(sumnorm2), total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("Multiplication error"); - //SUBSTRACT GRADIENT - substractArrays <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("Substraction error"); -// mexPrintf("Final update is nan: %s\n",isnan_cuda(d_image,total_pixels) ? "true" : "false"); -// mexPrintf("\n"); - sumnorm2=0; - } - - cudaCheckErrors("TV minimization"); - - cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost); - cudaCheckErrors("Copy result back"); - - cudaFree(d_image); - cudaFree(d_dpiccsTV); - cudaFree(d_aux_image); - cudaFree(d_aux_small); - cudaFree(d_prior); - cudaFree(d_norm2); - - - cudaCheckErrors("Memory free"); - cudaDeviceReset(); - } - diff --git a/Common/CUDA/PICCS.hpp.prehip b/Common/CUDA/PICCS.hpp.prehip deleted file mode 100644 index e3592dbb..00000000 --- a/Common/CUDA/PICCS.hpp.prehip +++ /dev/null @@ -1,61 +0,0 @@ -/*------------------------------------------------------------------------- - * - * Header for CUDA functions for Steepest descend in POCS-type algorithms. 
- * - * This file has the required headers for POCS_TV.cu - * - * CODE by Ander Biguri - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - - - - -#ifndef GD_TV_HPP -#define GD_TV_HPP -#include "TIGRE_common.hpp" -#include "GpuIds.hpp" - -void piccs_tv(const float* img,const float* prior, float* dst,float alpha, float ratio, const long* image_size, int maxIter, const GpuIds& gpuids); - - -#endif \ No newline at end of file diff --git a/Common/CUDA/RandomNumberGenerator.cu.prehip b/Common/CUDA/RandomNumberGenerator.cu.prehip deleted file mode 100644 index d7d1224a..00000000 --- a/Common/CUDA/RandomNumberGenerator.cu.prehip +++ /dev/null @@ -1,193 +0,0 @@ -/*------------------------------------------------------------------------- - * - * CUDA functions for random number generator - * - * Adds noise of Poisson and normal distribution to the input. - * - * CODE by Tomoyuki SADAKANE - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - -#include -#include -#include -#include -#include - -#include "gpuUtils.hpp" -#include "RandomNumberGenerator.hpp" - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("RandomNumberGenerator:",cudaGetErrorString(__err));\ - } \ -} while (0) - - -__global__ void setup_kernel(curandState *state) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - /* Each thread gets same seed, a different sequence number, no offset */ - curand_init(1234, idx, 0, &state[idx]); -} - -__global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t uiLen, float* pfOut) { - int idx = threadIdx.x + blockIdx.x * 
blockDim.x; - /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; - int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); - for (int iI = 0; iI < iIter; ++iI) { - size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; - if (uiPos < uiLen) { - /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); - pfOut[uiPos] = (float)uiPoisson; - } - } - /* Copy state back to global memory */ - state[idx] = localState; -} - -__global__ void GeneratePoissonAddGaussian(curandState *state, - const float* pfIn, - size_t uiLen, - float fGaussMu, - float fGaussSigma, - float* pfOut) -{ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; - int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); - for (int iI = 0; iI < iIter; ++iI) { - size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; - if (uiPos < uiLen) { - /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); - /* Gaussian */ - float fNormal = curand_normal(&localState) * fGaussSigma + fGaussMu; - pfOut[uiPos] = fNormal + (float)uiPoisson; - } - } - /* Copy state back to global memory */ - state[idx] = localState; -} - - -template -void GetMinMax(const T_value* pfIn, size_t uiLen, T_value& tvMin, T_value& tvMax) { - tvMin = pfIn[0]; - tvMax = pfIn[0]; - T_value tvVal; - for (int iI = 1; iI < uiLen; ++iI) { - tvVal = pfIn[iI]; - if (tvMax < tvVal) { tvMax = tvVal; continue;} - if (tvMin > tvVal) { tvMin = tvVal; continue;} - } -} -void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpuids) { - // printf("poisson_1d(pfIn = %p, uiLen = %zd, pfOut = %p)\n", pfIn, uiLen, pfOut); - float* d_pfIn = nullptr; - float* d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - 
cudaCheckErrors("poisson_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_1d fail cudaMemcpy 1"); - - // float fMin, fMax; - // GetMinMax(pfIn, uiLen, fMin, fMax); - // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; - const int kiBlockDim = 1024; // Threads per Block - const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; - cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); - cudaCheckErrors("poisson_1d fail cudaMalloc 3"); - setup_kernel<<>>(curandStates); - GeneratePoisson<<>>(curandStates, d_pfIn, uiLen, d_pfOut); - cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_1d fail cudaMemcpy 2"); - // GetMinMax(pfOut, uiLen, fMin, fMax); - // printf("fMin, fMax = %f, %f\n", fMin, fMax); - - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; -} - -void poisson_gaussian_1d(const float* pfIn, - size_t uiLen, - float fGaussMu, - float fGaussSigma, - float* pfOut, - GpuIds& gpuids) -{ - // printf("poisson_gaussian_1d(pfIn = %p, uiLen = %zd, fGaussMu = %+f, fGaussSigma = %f, pfOut = %p)\n", pfIn, uiLen, fGaussMu, fGaussSigma, pfOut); - float* d_pfIn = nullptr; - float* d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 1"); - - // float fMin, fMax; - // GetMinMax(pfIn, uiLen, fMin, fMax); - // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; - const int kiBlockDim = 64; // Threads per Block - const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; 
- cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 3"); - setup_kernel<<>>(curandStates); - GeneratePoissonAddGaussian<<>>(curandStates, d_pfIn, uiLen, fGaussMu, fGaussSigma, d_pfOut); - cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 2"); - // GetMinMax(pfOut, uiLen, fMin, fMax); - // printf("fMin, fMax = %f, %f\n", fMin, fMax); - - - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; -} diff --git a/Common/CUDA/RandomNumberGenerator.hpp.prehip b/Common/CUDA/RandomNumberGenerator.hpp.prehip deleted file mode 100644 index 4ba68d8d..00000000 --- a/Common/CUDA/RandomNumberGenerator.hpp.prehip +++ /dev/null @@ -1,49 +0,0 @@ -/*------------------------------------------------------------------------- - * - * Header CUDA functions for random number generator - * - * Adds noise of Poisson and normal distribution to the input. - * - * CODE by Tomoyuki SADAKANE - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - -#include "TIGRE_common.hpp" -#include "GpuIds.hpp" -void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpuids); -void poisson_gaussian_1d(const float* pfPoissonL, size_t uiLen, float fGaussMu, float fGaussSigma, float* pfOut, GpuIds& gpuids); diff --git a/Common/CUDA/Siddon_projection.cu.prehip b/Common/CUDA/Siddon_projection.cu.prehip deleted file mode 100644 index 2a025f8c..00000000 --- a/Common/CUDA/Siddon_projection.cu.prehip +++ /dev/null @@ -1,859 +0,0 @@ -/*------------------------------------------------------------------------- - * - * CUDA functions for ray-voxel intersection based projection - * - * This file has the necessary fucntiosn to perform X-ray CBCT projection - * operation given a geaometry, 
angles and image. It usesthe so-called - * Jacobs algorithm to compute efficiently the length of the x-rays over - * voxel space. - * - * CODE by Ander Biguri - * Sepideh Hatamikia (arbitrary rotation) - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - -#include -#include -#include -#include "Siddon_projection.hpp" -#include "TIGRE_common.hpp" -#include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("Ax:Siddon_projection",cudaGetErrorString(__err));\ - } \ -} while (0) - - -#define MAXTREADS 1024 -#define PROJ_PER_BLOCK 9 -#define PIXEL_SIZE_BLOCK 9 - /*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * --->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ - - void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc); - -__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device - - -__global__ void vecAddInPlace(float *a, float *b, unsigned long n) -{ - int idx = blockIdx.x*blockDim.x+threadIdx.x; - // Make sure we do not go out of bounds - if (idx < n) - 
a[idx] = a[idx] + b[idx]; -} - -__global__ void kernelPixelDetector( Geometry geo, - float* detector, - const int currProjSetNumber, - const int totalNoOfProjections, - cudaTextureObject_t tex){ - - - unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; - unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long projNumber=threadIdx.z; - - - if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) - return; - -#if IS_FOR_MATLAB_TIGRE - size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#else - size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#endif - unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array (for a given GPU) - - if(indAlpha>=totalNoOfProjections) - return; - - Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaU = projParamsArrayDev[4*projNumber+1]; - Point3D deltaV = projParamsArrayDev[4*projNumber+2]; - Point3D source = projParamsArrayDev[4*projNumber+3]; - - /////// Get coordinates XYZ of pixel UV - unsigned long pixelV = geo.nDetecV-v-1; - unsigned long pixelU = u; - Point3D pixel1D; - pixel1D.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); - pixel1D.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); - pixel1D.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); - /////// - // Siddon's ray-voxel intersection, optimized as in doi=10.1.1.55.7516 - ////// - // Also called Jacobs algorithms - Point3D ray; - // vector of Xray - ray.x=pixel1D.x-source.x; - ray.y=pixel1D.y-source.y; - ray.z=pixel1D.z-source.z; - float eps=0.001; - ray.x=(fabsf(ray.x) Nvoxel+1 - - axm=fminf(__fdividef(-source.x,ray.x),__fdividef(geo.nVoxelX-source.x,ray.x)); - 
aym=fminf(__fdividef(-source.y,ray.y),__fdividef(geo.nVoxelY-source.y,ray.y)); - azm=fminf(__fdividef(-source.z,ray.z),__fdividef(geo.nVoxelZ-source.z,ray.z)); - axM=fmaxf(__fdividef(-source.x,ray.x),__fdividef(geo.nVoxelX-source.x,ray.x)); - ayM=fmaxf(__fdividef(-source.y,ray.y),__fdividef(geo.nVoxelY-source.y,ray.y)); - azM=fmaxf(__fdividef(-source.z,ray.z),__fdividef(geo.nVoxelZ-source.z,ray.z)); - - float am=fmaxf(fmaxf(axm,aym),azm); - float aM=fminf(fminf(axM,ayM),azM); - - // line intersects voxel space -> am=aM) - detector[idx]=0; - - // Compute max/min image INDEX for intersection eq(11-19) - // Discussion about ternary operator in CUDA: https://stackoverflow.com/questions/7104384/in-cuda-why-is-a-b010-more-efficient-than-an-if-else-version - float imin,imax,jmin,jmax,kmin,kmax; - // for X - if( source.x(tex, i, j, k); - i=i+iu; - ac=ax; - ax+=axu; - }else if(ay==aminc){ - sum+=(ay-ac)*tex3D(tex, i, j, k); - j=j+ju; - ac=ay; - ay+=ayu; - }else if(az==aminc){ - sum+=(az-ac)*tex3D(tex, i, j, k); - k=k+ku; - ac=az; - az+=azu; - } - aminc=fminf(fminf(ax,ay),az); - } - detector[idx]=sum*maxlength; -} - - -int siddon_ray_projection(float* img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("Ax:Siddon_projection:GPUselect","There are no available device(s) that support CUDA\n"); - } - // - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("Ax:Siddon_projection:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters 
used. If the code errors you might need to change the way GPU selection is performed."); - } - int dev; - - // Check free memory - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - - size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); - size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); - - // Does everything fit in the GPUs? - const bool fits_in_memory = mem_image+2*PROJ_PER_BLOCK*mem_proj= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to - // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); - } -#endif - cudaCheckErrors("Error pinning memory"); - - - - // auxiliary variables - Point3D source, deltaU, deltaV, uvOrigin; - Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); - cudaCheckErrors("Error allocating auxiliary constant memory"); - - // Create Streams for overlapping memcopy and compute - int nStreams=deviceCount*2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; - - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i+dev*2]); - - } - } - cudaCheckErrors("Stream creation fail"); - - int nangles_device=(nangles+deviceCount-1)/deviceCount; - int nangles_last_device=(nangles-(deviceCount-1)*nangles_device); - unsigned int noOfKernelCalls = 
(nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK - unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. - int projection_this_block; - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; - - for (unsigned int sp=0;sp=nangles) - break; - if ((i*PROJ_PER_BLOCK+j)>=nangles_device) - break; - geoArray[sp].alpha=angles[proj_global*3]; - geoArray[sp].theta=angles[proj_global*3+1]; - geoArray[sp].psi =angles[proj_global*3+2]; - - - //precomute distances for faster execution - //Precompute per angle constant stuff for speed - computeDeltas_Siddon(geoArray[sp],proj_global, &uvOrigin, &deltaU, &deltaV, &source); - //Ray tracing! - projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection - projParamsArrayHost[4*j+1]=deltaU; - projParamsArrayHost[4*j+2]=deltaV; - projParamsArrayHost[4*j+3]=source; - - } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*2]); - cudaStreamSynchronize(stream[dev*2]); - cudaCheckErrors("kernel fail"); - kernelPixelDetector<<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); - } - - - // Now that the computation is happening, we need to either prepare the memory for - // combining of the projections (splits>1) and start removing previous results. - - - // If our image does not fit in memory then we need to make sure we accumulate previous results too. 
- // This is done in 2 steps: - // 1)copy previous results back into GPU - // 2)accumulate with current results - // The code to take them out is the same as when there are no splits needed - if( !fits_in_memory&&sp>0) - { - // 1) grab previous results and put them in the auxiliary variable dProjection_accum - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - - // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... - if(i+1==noOfKernelCalls) //is it the last block? - projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - else - projection_this_block=PROJ_PER_BLOCK; - - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); - } - // 2) take the results from current compute call and add it to the code in execution. - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - - // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... - if(i+1==noOfKernelCalls) //is it the last block? 
- projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - else - projection_this_block=PROJ_PER_BLOCK; - - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished - vecAddInPlace<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); - } - } // end accumulation case, where the image needs to be split - - // Now, lets get out the projections from the previous execution of the kernels. - if (i>0){ - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on previous set on this GPU - proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; - if (dev+1==deviceCount) { //is it the last device? - // projections assigned to this device is >=nangles_device-(deviceCount-1) and < nangles_device - if (i-1 < noOfKernelCallsLastDev) { - // The previous set(block) was not empty. - projection_this_block=min(PROJ_PER_BLOCK, nangles-proj_global); - } - else { - // The previous set was empty. - // This happens if deviceCount > PROJ_PER_BLOCK+1. - // e.g. PROJ_PER_BLOCK = 9, deviceCount = 11, nangles = 199. - // e.g. PROJ_PER_BLOCK = 1, deviceCount = 3, nangles = 7. - break; - } - } - else { - projection_this_block=PROJ_PER_BLOCK; - } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); - } - } - // Make sure Computation on kernels has finished before we launch the next batch. 
- for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); - } - } - - - // We still have the last set of projections to get out of GPUs - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - // How many projections are left here? - projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. - cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); - } - // Make sure everyone has done their bussiness before the next image split: - cudaDeviceSynchronize(); - } // End image split loop. 
- - cudaCheckErrors("Main loop fail"); - /////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////// - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); - } - delete[] texImg; texImg = 0; - delete[] d_cuArrTex; d_cuArrTex = 0; - // Freeing Stage - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); - - } - free(dProjection); - - if(!fits_in_memory){ - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); - - } - free(dProjection_accum); - } - freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); - - - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostUnregister(img); - } - cudaCheckErrors("cudaFree fail"); -#endif - //cudaDeviceReset(); - return 0; -} - - - - -void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc) -{ - //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - const unsigned int num_devices = gpuids.GetLength(); - if(alloc){ - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); - } - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, 
extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[dev]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); - - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Texture object creation fail"); -} - -/* This code generates the geometries needed to split the image properly in - * cases where the entire image does not fit in the memory of the GPU - **/ -void splitImage(unsigned int splits,Geometry geo,Geometry* geoArray, unsigned int nangles){ - - unsigned long splitsize=(geo.nVoxelZ+splits-1)/splits;// ceil if not divisible - for(unsigned int sp=0;spx; - auxPoint.y=point->y; - auxPoint.z=point->z; - - point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - 
cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->z=-sin(geo.dPitch[i])*auxPoint.x - +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y - +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; - -} -void eulerZYZ(Geometry geo, Point3D* point){ - Point3D auxPoint; - auxPoint.x=point->x; - auxPoint.y=point->y; - auxPoint.z=point->z; - - point->x=(+cos(geo.alpha)*cos(geo.theta)*cos(geo.psi)-sin(geo.alpha)*sin(geo.psi))*auxPoint.x+ - (-cos(geo.alpha)*cos(geo.theta)*sin(geo.psi)-sin(geo.alpha)*cos(geo.psi))*auxPoint.y+ - cos(geo.alpha)*sin(geo.theta)*auxPoint.z; - - point->y=(+sin(geo.alpha)*cos(geo.theta)*cos(geo.psi)+cos(geo.alpha)*sin(geo.psi))*auxPoint.x+ - (-sin(geo.alpha)*cos(geo.theta)*sin(geo.psi)+cos(geo.alpha)*cos(geo.psi))*auxPoint.y+ - sin(geo.alpha)*sin(geo.theta)*auxPoint.z; - - point->z=-sin(geo.theta)*cos(geo.psi)*auxPoint.x+ - sin(geo.theta)*sin(geo.psi)*auxPoint.y+ - cos(geo.theta)*auxPoint.z; - - -} -//______________________________________________________________________________ -// -// Function: freeGeoArray -// -// Description: Frees the memory from the geometry array for multiGPU. -//______________________________________________________________________________ -void freeGeoArray(unsigned int splits,Geometry* geoArray){ - for(unsigned int sp=0;sp -#include -#include -#include "Siddon_projection_parallel.hpp" -#include "TIGRE_common.hpp" -#include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",cudaGetErrorString(__err));\ - } \ -} while (0) - - -// Declare the texture reference. 
-void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); - - -#define MAXTREADS 1024 -#define PROJ_PER_BLOCK 9 -#define PIXEL_SIZE_BLOCK 9 -/*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * --->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ - - -__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device - - -__global__ void kernelPixelDetector_parallel( Geometry geo, - float* detector, const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex){ - - unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; - unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long projNumber=threadIdx.z; - - if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) - return; - - unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array - - -#if IS_FOR_MATLAB_TIGRE - size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#else - size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#endif - - if(indAlpha>=totalNoOfProjections) - return; - - Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaU = projParamsArrayDev[4*projNumber+1]; - Point3D deltaV = projParamsArrayDev[4*projNumber+2]; - Point3D source = projParamsArrayDev[4*projNumber+3]; - - - /////// Get coordinates XYZ of pixel UV - unsigned long pixelV = geo.nDetecV-v-1; - 
unsigned long pixelU = u; - Point3D pixel1D; - pixel1D.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); - pixel1D.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); - pixel1D.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); - - - source.x=(source.x+pixelU*deltaU.x+pixelV*deltaV.x); - source.y=(source.y+pixelU*deltaU.y+pixelV*deltaV.y); - source.z=(source.z+pixelU*deltaU.z+pixelV*deltaV.z); - /////// - // Siddon's ray-voxel intersection, optimized as in doi=10.1.1.55.7516 - ////// - Point3D ray; - // vector of Xray - ray.x=pixel1D.x-source.x; - ray.y=pixel1D.y-source.y; - ray.z=pixel1D.z-source.z; - // This variables are ommited because - // bx,by,bz ={0,0,0} - // dx,dy,dz ={1,1,1} - // compute parameter values for x-ray parametric equation. eq(3-10) - float axm,aym,azm; - float axM,ayM,azM; - - /************************************** - * - * - * Problem. In paralel beam, often ray.y or ray.x=0; - * This leads to infinities progpagating and breaking everything. - * - * We need to fix it. - * - ***************************************/ - - // In the paper Nx= number of X planes-> Nvoxel+1 - axm=fminf(-source.x/ray.x,(geo.nVoxelX-source.x)/ray.x); - aym=fminf(-source.y/ray.y,(geo.nVoxelY-source.y)/ray.y); -// azm=min(-source.z/ray.z,(geo.nVoxelZ-source.z)/ray.z); - axM=fmaxf(-source.x/ray.x,(geo.nVoxelX-source.x)/ray.x); - ayM=fmaxf(-source.y/ray.y,(geo.nVoxelY-source.y)/ray.y); -// azM=max(-source.z/ray.z,(geo.nVoxelZ-source.z)/ray.z); - float am=(fmaxf(axm,aym)); - float aM=(fminf(axM,ayM)); - - // line intersects voxel space -> am=aM) - detector[idx]=0.0f; - - // Compute max/min image INDEX for intersection eq(11-19) - // Discussion about ternary operator in CUDA: https://stackoverflow.com/questions/7104384/in-cuda-why-is-a-b010-more-efficient-than-an-if-else-version - float imin,imax,jmin,jmax; - // for X - if( source.x(tex, i, j, k);//(ax-ac)* - i=i+iu; - ac=ax; - ax+=axu; - }else if(ay==aminc){ - sum+=(ay-ac)*tex3D(tex, i, j, k);//(ay-ac)* - j=j+ju; - ac=ay; - 
ay+=ayu; -// }else if(az==aminc){ -// sum+=(az-ac)*tex3D(tex, i+0.5, j+0.5, k+0.5); -// k=k+ku; -// ac=az; -// az+=azu; - } - aminc=fminf(ay,ax); - } - detector[idx]=maxlength*sum; -} - - -int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ - - - - - - size_t num_bytes = (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)PROJ_PER_BLOCK* (size_t)sizeof(float); - float** dProjection=(float **)malloc(2*sizeof(float *)); - for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); - } - int nStreams=2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); - - for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); - } - - - - // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); - - CreateTextureParallel(img,geo,&d_cuArrTex[0], &texImg [0],stream); - cudaCheckErrors("Texture allocation fail"); - //Done! Image put into texture memory. - - - - Point3D source, deltaU, deltaV, uvOrigin; - - - Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); - - // 16x16 gave the best performance empirically - // Funnily that makes it compatible with most GPUs..... 
- int divU,divV,divangle; - divU=PIXEL_SIZE_BLOCK; - divV=PIXEL_SIZE_BLOCK; - - dim3 numBlocks((geo.nDetecU+divU-1)/divU,(geo.nDetecV+divV-1)/divV,1); - - dim3 threadsPerBlock(divU,divV,PROJ_PER_BLOCK); - - unsigned int proj_global; - unsigned int noOfKernelCalls = (nangles+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK - unsigned int i; - for ( i=0; i=nangles) - break; - geo.alpha=angles[proj_global*3]; - geo.theta=angles[proj_global*3+1]; - geo.psi =angles[proj_global*3+2]; - if(geo.alpha==0.0 || abs(geo.alpha-1.5707963267949)<0.0000001){ - geo.alpha=geo.alpha+1.1920929e-07; - } - - //precomute distances for faster execution - //Precompute per angle constant stuff for speed - computeDeltas_Siddon_parallel(geo,geo.alpha,proj_global, &uvOrigin, &deltaU, &deltaV, &source); - //Ray tracing! - projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection - projParamsArrayHost[4*j+1]=deltaU; - projParamsArrayHost[4*j+2]=deltaV; - projParamsArrayHost[4*j+3]=source; - - } - - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); - kernelPixelDetector_parallel<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); - // copy result to host - if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); - } - cudaDeviceSynchronize(); - - int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); - - - - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); - free(texImg); texImg = 0; - free(d_cuArrTex); d_cuArrTex = 0; - cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); - 
free(dProjection); - cudaFreeHost(projParamsArrayHost); - cudaCheckErrors("cudaFree d_imagedata fail"); - - - for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); - } -// cudaDeviceReset(); - return 0; -} - - - -/* This code precomputes The location of the source and the Delta U and delta V (in the warped space) - * to compute the locations of the x-rays. While it seems verbose and overly-optimized, - * it does saves about 30% of each of the kernel calls. Thats something! - **/ -void computeDeltas_Siddon_parallel(Geometry geo, float angles,int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source){ - Point3D S; - - S.x =geo.DSO[i]; S.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); S.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - - //End point - Point3D P,Pu0,Pv0; - - P.x =-(geo.DSD[i]-geo.DSO[i]); P.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); P.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - Pu0.x=-(geo.DSD[i]-geo.DSO[i]); Pu0.y= geo.dDetecU*(1-((float)geo.nDetecU/2)+0.5); Pu0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - Pv0.x=-(geo.DSD[i]-geo.DSO[i]); Pv0.y= geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); Pv0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-1); - // Geometric trasnformations: - P.x=0;Pu0.x=0;Pv0.x=0; - - // Roll pitch yaw - rollPitchYaw(geo,i,&P); - rollPitchYaw(geo,i,&Pu0); - rollPitchYaw(geo,i,&Pv0); - //Now lets translate the points where they should be: - P.x=P.x-(geo.DSD[i]-geo.DSO[i]); - Pu0.x=Pu0.x-(geo.DSD[i]-geo.DSO[i]); - Pv0.x=Pv0.x-(geo.DSD[i]-geo.DSO[i]); - - S.x=0; - // Roll pitch yaw - rollPitchYaw(geo,i,&S); - //Now lets translate the points where they should be: - S.x=S.x+geo.DSO[i]; - - //1: Offset detector - - //P.x - P.y =P.y +geo.offDetecU[i]; P.z =P.z +geo.offDetecV[i]; - Pu0.y=Pu0.y+geo.offDetecU[i]; Pu0.z=Pu0.z+geo.offDetecV[i]; - Pv0.y=Pv0.y+geo.offDetecU[i]; Pv0.z=Pv0.z+geo.offDetecV[i]; - //S doesnt need to chagne - - - //3: Rotate (around z)! 
- Point3D Pfinal, Pfinalu0, Pfinalv0; - - Pfinal.x =P.x*cos(geo.alpha)-P.y*sin(geo.alpha); Pfinal.y =P.y*cos(geo.alpha)+P.x*sin(geo.alpha); Pfinal.z =P.z; - Pfinalu0.x=Pu0.x*cos(geo.alpha)-Pu0.y*sin(geo.alpha); Pfinalu0.y=Pu0.y*cos(geo.alpha)+Pu0.x*sin(geo.alpha); Pfinalu0.z=Pu0.z; - Pfinalv0.x=Pv0.x*cos(geo.alpha)-Pv0.y*sin(geo.alpha); Pfinalv0.y=Pv0.y*cos(geo.alpha)+Pv0.x*sin(geo.alpha); Pfinalv0.z=Pv0.z; - - Point3D S2; - S2.x=S.x*cos(geo.alpha)-S.y*sin(geo.alpha); - S2.y=S.y*cos(geo.alpha)+S.x*sin(geo.alpha); - S2.z=S.z; - - //2: Offset image (instead of offseting image, -offset everything else) - - Pfinal.x =Pfinal.x-geo.offOrigX[i]; Pfinal.y =Pfinal.y-geo.offOrigY[i]; Pfinal.z =Pfinal.z-geo.offOrigZ[i]; - Pfinalu0.x=Pfinalu0.x-geo.offOrigX[i]; Pfinalu0.y=Pfinalu0.y-geo.offOrigY[i]; Pfinalu0.z=Pfinalu0.z-geo.offOrigZ[i]; - Pfinalv0.x=Pfinalv0.x-geo.offOrigX[i]; Pfinalv0.y=Pfinalv0.y-geo.offOrigY[i]; Pfinalv0.z=Pfinalv0.z-geo.offOrigZ[i]; - S2.x=S2.x-geo.offOrigX[i]; S2.y=S2.y-geo.offOrigY[i]; S2.z=S2.z-geo.offOrigZ[i]; - - // As we want the (0,0,0) to be in a corner of the image, we need to translate everything (after rotation); - Pfinal.x =Pfinal.x+geo.sVoxelX/2; Pfinal.y =Pfinal.y+geo.sVoxelY/2; Pfinal.z =Pfinal.z +geo.sVoxelZ/2; - Pfinalu0.x=Pfinalu0.x+geo.sVoxelX/2; Pfinalu0.y=Pfinalu0.y+geo.sVoxelY/2; Pfinalu0.z=Pfinalu0.z+geo.sVoxelZ/2; - Pfinalv0.x=Pfinalv0.x+geo.sVoxelX/2; Pfinalv0.y=Pfinalv0.y+geo.sVoxelY/2; Pfinalv0.z=Pfinalv0.z+geo.sVoxelZ/2; - S2.x =S2.x+geo.sVoxelX/2; S2.y =S2.y+geo.sVoxelY/2; S2.z =S2.z +geo.sVoxelZ/2; - - //4. 
Scale everything so dVoxel==1 - Pfinal.x =Pfinal.x/geo.dVoxelX; Pfinal.y =Pfinal.y/geo.dVoxelY; Pfinal.z =Pfinal.z/geo.dVoxelZ; - Pfinalu0.x=Pfinalu0.x/geo.dVoxelX; Pfinalu0.y=Pfinalu0.y/geo.dVoxelY; Pfinalu0.z=Pfinalu0.z/geo.dVoxelZ; - Pfinalv0.x=Pfinalv0.x/geo.dVoxelX; Pfinalv0.y=Pfinalv0.y/geo.dVoxelY; Pfinalv0.z=Pfinalv0.z/geo.dVoxelZ; - S2.x =S2.x/geo.dVoxelX; S2.y =S2.y/geo.dVoxelY; S2.z =S2.z/geo.dVoxelZ; - - - - //5. apply COR. Wherever everything was, now its offesetd by a bit - float CORx, CORy; - CORx=-geo.COR[i]*sin(geo.alpha)/geo.dVoxelX; - CORy= geo.COR[i]*cos(geo.alpha)/geo.dVoxelY; - Pfinal.x+=CORx; Pfinal.y+=CORy; - Pfinalu0.x+=CORx; Pfinalu0.y+=CORy; - Pfinalv0.x+=CORx; Pfinalv0.y+=CORy; - S2.x+=CORx; S2.y+=CORy; - - // return - - *uvorigin=Pfinal; - - deltaU->x=Pfinalu0.x-Pfinal.x; - deltaU->y=Pfinalu0.y-Pfinal.y; - deltaU->z=Pfinalu0.z-Pfinal.z; - - deltaV->x=Pfinalv0.x-Pfinal.x; - deltaV->y=Pfinalv0.y-Pfinal.y; - deltaV->z=Pfinalv0.z-Pfinal.z; - - *source=S2; -} -void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - - - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); - - - cudaMemcpy3DParms copyParams = {0}; - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[0]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); - - - //Array creation End - - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - 
memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); - -} - -#ifndef PROJECTION_HPP - -float maxDistanceCubeXY(Geometry geo, float alpha,int i){ - /////////// - // Compute initial "t" so we access safely as less as out of bounds as possible. - ////////// - - - float maxCubX,maxCubY; - // Forgetting Z, compute max distance: diagonal+offset - maxCubX=(geo.sVoxelX/2+ abs(geo.offOrigX[i]))/geo.dVoxelX; - maxCubY=(geo.sVoxelY/2+ abs(geo.offOrigY[i]))/geo.dVoxelY; - - return geo.DSO[i]/geo.dVoxelX-sqrt(maxCubX*maxCubX+maxCubY*maxCubY); - -} - -#endif diff --git a/Common/CUDA/Siddon_projection_parallel.hpp.prehip b/Common/CUDA/Siddon_projection_parallel.hpp.prehip deleted file mode 100644 index c9c6fc77..00000000 --- a/Common/CUDA/Siddon_projection_parallel.hpp.prehip +++ /dev/null @@ -1,65 +0,0 @@ -/*------------------------------------------------------------------------- - * - * Header CUDA functions for ray-voxel intersection based projection - * - * - * CODE by Ander Biguri - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. 
Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - - -#include "ray_interpolated_projection.hpp" -#include "types_TIGRE.hpp" -#include "GpuIds.hpp" - -#ifndef PROJECTION_PARALLEL_HPP_SIDDON -#define PROJECTION_PARALLEL_HPP_SIDDON -int siddon_ray_projection_parallel(float * img, Geometry geo, float** result,float const * const alphas,int nalpha, const GpuIds& gpuids); - -//double computeMaxLength(Geometry geo, double alpha); -void computeDeltas_Siddon_parallel(Geometry geo, float alpha,int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source); - -//double maxDistanceCubeXY(Geometry geo, double alpha,int i); - -// below, not used -//Geometry nomralizeGeometryImage(Geometry geo); -#endif \ No newline at end of file diff --git a/Common/CUDA/TIGRE_common.cpp.prehip b/Common/CUDA/TIGRE_common.cpp.prehip deleted file mode 100644 index cf98e4b9..00000000 --- a/Common/CUDA/TIGRE_common.cpp.prehip +++ /dev/null @@ -1,20 +0,0 @@ -#if defined(IS_FOR_PYTIGRE) -#include -#include -#include -#include "TIGRE_common.hpp" -void mexPrintf(const char* format, ...) 
{ - PRINT_HERE(""); - va_list argpointer; - va_start(argpointer, format); - vprintf(format, argpointer); - va_end(argpointer); -} -void mexErrMsgIdAndTxt(const char* pcTag, const char* pcMsg) { - PRINT_HERE("%s %s\n", pcTag, pcMsg); - exit(1); -} -void mexWarnMsgIdAndTxt(const char* pcTag, const char* pcMsg) { - PRINT_HERE("%s %s\n", pcTag, pcMsg); -} -#endif // IS_FOR_PYTIGRE diff --git a/Common/CUDA/TIGRE_common.hpp.prehip b/Common/CUDA/TIGRE_common.hpp.prehip deleted file mode 100644 index faf8d7ab..00000000 --- a/Common/CUDA/TIGRE_common.hpp.prehip +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _COMMON_HPP_20201017_ -#define _COMMON_HPP_20201017_ - -#define STRINGIFY(n) #n -#define TOSTRING(n) STRINGIFY(n) -#define __HERE__ __FILE__ " (" TOSTRING(__LINE__) "): " -#define PRINT_HERE printf(__HERE__);printf -// #define PRINT_HERE (void*)0 - -#if defined(IS_FOR_PYTIGRE) -#ifndef IS_FOR_MATLAB_TIGRE - #define IS_FOR_MATLAB_TIGRE 0 -#endif // IS_FOR_MATLAB_TIGRE -void mexPrintf(const char*, ...); -void mexErrMsgIdAndTxt(const char* pcTag, const char* pcMsg); -void mexWarnMsgIdAndTxt(const char* pcTag, const char* pcMsg); -#else -#ifndef IS_FOR_MATLAB_TIGRE - #define IS_FOR_MATLAB_TIGRE 1 -#endif // IS_FOR_MATLAB_TIGRE -#include "mex.h" -#include "tmwtypes.h" -#endif // IS_TIGRE_FOR_PYTHON -#endif // _COMMON_HPP_20201017_ diff --git a/Common/CUDA/errors.hpp.prehip b/Common/CUDA/errors.hpp.prehip deleted file mode 100644 index 05518b20..00000000 --- a/Common/CUDA/errors.hpp.prehip +++ /dev/null @@ -1,10 +0,0 @@ -#define CUDA_SUCCESS 0 -#define ERR_CUDA 1 - -#define ERR_NO_CAPABLE_DEVICES 2 -#define ERR_NO_FREE_DEVICES 3 -#define ERR_BAD_ASSERT 4 -#define ERR_ASSERT_FAIL 5 - - - diff --git a/Common/CUDA/gpuUtils.cu.prehip b/Common/CUDA/gpuUtils.cu.prehip deleted file mode 100644 index 8f2754e4..00000000 --- a/Common/CUDA/gpuUtils.cu.prehip +++ /dev/null @@ -1,70 +0,0 @@ - -#include "gpuUtils.hpp" -#include -#include -#include -#include - -int GetGpuIdArray(const char* 
kacGPUName, int* piDeviceIds, int iIdCountMax, char* pcMessage) { - if (pcMessage) { - for (int iI = 0; iI < 65535; ++iI) { - pcMessage[iI] = '\0'; - } - } - if (piDeviceIds == 0 || iIdCountMax == 0) { - return 0; - } - int iMessagePos = 0; - // Count installed GPUs. - int iCudaDeviceCount = GetGpuCount(); - iMessagePos += sprintf(pcMessage + iMessagePos, "Found GPUs: %d\n", iCudaDeviceCount); - if (iCudaDeviceCount == 0) { - // printf("No GPU found\n"); - return 0; - } - - iCudaDeviceCount = min(iCudaDeviceCount, iIdCountMax); - iMessagePos += sprintf(pcMessage + iMessagePos, "Max GPUs: %d\n", iCudaDeviceCount); - if (strlen(kacGPUName) == 0) { - // Semi-compatible mode: - // Return all GPUs - for (int iI = 0; iI < iCudaDeviceCount; ++iI) { - piDeviceIds[iI] = iI; - } - return iCudaDeviceCount; - } - - cudaError_t err; - cudaDeviceProp propDevice; - int nMatch = 0; - for (int iId = 0; iId < iCudaDeviceCount; ++iId) { - err = cudaGetDeviceProperties(&propDevice, iId); - iMessagePos += sprintf(pcMessage + iMessagePos, "propDevice.name = %s\n", propDevice.name); - if (strcmp(propDevice.name, kacGPUName) == 0) { - piDeviceIds[nMatch] = iId; - ++nMatch; - } - } - - for (int iI = 0; iI < nMatch; ++iI) { - iMessagePos += sprintf(pcMessage + iMessagePos, "%d, ", piDeviceIds[iI]); - } - return nMatch; - -} - -void GetGpuName(int iDeviceId, char* pcName) { - memset(pcName, 0, 128); - cudaError_t err; - cudaDeviceProp propDevice; - int id = iDeviceId; - err = cudaGetDeviceProperties(&propDevice, id); - memcpy(pcName, propDevice.name, strlen(propDevice.name)*sizeof(char)); -} - - -int GetGpuCount() { - int iCudaDeviceCount = 0; - cudaGetDeviceCount(&iCudaDeviceCount); - return iCudaDeviceCount; -} diff --git a/Common/CUDA/gpuUtils.hpp.prehip b/Common/CUDA/gpuUtils.hpp.prehip deleted file mode 100644 index 38b518cf..00000000 --- a/Common/CUDA/gpuUtils.hpp.prehip +++ /dev/null @@ -1,18 +0,0 @@ - -#ifndef GPUUTILS_HPP -#define GPUUTILS_HPP -//! 
@brief # of installed GPUs -int GetGpuCount(); - -//! @brief IDs of GPUs whose name is kacGPUName. -//! @note Call GetGpuCount and allocate sufficient memory for piDeviceIds. -//! @param [in] kacGPUName -//! @param [in, out] piDeviceIds. -//! @param [in] iIdCountMax. Return value of GetGpuCount() -int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, char* pcMessage); - -//! @brief GPU name of index iDeviceId. Allocate 128bytes for pcName before call. -void GetGpuName(int iDeviceId, char* pcName); - -#endif // GPUUTILS_HPP - diff --git a/Common/CUDA/improvedForwardProjections.cu.prehip b/Common/CUDA/improvedForwardProjections.cu.prehip deleted file mode 100644 index 0f32be72..00000000 --- a/Common/CUDA/improvedForwardProjections.cu.prehip +++ /dev/null @@ -1,1032 +0,0 @@ -/*------------------------------------------------------------------------- - * CUDA function for optimized proton CT radiographies - * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) - * and based on the method of Collins-Fekete (https://doi.org/10.1088/0031-9155/61/23/8232) - */ - -/*-------------------------------------------------------------------------- - This file is part of the TIGRE Toolbox - - Copyright (c) 2015, University of Bath and - CERN-European Organization for Nuclear Research - All rights reserved. - - License: Open Source under BSD. 
- See the full license at - https://github.com/CERN/TIGRE/blob/master/LICENSE - - Contact: tigre.toolbox@gmail.com - Codes: https://github.com/CERN/TIGRE/ - Coded by: Stefanie Kaser, Benjamin Kirchmayer ---------------------------------------------------------------------------*/ - -#include -#include "mex.h" -#include -#include "improvedForwardProjections.hpp" -#include -#include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ - } \ -} while (0) - - -__device__ int SolvePolynomial(float*x, float a, float b, float c){ - // Calculates real roots of a third-order polynomial function using Vieta's method and Cardano's method - // We obtain a polynomial of the form x³ + ax² + bx + c = 0 and reduce it to z³+pz+q = 0 - // Herefore, we have to make a substitution: x = z - a/3 - float p = b - a*a / 3.0; - float q = 2*a*a*a/27.0 - a*b / 3.0 + c; - float disc = q*q/4.0 + p*p*p/27.0; - if(disc > 0){ - float u = cbrt(-0.5*q + sqrt(disc)); - float v = cbrt(-0.5*q - sqrt(disc)); - x[0] = u + v - a/3.0; // don't forget to substitute back z --> x - return 1; - } - else if(disc == 0 && p == 0){ - x[0] = -a/3.0; // don't forget to substitute back z --> x - return 1; - } - else if(disc == 0 && p != 0){ - x[0] = 3.0*q/p - a/3.0; // don't forget to substitute back z --> x - x[1] = -3.0*q/(2.0*p) - a/3.0; - return 2; - } - else{ - x[0] = -sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p))) + pi/3.0) - a/3.0; // don't forget to substitute back z --> x - x[1] = sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p)))) - a/3.0; - x[2] = -sqrt(-4.0 * p / 3.0) * cos(1./3. 
* acos(-0.5*q*sqrt(-27./(p*p*p))) - pi/3.0) - a/3.0; - return 3; - } -} - -__device__ float cspline(float t, float a, float b, float c, float d){ - - return a*(t*t*t) + b*(t*t) + c*t +d; - -} - -__device__ void SimpleSort(float* arr, int size_arr){ - // Insertion sorting method - float curr_elem; - int j; - - for (int i=1; i=0 && curr_elem0){ - - float z_1 = -p/2.0 + sqrt(disc); - float z_2 = -p/2.0 - sqrt(disc); - float z_solve; - - if(in_or_out == 1){ - z_solve = min(z_1, z_2); - } - else { - z_solve = max(z_1, z_2); - } - - float x_solve = kx*z_solve + dx; - - float ky = direction[1]; - float dy = position[1] - ky*detOff; - float y_solve = ky*z_solve + dy; - - if(-h/2 <= y_solve && y_solve <= h/2){ - - HullIntercept[0] = x_solve; - HullIntercept[1] = y_solve; - HullIntercept[2] = z_solve; - - return 0; - } - else{ - float z1_h = (1.0/ky) * (0.5*h-dy); - float z2_h = (1.0/ky) * (-0.5*h-dy); - - if(in_or_out == 1){ - z_solve = min(z1_h, z2_h); - if(dy > 0){y_solve = -h*0.5;} - else{y_solve = h*0.5;} - x_solve = kx*z_solve + dx; - } - else { - z_solve = max(z1_h, z2_h); - if(dy < 0){y_solve = -h*0.5;} - else{y_solve = h*0.5;} - x_solve = kx*z_solve + dx; - } - - if(min(z_1, z_2) <= z_solve && z_solve <= max(z_1, z_2)){ - - HullIntercept[0] = x_solve; - HullIntercept[1] = y_solve; - HullIntercept[2] = z_solve; - - return 0; - } - - else{return 1;}} - } -else{return 1;} -} - - -__device__ int MinMax(float* solutions, float a, float b, float c){ - float p = 2*b/(3*a); - float q = c / (3*a); - float disc = 0.25*p*p - q; - if (disc > 0){ - solutions[0] = -0.5*p + sqrt(disc); - solutions[1] = -0.5*p - sqrt(disc); - return 0; - } - solutions[0] = -1; - solutions[1] = -1; - return 1; -} - - -__device__ int calcInterceptsLinear(float* LinInterceptsVec, float* start, float* stop, float* direction, float* pix, int maxIntercep, bool* protFlag){ - float boundary; - int counter = 0; - int nx, ny; - nx = int(abs(stop[0] - start[0])/pix[0]); - ny = int(abs(stop[1] - 
start[1])/pix[1]); - if(nx+ny>=maxIntercep){ - *protFlag = false; - return 1;} - - if (int(stop[0]/pix[0]) == int(start[0]/pix[0]) && int(stop[1]/pix[1]) == int(start[1]/pix[1])) { - *protFlag = true; - return 0; - } - - if (int(stop[0]/pix[0]) != int(start[0]/pix[0])) { - float k = direction[0]; - float d = start[0] - k*start[2]; - boundary = trunc( ((stop[0] > start[0]) ? stop[0]:start[0])/pix[0])*pix[0]; - - for (int ix=0; ix start[2] && intercept < stop[2]){ - LinInterceptsVec[ix] = intercept; - counter++; - if (counter >= maxIntercep){ - *protFlag = false; - return counter;} - } - } - } - - if (int(stop[1]/pix[1]) != int(start[1]/pix[1])) { - float k = direction[1]; - float d = start[1] - k*start[2]; - boundary = trunc( ((stop[1] > start[1]) ? stop[1]:start[1])/pix[1])*pix[1]; - for (int iy=nx; iy start[2] && intercept < stop[2]){ - LinInterceptsVec[iy] = intercept; - counter++; - if(counter >= maxIntercep){ - *protFlag = false; - return counter;} - } - } - } - int diff = maxIntercep - counter; - for(int j = 0; j 0){ - float cand = a[0] * solutions[0]*solutions[0]*solutions[0] + b[0] * solutions[0]*solutions[0] + c[0] * solutions[0] + d[0]; - if (cand > d[0] && cand > pos1[0]){ - (oneX > zeroX) ? oneX:zeroX=cand; - } - else if(cand < d[0] && cand < pos1[0]){ - (oneX < zeroX) ? oneX:zeroX=cand; - } - } - - if (solutions[1] < 1 && solutions[1] > 0){ - float cand = a[0] * solutions[1]*solutions[1]*solutions[1] + b[0] * solutions[1]*solutions[1] + c[0] * solutions[1] + d[0]; - if (cand > oneX && cand > zeroX){ - (oneX > zeroX) ? oneX:zeroX=cand; - } - else if(cand < oneX && cand < zeroX){ - (oneX < zeroX) ? oneX:zeroX=cand; - } - } - } - - - test = MinMax(solutions, a[1], b[1], c[1]); - if (test == 0){ - if (solutions[0] < 1 && solutions[0] > 0){ - float cand = a[1] * solutions[0]*solutions[0]*solutions[0] + b[1] * solutions[0]*solutions[0] + c[1] * solutions[0] + d[1]; - if (cand > d[1] && cand > pos1[1]){ - (oneY > zeroY) ? 
oneY:zeroY=cand; - } - else if(cand < d[1] && cand < pos1[1]){ - (oneY < zeroY) ? oneY:zeroY=cand; - } - } - - if (solutions[1] < 1 && solutions[1] > 0){ - float cand = a[1] * solutions[1]*solutions[1]*solutions[1] + b[1] * solutions[1]*solutions[1] + c[1] * solutions[1] + d[1]; - if (cand > oneY && cand > zeroY){ - (oneY > zeroY) ? oneY:zeroY=cand; - } - else if(cand < oneY && cand < zeroY){ - (oneY < zeroY) ? oneY:zeroY=cand; - } - } - } - - nx = int(abs(oneX - zeroX) / pixelSize[0]); - ny = int(abs(oneY - zeroY) / pixelSize[1]); - if (nx + ny == 0) { - *protFlag = true; - return 0; - } - - if ((nx + ny) <= maxIntercep){ - - if (int(oneX/pixelSize[0]) != int(zeroX/pixelSize[0])) { - boundary = trunc( ((oneX > zeroX) ? oneX:zeroX)/pixelSize[0])*pixelSize[0]; - for (int ix=0; ix 0. ){ - if (counter >=maxIntercep){break;} - InterceptsVec[counter] = IntercepX[kx]; - counter++; - } - }//kx - if (counter >=maxIntercep){break;} - } - } - - if ( int(oneY/pixelSize[1]) != int(zeroY/pixelSize[1])) { - boundary = trunc( ((oneY > zeroY) ? oneY:zeroY)/pixelSize[1])*pixelSize[1]; - for (int iy=0; iy 0.) ){ - if (counter >=maxIntercep){break;} - InterceptsVec[counter] = IntercepY[ky]; - counter++; - } - }//ky - if (counter >=maxIntercep){break;} - } - } - - if (counter >= maxIntercep){ // || counter == 0){ - *protFlag = false; - return counter; - }else{ - - - int diff = maxIntercep - counter; - for(int j = 0; j this is too slow! 7 s instead of 1.5 s - tInterceptsVec = new float[customsize]; - delete[] tInterceptsVec;*/ - /*float *ptr; ---> this is too slow! 
7.3s instead of 1.5 s - ptr = (float*) malloc(customsize * sizeof(float)); - free(ptr);*/ - - unsigned int protonIndex = blockIdx.x*blockDim.x + threadIdx.x; - float dimX, dimY, lk, lenX, lenY; - float lenZ = abs(*detectDistIn) + abs(*detectDistOut); - dimX = (float) *detectSizeX; - dimY = (float) *detectSizeY; - - //Dereference input parameters - int entries, dSizeX, dSizeY; - // float pix; - - entries = *numOfEntries; - dSizeX = *detectSizeX; - dSizeY = *detectSizeY; - // pix = *pixelSize; - - - if(hull[3] == 0){ - lenX = sqrt((devicePosOut[protonIndex] - devicePosIn[protonIndex]) * (devicePosOut[protonIndex] - devicePosIn[protonIndex]) \ - + lenZ*lenZ); - lenY = sqrt((devicePosOut[protonIndex + entries] - devicePosIn[protonIndex + entries]) * (devicePosOut[protonIndex + entries] - devicePosIn[protonIndex + entries]) \ - + lenZ*lenZ); - - float lambda0, lambda1, ref_wepl; - ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); - lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl) * (p_wepl[protonIndex]/ref_wepl); - lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl) * (p_wepl[protonIndex]/ref_wepl); - - float a[2], b[2], c[2], d[2], pos1[2]; - - //Allocate memory for all pointers - // Calculate optimized xdir_in - devicedirIn[protonIndex] = devicedirIn[protonIndex] \ - / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! - devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; - - // Calculate optimized ydir_in - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ - / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; - - // Calculate optimized xdir_out - devicedirOut[protonIndex] = devicedirOut[protonIndex] \ - / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! 
- devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; - - // Calculate optimized ydir_out - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ - / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; - - // Calculate spline parameters - a[0] = devicePosIn[protonIndex]*2. + devicedirIn[protonIndex] - 2.*devicePosOut[protonIndex] + devicedirOut[protonIndex]; - a[1] = devicePosIn[protonIndex + entries]*2. + devicedirIn[protonIndex + entries] - \ - 2.*devicePosOut[protonIndex + entries] + devicedirOut[protonIndex + entries]; - - b[0] = -3.*devicePosIn[protonIndex] -2.*devicedirIn[protonIndex] + 3.*devicePosOut[protonIndex] - devicedirOut[protonIndex]; - b[1] = -3.*devicePosIn[protonIndex + entries] -2.* devicedirIn[protonIndex + entries] \ - + 3.*devicePosOut[protonIndex + entries] - devicedirOut[protonIndex + entries]; - - c[0] = devicedirIn[protonIndex]; - c[1] = devicedirIn[protonIndex + entries]; - - d[0] = devicePosIn[protonIndex]; - d[1] = devicePosIn[protonIndex + entries]; - - pos1[0] = devicePosOut[protonIndex]; - pos1[1] = devicePosOut[protonIndex + entries]; - - /* --------------------------------------------------------------------------------- */ - /* ------------------------ Start without Hull (CS only) -------------------------- */ - /* --------------------------------------------------------------------------------- */ - int count; - bool status = false; - float InterceptsVec[vecSizeCS] = {0}; - - count = calcIntercepts(InterceptsVec, a, b, c, d, pos1, pix, &status, vecSizeCS); - - if (status) { - int indX, indY, linInd; - float tOld = 0.0; - if (count==0){ - indX = int(pos1[0]/pix[0]+dimX/2.); // REPLACE: pos1 by pos0 - indY = int(pos1[1]/pix[1]+dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - 
atomicAdd(&dhist1[linInd], p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], 1.0f); - } - - } - else{ - for(int i= 0; i<=count; i++){ - lk = (InterceptsVec[i]- tOld)*lenZ; - if(tOld == 0){ - indX = int(d[0]/pix[0] +dimX/2); - indY = int(d[1]/pix[1] +dimY/2); - linInd = indY + indX*(dSizeY); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - - }else if(i == count){ - lk = lenZ - InterceptsVec[i-1]*lenZ; - indX = int(pos1[0]/pix[0] +dimX/2); - indY = int(pos1[1]/pix[1] +dimY/2); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); - } - - }else{ - indX = int(cspline(InterceptsVec[i] - eps, a[0], b[0], c[0], d[0])/pix[0] +dimX/2); - indY = int(cspline(InterceptsVec[i] - eps, a[1], b[1], c[1], d[1])/pix[1] +dimY/2); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - } - - }//i - }//if - Intercepts - } - else{ - atomicAdd(reject, 1.0); - } -/* ------------------------ End no Hull calculation (CS only) -------------------------- */ - } - -else{ - // WEIGHTING FACTORS FOR CHANNELS I - float weight_air_in = 0.00479; - float weight_air_out = 0.00479; - - float HullIn[3], HullOut[3], initpos[3], exitpos[3]; - float initdir[2], exitdir[2]; - - initpos[0] = devicePosIn[protonIndex]; - initpos[1] = devicePosIn[protonIndex + entries]; - initpos[2] = *detectDistIn; - - exitpos[0] = devicePosOut[protonIndex]; - exitpos[1] = devicePosOut[protonIndex + entries]; - exitpos[2] = *detectDistOut; 
- - initdir[0] = devicedirIn[protonIndex]; - initdir[1] = devicedirIn[protonIndex + entries]; - - exitdir[0] = devicedirOut[protonIndex]; - exitdir[1] = devicedirOut[protonIndex + entries]; - - int check = hullEntryExit(HullIn, initpos, initdir, 1, hull, *detectDistIn); - - if(check == 0){ - check = hullEntryExit(HullOut, exitpos, exitdir, 0, hull, *detectDistOut); - } - - if(check == 0 && HullOut[2] > HullIn[2]){ - /* --------------------------------------------------------------------------------- */ - /* ------------------------ Start with Hull + SL outside -------------------------- */ - /* --------------------------------------------------------------------------------- */ - const int hullIntercep = int(vecSizeCS); - const int airIntercepIn = int(vecSizeIn); - const int airIntercepOut = int(vecSizeOut); - bool status1 = false; - bool status2 = false; - bool status3 = false; - - int countIn, countHull, countOut; - float InterceptsVecOut[airIntercepOut] = {0}; - float InterceptsVecIn[airIntercepIn] = {0}; - float InterceptsVecHull[hullIntercep] = {0}; - lenX = sqrt((HullOut[0] - HullIn[0])*(HullOut[0] - HullIn[0]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); - lenY = sqrt((HullOut[1] - HullIn[1])*(HullOut[1] - HullIn[1]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); - - countIn = calcInterceptsLinear(InterceptsVecIn, initpos, HullIn, initdir, pix, airIntercepIn, &status1); - countOut = calcInterceptsLinear(InterceptsVecOut, HullOut, exitpos, exitdir, pix, airIntercepOut, &status2); - - /* ------------ CUBIC SPLINE PREPARATIONS ---------------- */ - float lambda0, lambda1, ref_wepl; - ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); - lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); - lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); - - float a[2], b[2], c[2], d[2], pos1[2]; - - //Allocate memory for all pointers - // Calculate optimized xdir_in - devicedirIn[protonIndex] = 
devicedirIn[protonIndex] \ - / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! - devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; - - // Calculate optimized ydir_in - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ - / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; - - // Calculate optimized xdir_out - devicedirOut[protonIndex] = devicedirOut[protonIndex] \ - / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! - devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; - - // Calculate optimized ydir_out - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ - / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; - - // Calculate spline parameters - a[0] = HullIn[0]*2. + devicedirIn[protonIndex] - 2.*HullOut[0] + devicedirOut[protonIndex]; - a[1] = HullIn[1]*2. + devicedirIn[protonIndex + entries] - \ - 2.*HullOut[1] + devicedirOut[protonIndex + entries]; - - b[0] = -3.*HullIn[0] -2.*devicedirIn[protonIndex] + 3.*HullOut[0] - devicedirOut[protonIndex]; - b[1] = -3.*HullIn[1] -2.* devicedirIn[protonIndex + entries] \ - + 3.*HullOut[1] - devicedirOut[protonIndex + entries]; - - c[0] = devicedirIn[protonIndex]; - c[1] = devicedirIn[protonIndex + entries]; - - d[0] = HullIn[0]; - d[1] = HullIn[1]; - - pos1[0] = HullOut[0]; - pos1[1] = HullOut[1]; - - countHull = calcIntercepts(InterceptsVecHull, a, b, c, d, pos1, pix, &status3, hullIntercep); - /* -------------------- End CS Preparations! 
-------------- */ - - if(status1 && status2 && status3){ - float tOld = initpos[2]; - int indX, indY, linInd; - - // WEIGHTING FACTORS FOR CHANNELS II - float weight_water = 1; // p_wepl[protonIndex]/(len_b*weight_air_in); - - // ---------------------------------------- Start with SL from detector to hull - if (countIn == 0){ - indX = int(initpos[0]/pix[0] + dimX/2.); - indY = int(initpos[1]/pix[1] + dimY/2.); - lk = HullIn[2] - initpos[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - for(int i= 0; i<=countIn; i++){ - lk = InterceptsVecIn[i] - tOld; - if(i == 0){ - indX = int(initpos[0]/pix[0] + dimX/2.); - indY = int(initpos[1]/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecIn[i]; - } - } - else if(i == countIn){ - lk = HullIn[2] - InterceptsVecIn[i-1]; - indX = int(HullIn[0]/pix[0] + dimX/2.); - indY = int(HullIn[1]/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - indX = int(((initdir[0]*(InterceptsVecIn[i]-eps) + (initpos[0] - initdir[0] * initpos[2])))/pix[0] + dimX/2.); - indY = int(((initdir[1]*(InterceptsVecIn[i]-eps) + (initpos[1] - initdir[1] * initpos[2])))/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < 
lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecIn[i]; - } - } - } - } // end else - // --------------------------- CS within hull - - tOld = 0.0; - if (countHull==0){ - indX = int(HullIn[0]/pix[0] + dimX/2.); - indY = int(HullIn[1]/pix[1] + dimY/2.); - lk = HullOut[2] - HullIn[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - - } else{ - for(int i= 0; i<=countHull; i++){ - lk = (InterceptsVecHull[i] - tOld)*(HullOut[2] - HullIn[2]); - if(tOld == 0){ - indX = int(d[0]/pix[0] + dimX/2.); - indY = int(d[1]/pix[1] + dimY/2.); - linInd = indY + indX*(dSizeY); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVecHull[i]; - - }else if(i == countHull){ - lk = (HullOut[2] - HullIn[2]) - InterceptsVecHull[i-1]*(HullOut[2] - HullIn[2]); - indX = int(pos1[0]/pix[0] + dimX/2.); - indY = int(pos1[1]/pix[1] + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - - }else{ - indX = int(cspline(InterceptsVecHull[i] -eps, a[0], b[0], c[0], d[0])/pix[0] + dimX/2.); - indY = int(cspline(InterceptsVecHull[i] -eps, a[1], b[1], c[1], d[1])/pix[1] + dimY/2.); 
- - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVecHull[i]; - } - - }//i - } - - // --------------------------- SL from hull to detector - tOld = HullOut[2]; - if (countOut == 0){ - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - lk = exitpos[2] - HullOut[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - for(int i= 0; i<=countOut; i++){ - lk = abs(InterceptsVecOut[i] - tOld); - if(i == 0){ - indX = int(HullOut[0]/pix[0] + dimX/2.); - indY = int(HullOut[1]/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecOut[i]; - } - } - else if(i == countOut){ - lk = exitpos[2] - InterceptsVecOut[i-1]; - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - indX = int(((exitdir[0]*(InterceptsVecOut[i]-eps) + (HullOut[0] - exitdir[0] * HullOut[2])))/pix[0] + dimX/2.); - indY = 
int(((exitdir[1]*(InterceptsVecOut[i]-eps) + (HullOut[1] - exitdir[1] * HullOut[2])))/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecOut[i]; - } - } - } - } // end else - } - else{ - atomicAdd(reject, 1.0); - } - - /* --------------------------- End Hull + SL outside ------------------------------- */ - - } - - else{ - - /* --------------------------------------------------------------------------------- */ - /* ----------------------------- Start with SL only! ------------------------------ */ - /* --------------------------------------------------------------------------------- */ - int count; - bool status = false; - float InterceptsVec[vecSizeCS] = {0}; - - float initpos[3], exitpos[3]; - float mydir[2]; - initpos[0] = devicePosIn[protonIndex]; - initpos[1] = devicePosIn[protonIndex + entries]; - initpos[2] = *detectDistIn; - exitpos[0] = devicePosOut[protonIndex]; - exitpos[1] = devicePosOut[protonIndex + entries]; - exitpos[2] = *detectDistOut; - - mydir[0] = (exitpos[0] - initpos[0])/lenZ; - mydir[1] = (exitpos[1] - initpos[1])/lenZ; // dz = 1 - count = calcInterceptsLinear(InterceptsVec, initpos, exitpos, mydir, pix, vecSizeCS, &status); - - - if (status) { - int indX, indY, linInd; - float tOld = initpos[2]; - if (count==0){ - indX = int(initpos[0]/pix[0] + dimX/2.); - indY = int(initpos[1]/pix[1] + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*1.0f); - } - - } else{ - for(int i= 0; i<=count; i++){ - lk = InterceptsVec[i] - tOld; - if(tOld == initpos[2]){ - indX = int(initpos[0]/pix[0] 
+ dimX/2.); - indY = int(initpos[1]/pix[1] + dimY/2.); - linInd = indY + indX*(dSizeY); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - - }else if(i == count){ - lk = exitpos[2] - InterceptsVec[i-1]; - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - - }else{ - indX = int(((mydir[0]*(InterceptsVec[i]-eps) + (initpos[0] - mydir[0] * (initpos[2]))))/pix[0] + dimX/2.); - indY = int(((mydir[1]*(InterceptsVec[i]-eps) + (initpos[1] - mydir[1] * (initpos[2]))))/pix[1] + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - } - - } //i - }//if - Intercepts - } - else{ - // *reject += 1; - atomicAdd(reject, 1.0); - } - /* ------------------------------ End SL only! 
------ -------------------------- */ - } - } -} - -__global__ void sumHist(float* hist, float* histNorm){ - - unsigned int index = blockIdx.x*blockDim.x + threadIdx.x; - hist[index] = hist[index]/histNorm[index]; -} - -__host__ void ParticleProjections(float * outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, \ - float* p_wepl, int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, \ - float detectDistIn, float detectDistOut, float ein, float* ch_param){ - - /* - Detect Size = 400x400 - Prepare Input for GPU*/ - - const int sizeInputs = 2*numOfEntries*sizeof(float); - const int detectorMem = detectSizeX*detectSizeY*sizeof(float); - float reject = 0.0; - - float *dPosIn, *dPosOut, *ddirIn, *ddirOut, *dhist1, *dhist2, *d_wepl, *dHull; - int *dnumEntries, *ddetectorX, *ddetectorY; - float *dpixelSize, *dDetectDistIn, *dDetectDistOut, *dEin, *dReject; - - float *hist1, *hist2; - hist1 = new float[detectSizeX*detectSizeY]; - hist2 = new float[detectSizeX*detectSizeY]; - for(int i = 0; i>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ - dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); - cudaCheckErrors("Kernel fail!"); - - //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); - //dim3 block_sum(64,1,1); - //sumHist<<>>(dhist1, dhist2); - - //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); - cudaCheckErrors("Device to host transport failed!"); - - for(int j = 0; j -#include -#include -#ifndef improvedForwardProjections_H 
-#define improvedForwardProjections_H -#define pi 3.14159265359 -#define eps 1e-8 -#define vecSizeCS 220 -#define vecSizeOut 100 -#define vecSizeIn 10 -#define maxthreads 256 -//#include -//#include - -void ParticleProjections(float* outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, float* p_wepl, \ - int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, float detectDistIn, float detectDistOut, float ein, float* ch_param); - -__device__ int calcIntercepts(float* InterceptsVec ,float* a, float* b, \ - float* c, float* d, float* pos1, float pixelSize, bool* protFlag, int maxIntercep); - -__device__ int SolvePolynomial(float*x, float a, float b, float c); - -__device__ int MinMax(float* solutions, float a, float b, float c); - -__device__ void SimpleSort(float* arr, int size_arr); - -__global__ void ParticleKernel(float* dhist1, float* dhist2, float* devicePosIn, float* devicePosOut, float* devicedirIn, \ - float* devicedirOut ,float* p_wepl,int* numOfEntries, int* detectSizeX, int *detectSizeY, \ - float* pixelSize, float *detectDistIn, float *detectDistOut, float *ein, float *hull, float *reject); - -__device__ int hullEntryExit(float* HullIntercept, float* position, float* direction, int in_or_out, float *hullparams, float detOff); - -__device__ int calcInterceptsLinear(float* LinInterceptsVec, float* start, float* stop, float* direction, float pix, int maxIntercep, \ - bool* protFlag); - -void ParticleProjectionsCone(float* outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, float* p_wepl, \ - int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, float detectDistIn, float detectDistOut, float sourcePos, \ - float ein, float* ch_param); - -__device__ int calcInterceptsCone(float* InterceptsVec ,float* a, float* b, \ - float* c, float* d, float* pos1, float pixelSize, bool* protFlag, int maxIntercep, \ - float sourcePos, float din, float dout); - -__device__ int SolvePolynomialCone(float*x, 
float a, float b, float c); - -__device__ void SimpleSortCone(float* arr, int size_arr); - -__device__ int MinMaxCone(float* solutions, float a, float b, float c); - -__global__ void ParticleKernelCone(float* dhist1, float* dhist2, float* devicePosIn, float* devicePosOut, float* devicedirIn, \ - float* devicedirOut ,float* p_wepl,int* numOfEntries, int* detectSizeX, int *detectSizeY, \ - float* pixelSize, float *detectDistIn, float *detectDistOut, float *ein, float *hull, float *reject, \ - float* sourceDist); - -__device__ int hullEntryExitCone(float* HullIntercept, float* position, float* direction, int in_or_out, float *hullparams, float detOff); - -__device__ int calcInterceptsLinearCone(float* LinInterceptsVec, float* start, float* stop, float* direction, float pix, int maxIntercep, \ - bool* protFlag, float sourcePos); - -#endif - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/Common/CUDA/improvedForwardProjections_cone.cu.prehip b/Common/CUDA/improvedForwardProjections_cone.cu.prehip deleted file mode 100644 index 7a4f6b46..00000000 --- a/Common/CUDA/improvedForwardProjections_cone.cu.prehip +++ /dev/null @@ -1,1230 +0,0 @@ -/*------------------------------------------------------------------------- - * CUDA function for optimized proton CT radiographies - * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) - * and based on the method of Collins-Fekete (https://doi.org/10.1088/0031-9155/61/23/8232) - */ - -/*-------------------------------------------------------------------------- - This file is part of the TIGRE Toolbox - - Copyright (c) 2015, University of Bath and - 
CERN-European Organization for Nuclear Research - All rights reserved. - - License: Open Source under BSD. - See the full license at - https://github.com/CERN/TIGRE/blob/master/LICENSE - - Contact: tigre.toolbox@gmail.com - Codes: https://github.com/CERN/TIGRE/ - Coded by: Stefanie Kaser, Benjamin Kirchmayer ---------------------------------------------------------------------------*/ - - -#include -#include "mex.h" -#include -#include "improvedForwardProjections.hpp" -// #include -// #include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ - } \ -} while (0) - - -__device__ int SolvePolynomialCone(float*x, float a, float b, float c){ - // Calculates real roots of a third-order polynomial function using Vieta's method and Cardano's method - // We obtain a polynomial of the form x³ + ax² + bx + c = 0 and reduce it to z³+pz+q = 0 - // Herefore, we have to make a substitution: x = z - a/3 - float p = b - a*a / 3.0; - float q = 2*a*a*a/27.0 - a*b / 3.0 + c; - float disc = q*q/4.0 + p*p*p/27.0; - if(disc > 0){ - float u = cbrt(-0.5*q + sqrt(disc)); - float v = cbrt(-0.5*q - sqrt(disc)); - x[0] = u + v - a/3.0; // don't forget to substitute back z --> x - return 1; - } - else if(disc == 0 && p == 0){ - x[0] = -a/3.0; // don't forget to substitute back z --> x - return 1; - } - else if(disc == 0 && p != 0){ - x[0] = 3.0*q/p - a/3.0; // don't forget to substitute back z --> x - x[1] = -3.0*q/(2.0*p) - a/3.0; - return 2; - } - else{ - x[0] = -sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p))) + pi/3.0) - a/3.0; // don't forget to substitute back z --> x - x[1] = sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p)))) - a/3.0; - x[2] = -sqrt(-4.0 * p / 3.0) * cos(1./3. 
* acos(-0.5*q*sqrt(-27./(p*p*p))) - pi/3.0) - a/3.0; - return 3; - } -} - -__device__ float csplineCone(float t, float a, float b, float c, float d){ - - return a*(t*t*t) + b*(t*t) + c*t +d; - -} - -__device__ void SimpleSortCone(float* arr, int size_arr){ - // Insertion sorting method - float curr_elem; - int j; - - for (int i=1; i=0 && curr_elem0){ - - float z_1 = -p/2.0 + sqrt(disc); - float z_2 = -p/2.0 - sqrt(disc); - float z_solve; - - if(in_or_out == 1){ - z_solve = min(z_1, z_2); - } - else { - z_solve = max(z_1, z_2); - } - - float x_solve = kx*z_solve + dx; - - float ky = direction[1]; - float dy = position[1] - ky*detOff; - float y_solve = ky*z_solve + dy; - - if(-h/2 <= y_solve && y_solve <= h/2){ - - HullIntercept[0] = x_solve; - HullIntercept[1] = y_solve; - HullIntercept[2] = z_solve; - - return 0; - } - else{ - float z1_h = (1.0/ky) * (0.5*h-dy); - float z2_h = (1.0/ky) * (-0.5*h-dy); - - if(in_or_out == 1){ - z_solve = min(z1_h, z2_h); - if(dy > 0){y_solve = -h*0.5;} - else{y_solve = h*0.5;} - x_solve = kx*z_solve + dx; - } - else { - z_solve = max(z1_h, z2_h); - if(dy < 0){y_solve = -h*0.5;} - else{y_solve = h*0.5;} - x_solve = kx*z_solve + dx; - } - - if(min(z_1, z_2) <= z_solve && z_solve <= max(z_1, z_2)){ - - HullIntercept[0] = x_solve; - HullIntercept[1] = y_solve; - HullIntercept[2] = z_solve; - - return 0; - } - - else{return 1;}} - } -else{return 1;} -} - - - -__device__ int calcInterceptsLinearCone(float* LinInterceptsVec, float* start, float* stop, float* direction, float* pix, int maxIntercep, bool* protFlag, - float sourcePos){ - float tan_alpha, d_channel; - int counter = 0; - int nx, ny; - float sdd = abs(stop[2] - sourcePos); // distance source detector - float sidd = abs(start[2] - sourcePos); // distance sourcce inital detector - int select; - - float pix_start_x = sidd * (pix[0]/sdd); - float pix_start_y = sidd * (pix[1]/sdd); - - nx = int(abs(stop[0]/pix[0] - start[0]/pix_start_x)); - ny = int(abs(stop[1]/pix[1] - 
start[1]/pix_start_y)); - if(nx+ny>=maxIntercep){ - *protFlag = false; - return 1;} - - if (int(stop[0]/pix[0]) == int(start[0]/pix_start_x) && int(stop[1]/pix[1]) == int(start[1]/pix_start_y)) { - *protFlag = true; - return 0; - } - - if (int(stop[0]/pix[0]) != int(start[0]/pix_start_x)) { - float k = direction[0]; - float d = start[0] - k*start[2]; - if(stop[0]/pix[0] > start[0]/pix_start_x){ - tan_alpha = (trunc(stop[0]/pix[0])*pix[0])/sdd; - d_channel = trunc(stop[0]/pix[0])*pix[0] - tan_alpha * stop[2]; - select = 0; - } - else{ - tan_alpha = (trunc(start[0]/pix_start_x)*pix_start_x)/sidd; - d_channel = trunc(start[0]/pix_start_x)*pix_start_x - tan_alpha * start[2]; - select = 1; - } - - for (int ix=0; ix start[2] && intercept < stop[2]){ - LinInterceptsVec[ix] = intercept; - counter++; - if (counter >= maxIntercep){ - *protFlag = false; - return counter;} - } - } - } - - if (int(stop[1]/pix[1]) != int(start[1]/pix_start_y)) { - float k = direction[1]; - float d = start[1] - k*start[2]; - if(stop[1]/pix[1] > start[1]/pix_start_y){ - tan_alpha = (trunc(stop[1]/pix[1])*pix[1])/sdd; - d_channel = trunc(stop[1]/pix[1])*pix[1] - tan_alpha * stop[2]; - select = 0; - } - else{ - tan_alpha = (trunc(start[1]/pix_start_y)*pix_start_y)/sidd; - d_channel = trunc(start[1]/pix_start_y)*pix_start_y - tan_alpha * start[2]; - select = 1; - } - - for (int iy=nx; iy start[2] && intercept < stop[2]){ - LinInterceptsVec[iy] = intercept; - counter++; - if (counter >= maxIntercep){ - *protFlag = false; - return counter;} - } - } - } - - int diff = maxIntercep - counter; - for(int j = 0; j 0){ - solutions[0] = -0.5*p + sqrt(disc); - solutions[1] = -0.5*p - sqrt(disc); - return 0; - } - solutions[0] = -1; - solutions[1] = -1; - return 1; -} - - - -__device__ int calcInterceptsCone(float* InterceptsVec ,float* a, float* b, \ - float* c, float* d, float* pos1, float* pixelSize, bool* protFlag, int maxIntercep, \ - float sourcePos, float din, float dout){ - - /*Calculates channel 
Intercepts and the lengths the proton (ion) has spent in the - corresponding channel. - Returns 1 if proton is accepted and 0 if it is rejected due to too many Intercepts - */ - float oneX, oneY, zeroX, zeroY, pix_oneX, pix_oneY, pix_zeroX, pix_zeroY; - float tan_alpha, d_channel; - float sdd_init = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! - float sidd_init = abs(din - sourcePos)/abs(dout-din); - float sdd_x = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! - float sidd_x = abs(din - sourcePos)/abs(dout-din); - float sdd_y = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! - float sidd_y = abs(din - sourcePos)/abs(dout-din); - int select; - float pix_start_x = sidd_init * (pixelSize[0]/sdd_init); - float pix_start_y = sidd_init * (pixelSize[1]/sdd_init); - zeroX = d[0]; - oneX = pos1[0]; - zeroY = d[1]; - oneY = pos1[1]; - pix_zeroX = pix_start_x; - pix_zeroY = pix_start_y; - pix_oneX = pixelSize[0]; - pix_oneY = pixelSize[1]; - - - int status, nx, ny; - float IntercepX[3]; - float IntercepY[3]; - float solutions[2]; - // counter has to be implemented despite the initial discrimination because one can not state beforehand if - // the cubic spline has more than one Intercept with the channel boundary - int counter=0; - - int test = MinMaxCone(solutions, a[0], b[0], c[0]); - if (test == 0){ - if (solutions[0] < 1 && solutions[0] > 0){ - float cand = a[0] * solutions[0]*solutions[0]*solutions[0] + b[0] * solutions[0]*solutions[0] + c[0] * solutions[0] + d[0]; - float pix_cand = (sidd_init + solutions[0]) * (pixelSize[0]/sdd_init); - if (cand/pix_cand > d[0]/pix_start_x && cand/pix_cand > pos1[0]/pixelSize[0]){ - (oneX/pix_oneX > zeroX/pix_zeroX) ? oneX:zeroX=cand; - (oneX/pix_oneX > zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; - (oneX/pix_oneX > zeroX/pix_zeroX) ? 
sdd_x:sidd_x = solutions[0] - sourcePos/(dout-din); - } - else if(cand/pix_cand < d[0]/pix_start_x && cand/pix_cand < pos1[0]/pixelSize[0]){ - (oneX/pix_oneX < zeroX/pix_zeroX) ? oneX:zeroX=cand; - (oneX/pix_oneX < zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; - (oneX/pix_oneX < zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[0] - sourcePos/(dout-din); - } - } - - if (solutions[1] < 1 && solutions[1] > 0){ - float cand = a[0] * solutions[1]*solutions[1]*solutions[1] + b[0] * solutions[1]*solutions[1] + c[0] * solutions[1] + d[0]; - float pix_cand = (sidd_init + solutions[1]) * (pixelSize[0]/sdd_init); - if (cand/pix_cand > oneX/pix_oneX && cand/pix_cand > zeroX/pix_zeroX){ - (oneX/pix_oneX > zeroX/pix_zeroX) ? oneX:zeroX=cand; - (oneX/pix_oneX > zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; - (oneX/pix_oneX > zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[1] - sourcePos/(dout-din); - } - else if(cand/pix_cand < oneX/pix_oneX && cand/pix_cand < zeroX/pix_zeroX){ - (oneX/pix_oneX < zeroX/pix_zeroX) ? oneX:zeroX=cand; - (oneX/pix_oneX < zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; - (oneX/pix_oneX < zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[1] - sourcePos/(dout-din); - } - } - } - - test = MinMaxCone(solutions, a[1], b[1], c[1]); - if (test == 0){ - if (solutions[0] < 1 && solutions[0] > 0){ - float cand = a[1] * solutions[0]*solutions[0]*solutions[0] + b[1] * solutions[0]*solutions[0] + c[1] * solutions[0] + d[1]; - float pix_cand = (sidd_init + solutions[0]) * (pixelSize[1]/sdd_init); - if (cand/pix_cand > d[1]/pix_start_y && cand/pix_cand > pos1[1]/pixelSize[1]){ - (oneY/pix_oneY > zeroY/pix_zeroY) ? oneY:zeroY=cand; - (oneY/pix_oneY > zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; - (oneY/pix_oneY > zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[0] - sourcePos/(dout-din); - } - else if(cand/pix_cand < d[1]/pix_start_y && cand/pix_cand < pos1[1]/pixelSize[1]){ - (oneY/pix_oneY < zeroY/pix_zeroY) ? oneY:zeroY=cand; - (oneY/pix_oneY < zeroY/pix_zeroY) ? 
pix_oneY:pix_zeroY = pix_cand; - (oneY/pix_oneY < zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[0] - sourcePos/(dout-din); - } - } - - if (solutions[1] < 1 && solutions[1] > 0){ - float cand = a[1] * solutions[1]*solutions[1]*solutions[1] + b[1] * solutions[1]*solutions[1] + c[1] * solutions[1] + d[1]; - float pix_cand = (sidd_init + solutions[1]) * (pixelSize[1]/sdd_init); - if (cand/pix_cand > oneY/pix_oneY && cand/pix_cand > zeroY/pix_zeroY){ - (oneY/pix_oneY > zeroY/pix_zeroY) ? oneY:zeroY=cand; - (oneY/pix_oneY > zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; - (oneY/pix_oneY > zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[1] - sourcePos/(dout-din); - } - else if(cand/pix_cand < oneY/pix_oneY && cand/pix_cand < zeroY/pix_zeroY){ - (oneY/pix_oneY < zeroY/pix_zeroY) ? oneY:zeroY=cand; - (oneY/pix_oneY < zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; - (oneY/pix_oneY < zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[1] - sourcePos/(dout-din); - } - } - } - //Check how many Intercepts will occur approximately - nx = int(abs(oneX/pix_oneX - zeroX/pix_zeroX)); - ny = int(abs(oneY/pix_oneY - zeroY/pix_zeroY)); - - if (nx + ny == 0) { - *protFlag = true; - return 0; - } - if ((nx + ny) <= maxIntercep){ - - if (int(oneX/pix_oneX) != int(zeroX/pix_zeroX)) { - if(oneX/pix_oneX > zeroX/pix_zeroX){ - tan_alpha = (trunc(oneX/pix_oneX)*pix_oneX)/sdd_x; - d_channel = trunc(oneX/pix_oneX)*pix_oneX * (sidd_init/sdd_x); - select = 0; - } - else{ - tan_alpha = (trunc(zeroX/pix_zeroX)*pix_zeroX)/sidd_x; - d_channel = trunc(zeroX/pix_zeroX)*pix_zeroX * (sidd_init/sidd_x); - select = 1; - } - for (int ix=0; ix 0. 
){ - if (counter >=maxIntercep){break;} - InterceptsVec[counter] = IntercepX[kx]; - counter++; - } - }//kx - if (counter >=maxIntercep){break;} - } - } - - if ( int(oneY/pix_oneY) != int(zeroY/pix_zeroY)) { - if(oneY/pix_oneY > zeroY/pix_zeroY){ - tan_alpha = (trunc(oneY/pix_oneY)*pix_oneY)/sdd_y; - d_channel = trunc(oneY/pix_oneY)*pix_oneY * (sidd_init/sdd_y); - select = 0; - } - else{ - tan_alpha = (trunc(zeroY/pix_zeroY)*pix_zeroY)/sidd_y; - d_channel = trunc(zeroY/pix_zeroY)*pix_zeroY * (sidd_init/sidd_y); - select = 1; - } - for (int iy=0; iy 0. ){ - if (counter >=maxIntercep){break;} - InterceptsVec[counter] = IntercepY[ky]; - counter++; - } - }//kx - if (counter >=maxIntercep){break;} - } - } - - if (counter >= maxIntercep){ // || counter == 0){ - *protFlag = false; - return counter; - } - - else{ - int diff = maxIntercep - counter; - for(int j = 0; j HullIn[2]){ - /* --------------------------------------------------------------------------------- */ - /* ------------------------ Start with Hull + SL outside -------------------------- */ - /* --------------------------------------------------------------------------------- */ - const int hullIntercep = int(vecSizeCS); - const int airIntercepIn = int(vecSizeIn); - const int airIntercepOut = int(vecSizeOut); - bool status1 = false; - bool status2 = false; - bool status3 = false; - - int countIn, countHull, countOut; - float InterceptsVecOut[airIntercepOut] = {0}; - float InterceptsVecIn[airIntercepIn] = {0}; - float InterceptsVecHull[hullIntercep] = {0}; - lenX = sqrt((HullOut[0] - HullIn[0])*(HullOut[0] - HullIn[0]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); - lenY = sqrt((HullOut[1] - HullIn[1])*(HullOut[1] - HullIn[1]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); - - float newpix[2]; - newpix[0] = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - newpix[1] = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - countIn = 
calcInterceptsLinearCone(InterceptsVecIn, initpos, HullIn, initdir, newpix, airIntercepIn, &status1, *sourceDist); - countOut = calcInterceptsLinearCone(InterceptsVecOut, HullOut, exitpos, exitdir, pix, airIntercepOut, &status2, *sourceDist); - - /* ------------ CUBIC SPLINE PREPARATIONS ---------------- */ - float lambda0, lambda1, ref_wepl; - ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); - lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); - lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); - - float a[2], b[2], c[2], d[2], pos1[2]; - - //Allocate memory for all pointers - // Calculate optimized xdir_in - devicedirIn[protonIndex] = devicedirIn[protonIndex] \ - / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! - devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; - - // Calculate optimized ydir_in - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ - / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; - - // Calculate optimized xdir_out - devicedirOut[protonIndex] = devicedirOut[protonIndex] \ - / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! - devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; - - // Calculate optimized ydir_out - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ - / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; - - // Calculate spline parameters - a[0] = HullIn[0]*2. + devicedirIn[protonIndex] - 2.*HullOut[0] + devicedirOut[protonIndex]; - a[1] = HullIn[1]*2. 
+ devicedirIn[protonIndex + entries] - \ - 2.*HullOut[1] + devicedirOut[protonIndex + entries]; - - b[0] = -3.*HullIn[0] -2.*devicedirIn[protonIndex] + 3.*HullOut[0] - devicedirOut[protonIndex]; - b[1] = -3.*HullIn[1] -2.* devicedirIn[protonIndex + entries] \ - + 3.*HullOut[1] - devicedirOut[protonIndex + entries]; - - c[0] = devicedirIn[protonIndex]; - c[1] = devicedirIn[protonIndex + entries]; - - d[0] = HullIn[0]; - d[1] = HullIn[1]; - - pos1[0] = HullOut[0]; - pos1[1] = HullOut[1]; - - // float newpix[2]; - newpix[0] = abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - newpix[1] = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - countHull = calcInterceptsCone(InterceptsVecHull, a, b, c, d, pos1, newpix, &status3, hullIntercep, *sourceDist, HullIn[2], HullOut[2]); - /* -------------------- End CS Preparations! -------------- */ - - if(status1 && status2 && status3){ - float tOld = initpos[2]; - int indX, indY, linInd; - // WEIGHTING FACTORS FOR CHANNELS II - float weight_water = 1; - - // ---------------------------------------- Start with SL from detector to hull - float pix_start_x = abs(initpos[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - float pix_start_y = abs(initpos[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - float pix_end_x = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - float pix_end_y = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - if (countIn == 0){ - indX = int(initpos[0]/pix_start_x + dimX/2.); - indY = int(initpos[1]/pix_start_y + dimY/2.); - lk = HullIn[2] - initpos[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - for(int i= 0; i<=countIn; i++){ - lk = InterceptsVecIn[i] - tOld; - if(i 
== 0){ - indX = int(initpos[0]/pix_start_x + dimX/2.); - indY = int(initpos[1]/pix_start_y + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecIn[i]; - } - } - else if(i == countIn){ - lk = HullIn[2] - InterceptsVecIn[i-1]; - indX = int(HullIn[0]/pix_end_x + dimX/2.); - indY = int(HullIn[1]/pix_end_y + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - float curr_pix_x = abs((InterceptsVecIn[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - float curr_pix_y = abs((InterceptsVecIn[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - indX = int(((initdir[0]*(InterceptsVecIn[i]-eps) + (initpos[0] - initdir[0] * initpos[2] )))/curr_pix_x + dimX/2.); - indY = int(((initdir[1]*(InterceptsVecIn[i]-eps) + (initpos[1] - initdir[1] * initpos[2] )))/curr_pix_y + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecIn[i]; - } - } - } - } // end else - - // ---cone beam------------------------ CS within hull - - tOld = 0.0; - pix_start_x = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - pix_start_y = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - pix_end_x = 
abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - pix_end_y = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - if (countHull==0){ - indX = int(HullIn[0]/pix_start_x + dimX/2.); - indY = int(HullIn[1]/pix_start_y + dimY/2.); - lk = HullOut[2] - HullIn[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - - } else{ - for(int i= 0; i<=countHull; i++){ - lk = (InterceptsVecHull[i] - tOld)*(HullOut[2] - HullIn[2]); - if(tOld == 0){ - indX = int(d[0]/pix_start_x + dimX/2.); - indY = int(d[1]/pix_start_y + dimY/2.); - linInd = indY + indX*(dSizeY); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVecHull[i]; - - }else if(i == countHull){ - lk = (HullOut[2] - HullIn[2]) - InterceptsVecHull[i-1]*(HullOut[2] - HullIn[2]); - indX = int(pos1[0]/pix_end_x + dimX/2.); - indY = int(pos1[1]/pix_end_y + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - - }else{ - float curr_len = (InterceptsVecHull[i]-eps)*(HullOut[2]-HullIn[2]) + (HullIn[2] - *sourceDist); // abs(((InterceptsVecHull[i]-eps)*lenZ + *detectDistIn) - *sourceDist) - float curr_pix_x = curr_len * (pix[0]/abs(exitpos[2] - *sourceDist)); - float curr_pix_y = curr_len * (pix[1]/abs(exitpos[2] - *sourceDist)); - indX = 
int(csplineCone(InterceptsVecHull[i] - eps, a[0], b[0], c[0], d[0])/curr_pix_x + dimX/2.); - indY = int(csplineCone(InterceptsVecHull[i] - eps, a[1], b[1], c[1], d[1])/curr_pix_y + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVecHull[i]; - } - - }//i - } - - // --------------------------- SL from hull to detector - tOld = HullOut[2]; - pix_start_x = abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - pix_start_y = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - if (countOut == 0){ - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - lk = exitpos[2] - HullOut[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - for(int i= 0; i<=countOut; i++){ - lk = abs(InterceptsVecOut[i] - tOld); - if(i == 0){ - indX = int(HullOut[0]/pix_start_x + dimX/2.); - indY = int(HullOut[1]/pix_start_y + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecOut[i]; - } - } - else if(i == countOut){ - lk = exitpos[2] - InterceptsVecOut[i-1]; - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < 
(exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - float curr_pix_x = abs((InterceptsVecOut[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - float curr_pix_y = abs((InterceptsVecOut[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - indX = int(((exitdir[0]*(InterceptsVecOut[i]-eps) + (HullOut[0] - exitdir[0] * HullOut[2])))/curr_pix_x + dimX/2.); - indY = int(((exitdir[1]*(InterceptsVecOut[i]-eps) + (HullOut[1] - exitdir[1] * HullOut[2])))/curr_pix_y + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecOut[i]; - } - } - } - } // end else - - } - else{ - atomicAdd(reject, 1.0); - } - - /* --------------------------- End Hull + SL outside ------------------------------- */ - - } - - else{ - - /* --------------------------------------------------------------------------------- */ - /* ----------------------------- Start with SL only! 
------------------------------ */ - /* --------------------------------------------------------------------------------- */ - int count; - bool status = false; - float InterceptsVec[vecSizeCS] = {0}; - //float InterceptsLengths[vecSizeCS+1] = {0}; - - float initpos[3], exitpos[3]; - float mydir[2]; - initpos[0] = devicePosIn[protonIndex]; - initpos[1] = devicePosIn[protonIndex + entries]; - initpos[2] = *detectDistIn; - exitpos[0] = devicePosOut[protonIndex]; - exitpos[1] = devicePosOut[protonIndex + entries]; - exitpos[2] = *detectDistOut; - - mydir[0] = (exitpos[0] - initpos[0])/lenZ; - mydir[1] = (exitpos[1] - initpos[1])/lenZ; // dz = 1 - count = calcInterceptsLinearCone(InterceptsVec, initpos, exitpos, mydir, pix, vecSizeCS, &status, *sourceDist); - - // for cone beam we need this - /*float lenZ_custom = 0.0; - float head[3], tail[3]; - for (int i=0; i<=count; i++){ - if (i == 0){ - head[0] = mydir[0]*InterceptsVec[i] + 0.5*(initpos[0] + exitpos[0]); - head[1] = mydir[1]*InterceptsVec[i] + 0.5*(initpos[1] + exitpos[1]); - head[2] = InterceptsVec[i]; - InterceptsLengths[i] = sqrt(powf(head[0] - initpos[0], 2.0) + powf(head[1] - initpos[1], 2.0) + powf(head[2] - initpos[2], 2.0)); - tail[0] = head[0]; - tail[1] = head[1]; - tail[2] = head[2]; - lenZ_custom += InterceptsLengths[i]; - } - else if (i == count){ - InterceptsLengths[i] = sqrt(powf(exitpos[0] - tail[0], 2.0) + powf(exitpos[1] - tail[1], 2.0) + powf(exitpos[2] - tail[2], 2.0)); - lenZ_custom += InterceptsLengths[i]; - } - else{ - head[0] = mydir[0]*InterceptsVec[i] + 0.5*(initpos[0] + exitpos[0]); - head[1] = mydir[1]*InterceptsVec[i] + 0.5*(initpos[1] + exitpos[1]); - head[2] = InterceptsVec[i]; - InterceptsLengths[i] = sqrt(powf(head[0] - tail[0], 2.0) + powf(head[1] - tail[1], 2.0) + powf(head[2] - tail[2], 2.0)); - tail[0] = head[0]; - tail[1] = head[1]; - tail[2] = head[2]; - lenZ_custom += InterceptsLengths[i]; - } - }*/ - - float pix_start_x = abs(initpos[2] - *sourceDist) * 
(pix[0]/abs(exitpos[2] - *sourceDist)); - float pix_start_y = abs(initpos[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - - if (status) { - int indX, indY, linInd; - // exitpos[0] / (exitpos[2] - *sourceDir); - float tOld = initpos[2]; - if (count==0){ - indX = int(initpos[0]/pix_start_x + dimX/2.); - indY = int(initpos[1]/pix_start_y + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*1.0f); - } - - } else{ - for(int i= 0; i<=count; i++){ - lk = InterceptsVec[i] - tOld; - // lk = InterceptsLengths[i]; - if(i == 0){ - indX = int(initpos[0]/pix_start_x + dimX/2.); - indY = int(initpos[1]/pix_start_y + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - - }else if(i == count){ - lk = exitpos[2] - InterceptsVec[i-1]; - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - - }else{ - float curr_pix_x = abs((InterceptsVec[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - float curr_pix_y = abs((InterceptsVec[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - indX = int(((mydir[0]*(InterceptsVec[i]-eps) + (initpos[0] - mydir[0] * (initpos[2]))))/curr_pix_x+dimX/2.); - indY = int(((mydir[1]*(InterceptsVec[i]-eps) + (initpos[1] - mydir[1] * 
(initpos[2]))))/curr_pix_y+dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - } - - } //i - }//if - Intercepts - } - else{ - // *reject += 1; - atomicAdd(reject, 1.0); - } - /* ------------------------------ End SL only! ------ -------------------------- */ - } - } -} - -__global__ void sumHistCone(float* hist, float* histNorm){ - - unsigned int index = blockIdx.x*blockDim.x + threadIdx.x; - hist[index] = hist[index]/histNorm[index]; -} - -__host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, \ - float* p_wepl, int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, \ - float detectDistIn, float detectDistOut, float sourcePos, \ - float ein, float* ch_param){ - - /* - Detect Size = 400x400 - Prepare Input for GPU*/ - - const int sizeInputs = 2*numOfEntries*sizeof(float); - const int detectorMem = detectSizeX*detectSizeY*sizeof(float); - float reject = 0.0; - - float *dPosIn, *dPosOut, *ddirIn, *ddirOut, *dhist1, *dhist2, *d_wepl, *dHull; - int *dnumEntries, *ddetectorX, *ddetectorY; - float *dpixelSize, *dDetectDistIn, *dDetectDistOut, *dSourceDist, *dEin, *dReject; - - float *hist1, *hist2; - hist1 = new float[detectSizeX*detectSizeY]; - hist2 = new float[detectSizeX*detectSizeY]; - for(int i = 0; i>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ - dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject, dSourceDist); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); - cudaCheckErrors("Kernel fail!"); - - //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); - //dim3 block_sum(64,1,1); - //sumHist<<>>(dhist1, 
dhist2); - - //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); - cudaCheckErrors("Device to host transport failed!"); - - for(int j = 0; j -#include - -float maxDistanceCubeXY(Geometry geo, float alpha,int i){ - /////////// - // Compute initial "t" so we access safely as less as out of bounds as possible. - ////////// - - float maxCubX,maxCubY; - // Forgetting Z, compute max distance: diagonal+offset - maxCubX=(geo.sVoxelX/2+ abs(geo.offOrigX[i]))/geo.dVoxelX; - maxCubY=(geo.sVoxelY/2+ abs(geo.offOrigY[i]))/geo.dVoxelY; - - return geo.DSO[i]/geo.dVoxelX-sqrt(maxCubX*maxCubX+maxCubY*maxCubY); -} - -void rollPitchYaw(Geometry geo,int i, Point3D* point){ - Point3D auxPoint; - auxPoint.x=point->x; - auxPoint.y=point->y; - auxPoint.z=point->z; - - point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->z=-sin(geo.dPitch[i])*auxPoint.x - +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y - +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; -} \ No newline at end of file diff --git a/Common/CUDA/projection.hpp.prehip b/Common/CUDA/projection.hpp.prehip deleted file mode 100644 index 54597d92..00000000 --- 
a/Common/CUDA/projection.hpp.prehip +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef PROJECTION_HPP -#define PROJECTION_HPP - -#include "types_TIGRE.hpp" - -float maxDistanceCubeXY(Geometry geo, float alpha,int i); -void rollPitchYaw(Geometry geo,int i, Point3D* point); - -#endif diff --git a/Common/CUDA/ray_interpolated_projection.cu.prehip b/Common/CUDA/ray_interpolated_projection.cu.prehip deleted file mode 100644 index e71c5b59..00000000 --- a/Common/CUDA/ray_interpolated_projection.cu.prehip +++ /dev/null @@ -1,843 +0,0 @@ -/*------------------------------------------------------------------------- - * - * CUDA functions for texture-memory interpolation based projection - * - * This file has the necessary fucntiosn to perform X-ray CBCT projection - * operation given a geaometry, angles and image. It uses the 3D texture - * memory linear interpolation to uniformily sample a path to integrate the - * X-rays. - * - * CODE by Ander Biguri - * Sepideh Hatamikia (arbitrary rotation) - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - - - - - - -#include -#include -#include -#include "ray_interpolated_projection.hpp" -#include "TIGRE_common.hpp" -#include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",cudaGetErrorString(__err));\ - } \ -} while (0) - - - -#define MAXTREADS 1024 -#define PROJ_PER_BLOCK 9 -#define PIXEL_SIZE_BLOCK 9 - /*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * --->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ - void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, 
cudaTextureObject_t *texImage,bool allocate); -__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device -__constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device - - -__global__ void vecAddInPlaceInterp(float *a, float *b, unsigned long n) -{ - int idx = blockIdx.x*blockDim.x+threadIdx.x; - // Make sure we do not go out of bounds - if (idx < n) - a[idx] = a[idx] + b[idx]; -} - - -template - __global__ void kernelPixelDetector( Geometry geo, - float* detector, - const int currProjSetNumber, - const int totalNoOfProjections, - cudaTextureObject_t tex){ - - unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; - unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long projNumber=threadIdx.z; - - if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) - return; - -#if IS_FOR_MATLAB_TIGRE - size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#else - size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#endif - - unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array - - if(indAlpha>=totalNoOfProjections) - return; - - Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaU = projParamsArrayDev[4*projNumber+1]; - Point3D deltaV = projParamsArrayDev[4*projNumber+2]; - Point3D source = projParamsArrayDev[4*projNumber+3]; - - float DSO = projFloatsArrayDev[2*projNumber+0]; - float cropdist_init = projFloatsArrayDev[2*projNumber+1]; - - - - /////// Get coordinates XYZ of pixel UV - unsigned long pixelV = geo.nDetecV-v-1; - unsigned long pixelU = u; - - - float vectX,vectY,vectZ; - Point3D P; - 
P.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); - P.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); - P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); - - // Length is the ray length in normalized space - float length=__fsqrt_rd((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); - //now legth is an integer of Nsamples that are required on this line - length=ceilf(__fdividef(length,geo.accuracy));//Divide the directional vector by an integer - vectX=__fdividef(P.x -source.x,length); - vectY=__fdividef(P.y -source.y,length); - vectZ=__fdividef(P.z -source.z,length); - - -// //Integrate over the line - float tx,ty,tz; - float sum=0; - float i; - - - -// Because I have no idea how to efficiently cutoff the legth path in 3D, a very upper limit is computed (see maxdistanceCuboid) -// for the 3D case. However it would be bad to lose performance in the 3D case -// TODO: can ge really improve this? - if (sphericalrotation){ - if ((2*DSO/fminf(fminf(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)+cropdist_init)/geo.accuracy < length) - length=ceilf((2*DSO/fminf(fminf(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)+cropdist_init)/geo.accuracy); - } - else{ - if ((2*DSO/fminf(geo.dVoxelX,geo.dVoxelY)+cropdist_init)/geo.accuracy < length) - length=ceilf((2*DSO/fminf(geo.dVoxelX,geo.dVoxelY)+cropdist_init)/geo.accuracy); - } - - - //Length is not actually a length, but the amount of memreads with given accuracy ("samples per voxel") - for (i=floorf(cropdist_init/geo.accuracy); i<=length; i=i+1){ - tx=vectX*i+source.x; - ty=vectY*i+source.y; - tz=vectZ*i+source.z; - - sum += tex3D(tex, tx+0.5f, ty+0.5f, tz+0.5f); // this line is 94% of time. - } - - float deltalength=sqrtf((vectX*geo.dVoxelX)*(vectX*geo.dVoxelX)+ - (vectY*geo.dVoxelY)*(vectY*geo.dVoxelY)+ - (vectZ*geo.dVoxelZ)*(vectZ*geo.dVoxelZ) ); - - detector[idx]=sum*deltalength; -} - - - -// legnth(angles)=3 x nagnles, as we have roll, pitch, yaw. 
-int interpolation_projection(float * img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ - - - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("Ax:Interpolated_projection:GPUselect","There are no available device(s) that support CUDA\n"); - } - // - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("Ax:Interpolated_projection:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); - } - int dev; - - // Check free memory - size_t mem_GPU_global; - checkFreeMemory(gpuids,&mem_GPU_global); - - // printf("geo.nDetec (U, V) = %d, %d\n", geo.nDetecU, geo.nDetecV); - - size_t mem_image=(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); - size_t mem_proj =(unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV * sizeof(float); - - // Does everything fit in the GPUs? - const bool fits_in_memory = mem_image+2*PROJ_PER_BLOCK*mem_proj= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to - // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
- -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & splits>1){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); -#endif - Point3D source, deltaU, deltaV, uvOrigin; - - Point3D* projParamsArrayHost = 0; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); - float* projFloatsArrayHost = 0; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); - cudaCheckErrors("Error allocating auxiliary constant memory"); - - // Create Streams for overlapping memcopy and compute - int nStream_device=2; - int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); - - } - } - cudaCheckErrors("Stream creation fail"); - int nangles_device=(nangles+deviceCount-1)/deviceCount; - int nangles_last_device=(nangles-(deviceCount-1)*nangles_device); - unsigned int noOfKernelCalls = (nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK - unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. 
- int projection_this_block; - - - - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; - for (unsigned int sp=0;sp=nangles) - break; - if ((i*PROJ_PER_BLOCK+j)>=nangles_device) - break; - geoArray[sp].alpha=angles[proj_global*3]; - geoArray[sp].theta=angles[proj_global*3+1]; - geoArray[sp].psi =angles[proj_global*3+2]; - - is_spherical+=abs(geoArray[sp].theta)+abs(geoArray[sp].psi); - - //precomute distances for faster execution - maxdist=maxdistanceCuboid(geoArray[sp],proj_global); - //Precompute per angle constant stuff for speed - computeDeltas(geoArray[sp], proj_global, &uvOrigin, &deltaU, &deltaV, &source); - //Ray tracing! - projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection - projParamsArrayHost[4*j+1]=deltaU; - projParamsArrayHost[4*j+2]=deltaV; - projParamsArrayHost[4*j+3]=source; - - projFloatsArrayHost[2*j]=geo.DSO[proj_global]; - projFloatsArrayHost[2*j+1]=floor(maxdist); - } - - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*nStream_device]); - cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*nStream_device]); - cudaStreamSynchronize(stream[dev*nStream_device]); - - - //TODO: we could do this around X and Y axis too, but we would need to compute the new axis of rotation (not possible to know from jsut the angles) - if (!is_spherical){ - kernelPixelDetector<<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); - } - else{ - kernelPixelDetector <<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); - } - } - - - // Now that the computation is happening, we need to either prepare the memory for - // combining of the projections (splits>1) and start removing previous results. 
- - - // If our image does not fit in memory then we need to make sure we accumulate previous results too. - // This is done in 2 steps: - // 1)copy previous results back into GPU - // 2)accumulate with current results - // The code to take them out is the same as when there are no splits needed - if( !fits_in_memory&&sp>0) - { - // 1) grab previous results and put them in the auxiliary variable dProjection_accum - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - - // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... - if(i+1==noOfKernelCalls) //is it the last block? - projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - else - projection_this_block=PROJ_PER_BLOCK; - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); - } - // 2) take the results from current compute call and add it to the code in execution. - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - - // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... - if(i+1==noOfKernelCalls) //is it the last block? 
- projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - else - projection_this_block=PROJ_PER_BLOCK; - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished - vecAddInPlaceInterp<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); - } - } // end accumulation case, where the image needs to be split - - // Now, lets get out the projections from the previous execution of the kernels. - if (i>0) - { - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on previous set on this GPU - proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; - if (dev+1==deviceCount) { //is it the last device? - // projections assigned to this device is >=nangles_device-(deviceCount-1) and < nangles_device - if (i-1 < noOfKernelCallsLastDev) { - // The previous set(block) was not empty. - projection_this_block=min(PROJ_PER_BLOCK, nangles-proj_global); - } - else { - // The previous set was empty. - // This happens if deviceCount > PROJ_PER_BLOCK+1. - // e.g. PROJ_PER_BLOCK = 9, deviceCount = 11, nangles = 199. - // e.g. PROJ_PER_BLOCK = 1, deviceCount = 3, nangles = 7. - break; - } - } - else { - projection_this_block=PROJ_PER_BLOCK; - } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); - } - } - // Make sure Computation on kernels has finished before we launch the next batch. - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); - } - } // End noOfKernelCalls (i) loop. 
- - // We still have the last set of projections to get out of GPUs - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - // How many projections are left here? - projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. - cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); - } - // Make sure everyone has done their bussiness before the next image split: - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - } // End image split loop. 
- - cudaCheckErrors("Main loop fail"); - /////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////// - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); - } - delete[] texImg; texImg = 0; - delete[] d_cuArrTex; d_cuArrTex = 0; - // Freeing Stage - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); - - } - free(dProjection); - - if(!fits_in_memory){ - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); - - } - free(dProjection_accum); - } - freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); - cudaFreeHost(projFloatsArrayHost); - - - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & splits>1){ - cudaHostUnregister(img); - } -#endif - cudaCheckErrors("cudaFree fail"); - -// cudaDeviceReset(); - return 0; -} -void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate) -{ - const unsigned int num_devices = gpuids.GetLength(); - //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - if(allocate){ - - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - - //cudaArray Descriptor - - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); - cudaCheckErrors("Texture memory allocation fail"); - } - - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaMemcpy3DParms copyParams = {0}; - cudaSetDevice(gpuids[dev]); - //Array 
creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[dev]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); - //cudaCheckErrors("Texture memory data copy fail"); - //Array creation End - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - if (geo.accuracy>1){ - texDescr.filterMode = cudaFilterModePoint; - geo.accuracy=1; - } - else{ - texDescr.filterMode = cudaFilterModeLinear; - } - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); - cudaCheckErrors("Texture object creation fail"); - } -} - -/* This code generates the geometries needed to split the image properly in - * cases where the entire image does not fit in the memory of the GPU - **/ -void splitImageInterp(unsigned int splits,Geometry geo,Geometry* geoArray, unsigned int nangles){ - - unsigned long splitsize=(geo.nVoxelZ+splits-1)/splits;// ceil if not divisible - for(unsigned int sp=0;spx=Pfinalu0.x-Pfinal.x; - deltaU->y=Pfinalu0.y-Pfinal.y; - deltaU->z=Pfinalu0.z-Pfinal.z; - - deltaV->x=Pfinalv0.x-Pfinal.x; - deltaV->y=Pfinalv0.y-Pfinal.y; - deltaV->z=Pfinalv0.z-Pfinal.z; - - *source=S; -} - -float maxdistanceCuboid(Geometry geo,unsigned int i){ - /////////// - // Compute initial "t" so we access safely as less as out of bounds as possible. 
- ////////// - - - float maxCubX,maxCubY,maxCubZ; - // Forgetting Z, compute mas distance: diagonal+offset - maxCubX=(geo.nVoxelX/2+ abs(geo.offOrigX[i])/geo.dVoxelX); - maxCubY=(geo.nVoxelY/2+ abs(geo.offOrigY[i])/geo.dVoxelY); - maxCubZ=(geo.nVoxelZ/2+ abs(geo.offOrigZ[i])/geo.dVoxelZ); - - float a,b; - a=geo.DSO[i]/geo.dVoxelX; - b=geo.DSO[i]/geo.dVoxelY; - -// As the return of this value is in "voxel space", the source may have an elliptical curve. -// The distance returned is the safe distance that can be skipped for a given angle alpha, before we need to start sampling. - - if (geo.theta==0.0f & geo.psi==0.0f) // Special case, it will make the code faster - return max(a*b/sqrt(a*a*sin(geo.alpha)*sin(geo.alpha)+b*b*cos(geo.alpha)*cos(geo.alpha))- - sqrt(maxCubX*maxCubX+maxCubY*maxCubY),0.0f); - //TODO: think of more special cases? - return max(geo.DSO[i]/max(max(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)-sqrt(maxCubX*maxCubX+maxCubY*maxCubY+maxCubZ*maxCubZ),0.0f); - -} -void rollPitchYaw(Geometry geo,unsigned int i, Point3D* point){ - Point3D auxPoint; - auxPoint.x=point->x; - auxPoint.y=point->y; - auxPoint.z=point->z; - - point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->z=-sin(geo.dPitch[i])*auxPoint.x - +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y - +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; - -} -void eulerZYZ(Geometry geo, Point3D* point){ - Point3D auxPoint; - auxPoint.x=point->x; - auxPoint.y=point->y; - auxPoint.z=point->z; - - 
point->x=(+cos(geo.alpha)*cos(geo.theta)*cos(geo.psi)-sin(geo.alpha)*sin(geo.psi))*auxPoint.x+ - (-cos(geo.alpha)*cos(geo.theta)*sin(geo.psi)-sin(geo.alpha)*cos(geo.psi))*auxPoint.y+ - cos(geo.alpha)*sin(geo.theta)*auxPoint.z; - - point->y=(+sin(geo.alpha)*cos(geo.theta)*cos(geo.psi)+cos(geo.alpha)*sin(geo.psi))*auxPoint.x+ - (-sin(geo.alpha)*cos(geo.theta)*sin(geo.psi)+cos(geo.alpha)*cos(geo.psi))*auxPoint.y+ - sin(geo.alpha)*sin(geo.theta)*auxPoint.z; - - point->z=-sin(geo.theta)*cos(geo.psi)*auxPoint.x+ - sin(geo.theta)*sin(geo.psi)*auxPoint.y+ - cos(geo.theta)*auxPoint.z; - - -} -//______________________________________________________________________________ -// -// Function: freeGeoArray -// -// Description: Frees the memory from the geometry array for multiGPU. -//______________________________________________________________________________ -void freeGeoArray(unsigned int splits,Geometry* geoArray){ - for(unsigned int sp=0;sp -#include -#include -#include "ray_interpolated_projection_parallel.hpp" -#include "TIGRE_common.hpp" -#include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",cudaGetErrorString(__err));\ - } \ -} while (0) - - - -#define MAXTREADS 1024 -#define PROJ_PER_BLOCK 8 -#define PIXEL_SIZE_BLOCK 8 -/*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * --->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); -__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device -__constant__ float 
projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device - - - -__global__ void kernelPixelDetector_parallel_interpolated( Geometry geo, - float* detector, - const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) -{ -// Point3D source , -// Point3D deltaU, -// Point3D deltaV, -// Point3D uvOrigin, -// float DSO, -// float maxdist){ - - unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; - unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long projNumber=threadIdx.z; - - if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) - return; - - int indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array - - -#if IS_FOR_MATLAB_TIGRE - size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#else - size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#endif - - if(indAlpha>=totalNoOfProjections) - return; - - Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaU = projParamsArrayDev[4*projNumber+1]; - Point3D deltaV = projParamsArrayDev[4*projNumber+2]; - Point3D source = projParamsArrayDev[4*projNumber+3]; - - float DSO = projFloatsArrayDev[2*projNumber+0]; - float maxdist = projFloatsArrayDev[2*projNumber+1]; - - - /////// Get coordinates XYZ of pixel UV - unsigned long pixelV = geo.nDetecV-v-1; - unsigned long pixelU = u; - - - float vectX,vectY,vectZ; - Point3D P; - P.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); - P.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); - P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); - Point3D S; - S.x=(source.x+pixelU*deltaU.x+pixelV*deltaV.x); - S.y=(source.y+pixelU*deltaU.y+pixelV*deltaV.y); - 
S.z=(source.z+pixelU*deltaU.z+pixelV*deltaV.z); - - // Length is the ray length in normalized space - double length=sqrtf((S.x-P.x)*(S.x-P.x)+(S.y-P.y)*(S.y-P.y)+(S.z-P.z)*(S.z-P.z)); - //now legth is an integer of Nsamples that are required on this line - length=ceilf(length/geo.accuracy);//Divide the directional vector by an integer - vectX=(P.x -S.x)/(length); - vectY=(P.y -S.y)/(length); - vectZ=(P.z -S.z)/(length); - - -// //Integrate over the line - float tx,ty,tz; - float sum=0; - float i; - - - // limit the amount of mem access after the cube, but before the detector. - if ((2*DSO/geo.dVoxelX+maxdist)/geo.accuracy < length) - length=ceilf((2*DSO/geo.dVoxelX+maxdist)/geo.accuracy); - //Length is not actually a length, but the amount of memreads with given accuracy ("samples per voxel") - - for (i=floorf(maxdist/geo.accuracy); i<=length; i=i+1){ - tx=vectX*i+S.x; - ty=vectY*i+S.y; - tz=vectZ*i+S.z; - - sum += tex3D(tex, tx+0.5f, ty+0.5f, tz+0.5f); // this line is 94% of time. - - } - float deltalength=sqrtf((vectX*geo.dVoxelX)*(vectX*geo.dVoxelX)+ - (vectY*geo.dVoxelY)*(vectY*geo.dVoxelY)+ - (vectZ*geo.dVoxelZ)*(vectZ*geo.dVoxelZ) ); - detector[idx]=sum*deltalength; -} - - - -int interpolation_projection_parallel(float * img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ - - - - size_t num_bytes = geo.nDetecU*geo.nDetecV *PROJ_PER_BLOCK* sizeof(float); - float** dProjection=(float **)malloc(2*sizeof(float *)); - for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); - } - // allocate streams for memory and compute - int nStreams=2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; - - for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); - } - - - // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg 
=(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); - - CreateTextureParallelInterp(img,geo,&d_cuArrTex[0], &texImg[0],stream); - cudaCheckErrors("Texture allocation fail"); - //Done! Image put into texture memory. - - - - Point3D source, deltaU, deltaV, uvOrigin; - - Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); - float* projFloatsArrayHost; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); - - // 16x16 gave the best performance empirically - // Funnily that makes it compatible with most GPUs..... - int divU,divV,divangle; - divU=PIXEL_SIZE_BLOCK; - divV=PIXEL_SIZE_BLOCK; - - dim3 numBlocks((geo.nDetecU+divU-1)/divU,(geo.nDetecV+divV-1)/divV,1); - dim3 threadsPerBlock(divU,divV,PROJ_PER_BLOCK); - unsigned int proj_global; - unsigned int noOfKernelCalls = (nangles+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK - unsigned int i; - - float maxdist; - for ( i=0; i=nangles) - break; - - geo.alpha=angles[proj_global*3]; - geo.theta=angles[proj_global*3+1]; - geo.psi =angles[proj_global*3+2]; - //precomute distances for faster execution - maxdist=maxdistanceCuboid(geo,proj_global); - //Precompute per angle constant stuff for speed - computeDeltas_parallel(geo,geo.alpha,proj_global, &uvOrigin, &deltaU, &deltaV, &source); - //Ray tracing! 
- projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection - projParamsArrayHost[4*j+1]=deltaU; - projParamsArrayHost[4*j+2]=deltaV; - projParamsArrayHost[4*j+3]=source; - - projFloatsArrayHost[2*j]=geo.DSO[proj_global]; - projFloatsArrayHost[2*j+1]=floor(maxdist); - - } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); - - kernelPixelDetector_parallel_interpolated<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); - // copy result to host - if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); - } - cudaDeviceSynchronize(); - - int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); - - - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); - free(texImg); texImg = 0; - free(d_cuArrTex); d_cuArrTex = 0; - cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); - free(dProjection); - cudaFreeHost(projParamsArrayHost); - cudaFreeHost(projFloatsArrayHost); - - cudaCheckErrors("cudaFree d_imagedata fail"); - - - for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); - } -// cudaDeviceReset(); - - return 0; -} - - - - -/* This code precomputes The location of the source and the Delta U and delta V (in the warped space) - * to compute the locations of the x-rays. While it seems verbose and overly-optimized, - * it does saves about 30% of each of the kernel calls. Thats something! 
- **/ -void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source){ - Point3D S; - S.x=geo.DSO[i]; - S.y=geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); - S.z=geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - - //End point - Point3D P,Pu0,Pv0; - - P.x =-(geo.DSD[i]-geo.DSO[i]); P.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); P.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - Pu0.x=-(geo.DSD[i]-geo.DSO[i]); Pu0.y= geo.dDetecU*(1-((float)geo.nDetecU/2)+0.5); Pu0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - Pv0.x=-(geo.DSD[i]-geo.DSO[i]); Pv0.y= geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); Pv0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-1); - // Geometric trasnformations: - P.x=0;Pu0.x=0;Pv0.x=0; - - // Roll pitch yaw - rollPitchYaw(geo,i,&P); - rollPitchYaw(geo,i,&Pu0); - rollPitchYaw(geo,i,&Pv0); - //Now lets translate the points where they should be: - P.x=P.x-(geo.DSD[i]-geo.DSO[i]); - Pu0.x=Pu0.x-(geo.DSD[i]-geo.DSO[i]); - Pv0.x=Pv0.x-(geo.DSD[i]-geo.DSO[i]); - - S.x=0; - // Roll pitch yaw - rollPitchYaw(geo,i,&S); - //Now lets translate the points where they should be: - S.x=S.x+geo.DSO[i]; - - - //1: Offset detector - - //P.x - P.y =P.y +geo.offDetecU[i]; P.z =P.z +geo.offDetecV[i]; - Pu0.y=Pu0.y+geo.offDetecU[i]; Pu0.z=Pu0.z+geo.offDetecV[i]; - Pv0.y=Pv0.y+geo.offDetecU[i]; Pv0.z=Pv0.z+geo.offDetecV[i]; - //S doesnt need to chagne - - - //3: Rotate (around z)! 
- Point3D Pfinal, Pfinalu0, Pfinalv0; - Pfinal.x =P.x; - Pfinal.y =P.y +geo.offDetecU[i]; Pfinal.z =P.z +geo.offDetecV[i]; - Pfinalu0.x=Pu0.x; - Pfinalu0.y=Pu0.y +geo.offDetecU[i]; Pfinalu0.z =Pu0.z +geo.offDetecV[i]; - Pfinalv0.x=Pv0.x; - Pfinalv0.y=Pv0.y +geo.offDetecU[i]; Pfinalv0.z =Pv0.z +geo.offDetecV[i]; - - eulerZYZ(geo,&Pfinal); - eulerZYZ(geo,&Pfinalu0); - eulerZYZ(geo,&Pfinalv0); - eulerZYZ(geo,&S); - - - - //2: Offset image (instead of offseting image, -offset everything else) - - Pfinal.x =Pfinal.x-geo.offOrigX[i]; Pfinal.y =Pfinal.y-geo.offOrigY[i]; Pfinal.z =Pfinal.z-geo.offOrigZ[i]; - Pfinalu0.x=Pfinalu0.x-geo.offOrigX[i]; Pfinalu0.y=Pfinalu0.y-geo.offOrigY[i]; Pfinalu0.z=Pfinalu0.z-geo.offOrigZ[i]; - Pfinalv0.x=Pfinalv0.x-geo.offOrigX[i]; Pfinalv0.y=Pfinalv0.y-geo.offOrigY[i]; Pfinalv0.z=Pfinalv0.z-geo.offOrigZ[i]; - S.x=S.x-geo.offOrigX[i]; S.y=S.y-geo.offOrigY[i]; S.z=S.z-geo.offOrigZ[i]; - - // As we want the (0,0,0) to be in a corner of the image, we need to translate everything (after rotation); - Pfinal.x =Pfinal.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinal.y =Pfinal.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinal.z =Pfinal.z +geo.sVoxelZ/2-geo.dVoxelZ/2; - Pfinalu0.x=Pfinalu0.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinalu0.y=Pfinalu0.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinalu0.z=Pfinalu0.z+geo.sVoxelZ/2-geo.dVoxelZ/2; - Pfinalv0.x=Pfinalv0.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinalv0.y=Pfinalv0.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinalv0.z=Pfinalv0.z+geo.sVoxelZ/2-geo.dVoxelZ/2; - S.x =S.x+geo.sVoxelX/2-geo.dVoxelX/2; S.y =S.y+geo.sVoxelY/2-geo.dVoxelY/2; S.z =S.z +geo.sVoxelZ/2-geo.dVoxelZ/2; - - //4. 
Scale everything so dVoxel==1 - Pfinal.x =Pfinal.x/geo.dVoxelX; Pfinal.y =Pfinal.y/geo.dVoxelY; Pfinal.z =Pfinal.z/geo.dVoxelZ; - Pfinalu0.x=Pfinalu0.x/geo.dVoxelX; Pfinalu0.y=Pfinalu0.y/geo.dVoxelY; Pfinalu0.z=Pfinalu0.z/geo.dVoxelZ; - Pfinalv0.x=Pfinalv0.x/geo.dVoxelX; Pfinalv0.y=Pfinalv0.y/geo.dVoxelY; Pfinalv0.z=Pfinalv0.z/geo.dVoxelZ; - S.x =S.x/geo.dVoxelX; S.y =S.y/geo.dVoxelY; S.z =S.z/geo.dVoxelZ; - - - - //5. apply COR. Wherever everything was, now its offesetd by a bit - float CORx, CORy; - CORx=-geo.COR[i]*sin(geo.alpha)/geo.dVoxelX; - CORy= geo.COR[i]*cos(geo.alpha)/geo.dVoxelY; - Pfinal.x+=CORx; Pfinal.y+=CORy; - Pfinalu0.x+=CORx; Pfinalu0.y+=CORy; - Pfinalv0.x+=CORx; Pfinalv0.y+=CORy; - S.x+=CORx; S.y+=CORy; - - // return - - *uvorigin=Pfinal; - - deltaU->x=Pfinalu0.x-Pfinal.x; - deltaU->y=Pfinalu0.y-Pfinal.y; - deltaU->z=Pfinalu0.z-Pfinal.z; - - deltaV->x=Pfinalv0.x-Pfinal.x; - deltaV->y=Pfinalv0.y-Pfinal.y; - deltaV->z=Pfinalv0.z-Pfinal.z; - - *source=S; -} -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - - - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); - - - cudaMemcpy3DParms copyParams = {0}; - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[0]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); - - - //Array creation End - - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 
0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); - -} \ No newline at end of file diff --git a/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip b/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip deleted file mode 100644 index 1280b6ed..00000000 --- a/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip +++ /dev/null @@ -1,65 +0,0 @@ -/*------------------------------------------------------------------------- - * - * Header CUDA functions for texture-memory interpolation based projection - * - * - * CODE by Ander Biguri - * Sepideh Hatamikia (arbitrary rotation) ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - -#include "ray_interpolated_projection.hpp" - -#include "types_TIGRE.hpp" -#include "GpuIds.hpp" - -#ifndef PROJECTION_PARALLEL_HPP -#define PROJECTION_PARALLEL_HPP - -int interpolation_projection_parallel(float* img, Geometry geo, float** result,float const * const alphas,int nalpha, const GpuIds& gpuids); -// float computeMaxLength(Geometry geo, float alpha); -void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source); - -// float maxDistanceCubeXY(Geometry geo, float alpha,int i); - -// below, not used -Geometry nomralizeGeometryImage(Geometry geo); -#endif \ No newline at end of file diff --git a/Common/CUDA/tv_proximal.cu.prehip b/Common/CUDA/tv_proximal.cu.prehip deleted file mode 100644 index 32ae99c2..00000000 --- a/Common/CUDA/tv_proximal.cu.prehip +++ /dev/null @@ -1,693 +0,0 @@ -/*------------------------------------------------------------------------- - * - * MATLAB MEX functions 
for TV image denoising. Check inputs and parses - * MATLAB data to C++ data. - * - * - * CODE by Imanol Luengo - * PhD student University of Nottingham - * imaluengo@gmail.com - * 2015 - * Modified by Ander Biguri for multi-GPU - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - - - -// http://gpu4vision.icg.tugraz.at/papers/2010/knoll.pdf#pub47 -#define MAXTREADS 1024 -#define MAX_BUFFER 60 -#define BLOCK_SIZE 10 // BLOCK_SIZE^3 must be smaller than MAXTREADS - -#include "tv_proximal.hpp" -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - cudaDeviceReset();\ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising",cudaGetErrorString(__err));\ - } \ -} while (0) -void cpy_from_host(float* device_array,float* host_array, - unsigned long long bytes_device,unsigned long long offset_device,unsigned long long offset_host, - unsigned long long pixels_per_slice, unsigned int buffer_length, - cudaStream_t stream, bool is_first_chunk, bool is_last_chunk,const long* image_size); - - - __global__ void multiplyArrayScalar(float* vec,float scalar,const size_t n) - { - unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x; - for(; i= 0 ) { - _div += (pz[idx] - pz[(z-1)*size2d + y*cols + x]) / dz; - } else { - _div += pz[idx]; - } - - if ( y - 1 >= 0 ) { - _div += (py[idx] - py[z*size2d + (y-1)*cols + x]) / dy; - } else { - _div += py[idx]; - } - - if ( x - 1 >= 0 ) { - _div += (px[idx] - 
px[z*size2d + y*cols + (x-1)]) / dx; - } else { - _div += px[idx]; - } - - return _div; - } - - __device__ __inline__ - void gradient(const float* u, float* grad, - long z, long y, long x, - long depth, long rows, long cols, - float dz, float dy, float dx) - { - long size2d = rows*cols; - long idx = z * size2d + y * cols + x; - - float uidx = u[idx]; - - if ( z + 1 < depth ) { - grad[0] = (u[(z+1)*size2d + y*cols + x] - uidx) / dz; - } - - if ( y + 1 < rows ) { - grad[1] = (u[z*size2d + (y+1)*cols + x] - uidx) / dy; - } - - if ( x + 1 < cols ) { - grad[2] = (u[z*size2d + y*cols + (x+1)] - uidx) / dx; - } - } - - - __global__ - void update_u(const float* f, const float* pz, const float* py, const float* px, float* u, - float tau, float lambda, - long depth, long rows, long cols, - float dz, float dy, float dx) - { - long x = threadIdx.x + blockIdx.x * blockDim.x; - long y = threadIdx.y + blockIdx.y * blockDim.y; - long z = threadIdx.z + blockIdx.z * blockDim.z; - long idx = z * rows * cols + y * cols + x; - - if ( x >= cols || y >= rows || z >= depth ) - return; - - float _div = divergence(pz, py, px, z, y, x, depth, rows, cols, dz, dy, dx); - - u[idx] = u[idx] * (1.0f - tau) + tau * (f[idx] + (1.0f/lambda) * _div); - } - - - __global__ - void update_p(const float* u, float* pz, float* py, float* px, - float tau, long depth, long rows, long cols, - float dz, float dy, float dx) - { - long x = threadIdx.x + blockIdx.x * blockDim.x; - long y = threadIdx.y + blockIdx.y * blockDim.y; - long z = threadIdx.z + blockIdx.z * blockDim.z; - long idx = z * rows * cols + y * cols + x; - - if ( x >= cols || y >= rows || z >= depth ) - return; - - float grad[3] = {0,0,0}, q[3]; - gradient(u, grad, z, y, x, depth, rows, cols, dz, dy, dx); - - q[0] = pz[idx] + tau * grad[0]; - q[1] = py[idx] + tau * grad[1]; - q[2] = px[idx] + tau * grad[2]; - - float norm = fmaxf(1.0f, sqrtf(q[0] * q[0] + q[1] * q[1] + q[2] * q[2])); - - pz[idx] = q[0] / norm; - py[idx] = q[1] / norm; - px[idx] = 
q[2] / norm; - } - - -// Main function - void tvdenoising(float* src, float* dst, float lambda, - const float* spacing, const long* image_size, int maxIter, const GpuIds& gpuids) { - - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("tvDenoise:tvdenoising:GPUselect","There are no available device(s) that support CUDA\n"); - } - // - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("tvDenoise:tvdenoising:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); - } - int dev; - - // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. - - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - - - // %5 of free memory should be enough, we have almost no variables in these kernels - size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; - const size_t pixels_per_slice = image_size[0] * image_size[1] ; - const size_t mem_slice_image = sizeof(float)* pixels_per_slice ; - const size_t mem_size_image = sizeof(float)* total_pixels; - - // Decide how are we handling the distribution of computation - size_t mem_img_each_GPU; - - unsigned int buffer_length=1; - //Does everything fit in the GPU? - unsigned int slices_per_split; - unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. 
- if(mem_GPU_global> 5*mem_size_image+5*mem_slice_image*buffer_length*2){ - // We only need to split if we have extra GPUs - slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; - mem_img_each_GPU=mem_slice_image*( (image_size[2]+deviceCount-1)/deviceCount + buffer_length*2); - }else{ - // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all - size_t mem_free=mem_GPU_global; - - splits=(unsigned int)(ceil(((float)(5*mem_size_image)/(float)(deviceCount))/mem_free)); - // Now, there is an overhead here, as each splits should have 2 slices more, to accoutn for overlap of images. - // lets make sure these 2 slices fit, if they do not, add 1 to splits. - slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - - // if the new stuff does not fit in the GPU, it measn we are in the edge case where adding that extra slice will overflow memory - if (mem_GPU_global< 5*mem_img_each_GPU){ - // one more split should do the job, as its an edge case. - splits++; - //recompute for later - slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - } - - // How many EXTRA buffer slices should be able to fit in here??!?! - mem_free=mem_GPU_global-(5*mem_img_each_GPU); - unsigned int extra_buff=(mem_free/mem_slice_image); - buffer_length=(extra_buff/2)/5; // we need double whatever this results in, rounded down. - - buffer_length=min(MAX_BUFFER,buffer_length); - - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - - // Assert - if (mem_GPU_global< 5*mem_img_each_GPU){ - mexErrMsgIdAndTxt("tvDenoise:tvdenoising:GPU","Bad assert. Logic behind splitting flawed! 
Please tell: ander.biguri@gmail.com\n"); - } - } - - - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - if (isHostRegisterSupported & splits>1){ - cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); - - - - // Lets allocate auxiliary variables. - float* buffer_u, *buffer_px, *buffer_py, *buffer_pz; - float* h_px, *h_py, *h_pz, *h_u; - if(splits>1){ - - //These take A LOT of memory and A LOT of time to use. If we can avoid using them, better. - if (buffer_length1 & i>0){ - - for (dev = 0; dev < deviceCount; dev++){ - is_last_chunk=!((sp*deviceCount+dev)>>(d_pz[dev], -1, pixels_per_slice*buffer_length); - } - if (is_last_chunk){ - multiplyArrayScalar<<<60,MAXTREADS,0,stream[dev*nStream_device+4]>>>(d_pz[dev]+bytes_device[dev],-1, pixels_per_slice*buffer_length); - } - } - for (dev = 0; dev < deviceCount; dev++){ - is_last_chunk=!((sp*deviceCount+dev)>>(d_src[dev], d_pz[dev], d_py[dev], d_px[dev], d_u[dev], tau1, lambda, - (long)(curr_slices+buffer_length*2), image_size[1],image_size[0], - spacing[2], spacing[1], spacing[0]); - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_u[dev], d_pz[dev], d_py[dev], d_px[dev], tau2, - (long)(curr_slices+buffer_length*2), image_size[1], image_size[0], - spacing[2], spacing[1], spacing[0]); - } - }// END internal iter - - // Synchronize mathematics, make sure bounding pixels are correct - for(dev=0; dev0){ - // U - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer_u, d_u[dev-1] 
+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); - - - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+1]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+2]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+3]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+4]); - cudaMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+4]); - - - } - } - // This is the case when we can't solely use GPU memory, as the total size of the images+variables exceeds total amounf of memory among GPUs. - // This situation requires partial results and full memory allocation in the host. - }else{ - // Vopy all the U variable into the host. - for(dev=0; dev1 && buffer_length1){ - cudaHostUnregister(src); - cudaHostUnregister(dst); - } - for(dev=0; dev Origin is at (0,0,0). 
Image center is there +offOrig - // -> at angle 0, source + image centre (without the offset) + detector centre (without offset) - // are aligned in the Y_Z plane. - // -> detector is orthonormal to projection plane. - - //Parameters part of the image geometry - int nVoxelX, nVoxelY, nVoxelZ; - float sVoxelX, sVoxelY, sVoxelZ; - float dVoxelX, dVoxelY, dVoxelZ; - float *offOrigX,*offOrigY,*offOrigZ; - float* DSO; - // Parameters of the Detector. - int nDetecU, nDetecV; - float sDetecU, sDetecV; - float dDetecU, dDetecV; - float *offDetecU, *offDetecV; - float* DSD; - float* dRoll; - float* dPitch; - float* dYaw; - // The base unit we are working with in mm. - float unitX; - float unitY; - float unitZ; - - //rotation angle for e uler (ZYZ) - float alpha; - float theta; - float psi; - // Centre of Rotation correction. - float* COR; - //Maximum length of cube - float maxLength; - //User option - float accuracy; -}; - - struct Point3D{ - float x; - float y; - float z; -}; - -struct Point3Ddouble{ - double x; - double y; - double z; - - // cast to float member function for "copying" Point3Ddouble to Point3D - Point3D to_float() - { - Point3D castToFloat; - castToFloat.x = (float)x; - castToFloat.y = (float)y; - castToFloat.z = (float)z; - return(castToFloat); - } -}; - -#endif \ No newline at end of file diff --git a/Common/CUDA/voxel_backprojection.cu.prehip b/Common/CUDA/voxel_backprojection.cu.prehip deleted file mode 100644 index bec4d909..00000000 --- a/Common/CUDA/voxel_backprojection.cu.prehip +++ /dev/null @@ -1,920 +0,0 @@ -/*------------------------------------------------------------------------- - * - * CUDA function for backrpojection using FDK weigts for CBCT - * - * - * CODE by Ander Biguri - * Optimized and modified by RB - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization 
for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - -#define PI_2 1.57079632679489661923 -#include -#include -#include -#include "voxel_backprojection.hpp" -#include "TIGRE_common.hpp" -#include -#include "GpuIds.hpp" - -// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ - } \ -} while (0) - - -#define MAXTREADS 1024 - /*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * *--->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ - - void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, int nStreamDevice,bool allocate); - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// The optimal values of two constants obtained by RB on NVIDIA Quadro K2200 (4 GB RAM, 640 CUDA cores) for 512^3 volume and 512^3 projections (512 proj, each 512 x 512) were: -// PROJ_PER_KERNEL = 32 or 16 (very similar times) -// VOXELS_PER_THREAD = 8 -// Speedup of 
the entire FDK backprojection (not only kernel run, also memcpy etc.) was nearly 4x relative to the original (single projection, single voxel per thread) code. -// (e.g. 16.2 s vs. ~62 s). - -const int PROJ_PER_KERNEL = 32; // Number of 2D projections to be analyzed by a single thread. This can be tweaked to see what works best. 32 was the optimal value in the paper by Zinsser and Keck. -const int VOXELS_PER_THREAD = 8; // Number of voxels to be computed by s single thread. Can be tweaked to see what works best. 4 was the optimal value in the paper by Zinsser and Keck. - -// We have PROJ_PER_KERNEL projections and we need 6 parameters for each projection: -// deltaX, deltaY, deltaZ, xyzOrigin, offOrig, offDetec -// So we need to keep PROJ_PER_KERNEL*6 values in our deltas array FOR EACH CALL to our main kernel -// (they will be updated in the main loop before each kernel call). - -__constant__ Point3D projParamsArrayDev[6*PROJ_PER_KERNEL]; // Dev means it is on device - -// We also need a corresponding array on the host side to be filled before each kernel call, then copied to the device (array in constant memory above) -// Point3D projParamsArrayHost[6*PROJ_PER_KERNEL]; // Host means it is host memory - -// Now we also need to store sinAlpha and cosAlpha for each projection (two floats per projection) -__constant__ float projSinCosArrayDev[5*PROJ_PER_KERNEL]; - - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// END RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - -//______________________________________________________________________________ -// -// Function: kernelPixelBackprojectionFDK -// -// Description: Main FDK backprojection kernel 
-//______________________________________________________________________________ - -__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) -{ - - // Old kernel call signature: - // kernelPixelBackprojectionFDK<<>>(geo,dimage,i,deltaX,deltaY,deltaZ,xyzOrigin,offOrig,offDetec,sinalpha,cosalpha); - // We just read in most of the params from the constant memory instead of getting them from the param list. - // This is because we now have MANY params, since single kernel processes more than one projection! - /* __global__ void kernelPixelBackprojectionFDK(const Geometry geo, - * float* image, - * const int indAlpha, - * const Point3D deltaX , - * const Point3D deltaY, - * const Point3D deltaZ, - * const Point3D xyzOrigin, - * const Point3D xyzOffset, - * const Point3D uv0Offset, - * const float sinalpha, - * const float cosalpha){ - */ - unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long indX = blockIdx.x * blockDim.x + threadIdx.x; - // unsigned long startIndZ = blockIdx.z * blockDim.z + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle - unsigned long long startIndZ = blockIdx.z * VOXELS_PER_THREAD + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle - //Make sure we don't go out of bounds - if (indX>=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) - return; - - // We'll keep a local auxiliary array of values of a column of voxels that this thread will update - float voxelColumn[VOXELS_PER_THREAD]; - - // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then - // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes - - unsigned long colIdx; -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // 
break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. - } // END copy 3D volume voxels to local array - - // Now iterate through projections -#pragma unroll - for(unsigned long projNumber=0; projNumber=totalNoOfProjections) - break; - - Point3D deltaX = projParamsArrayDev[6*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaY = projParamsArrayDev[6*projNumber+1]; - Point3D deltaZ = projParamsArrayDev[6*projNumber+2]; - Point3D xyzOrigin = projParamsArrayDev[6*projNumber+3]; - Point3D xyzOffset = projParamsArrayDev[6*projNumber+4]; - Point3D S = projParamsArrayDev[6*projNumber+5]; - - float sinalpha = projSinCosArrayDev[5*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection - float cosalpha = projSinCosArrayDev[5*projNumber+1]; - float COR = projSinCosArrayDev[5*projNumber+2]; - float DSD = projSinCosArrayDev[5*projNumber+3]; - float DSO = projSinCosArrayDev[5*projNumber+4]; - - float auxCOR=COR/geo.dDetecU; - // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. 
- Point3D P; - P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); - P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-auxCOR; - P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); - - // This is the vector defining the line from the source to the Voxel - float vectX,vectY,vectZ; - vectX=(P.x -S.x); - vectY=(P.y -S.y); - vectZ=(P.z -S.z); - - // Get the coordinates in the detector UV where the mid point of the voxel is projected. - float t=__fdividef(DSO-DSD-S.x,vectX); - float y,z; - y=vectY*t+S.y; - z=vectZ*t+S.z; - float u,v; - u=y+(float)geo.nDetecU*0.5f; - v=z+(float)geo.nDetecV*0.5f; - - float weight; - float realx,realy; - realx=-(geo.sVoxelX-geo.dVoxelX)*0.5f +indX*geo.dVoxelX +xyzOffset.x; - realy=-(geo.sVoxelY-geo.dVoxelY)*0.5f +indY*geo.dVoxelY +xyzOffset.y+COR; - - weight=__fdividef(DSO+realy*sinalpha-realx*cosalpha,DSO); - - weight=__frcp_rd(weight*weight); - - // Get Value in the computed (U,V) and multiply by the corresponding weight. - // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) - -#if IS_FOR_MATLAB_TIGRE - voxelColumn[colIdx]+=tex3D(tex, v, u ,indAlpha+0.5f)*weight; -#else - voxelColumn[colIdx]+=tex3D(tex, u, v ,indAlpha+0.5f)*weight; -#endif - } // END iterating through column of voxels - - } // END iterating through multiple projections - - // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. 
- // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. - // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is - // better for avoiding memory congestion. - } // END copy updated voxels from local array to our 3D volume - -} // END kernelPixelBackprojectionFDK - - - - -//______________________________________________________________________________ -// -// Function: voxel_backprojection -// -// Description: Main host function for FDK backprojection (invokes the kernel) -//______________________________________________________________________________ - -int voxel_backprojection(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) -{ - // printf("voxel_backprojection(geo.nDetector = %d, %d)\n", geo.nDetecU, geo.nDetecV); - // printf("geo.nVoxel = %d, %d, %d\n", geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","There are no available device(s) that support CUDA\n"); - } - - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. 
If the code errors you might need to change the way GPU selection is performed."); - } - - int dev; - // Split the CT problem - unsigned int split_image; - unsigned int split_projections; - splitCTbackprojection(gpuids,geo,nalpha,&split_image,&split_projections); - - - cudaCheckErrors("Error"); - //Pagelock memory for synchronous copy. - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to - // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); - } - if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); - } -#endif - cudaCheckErrors("Error pinning memory"); - - - // Create the arrays for the geometry. The main difference is that geo.offZ has been tuned for the - // image slices. The rest of the Geometry is the same - Geometry* geoArray=(Geometry*)malloc(split_image*deviceCount*sizeof(Geometry)); - createGeoArray(split_image*deviceCount,geo,geoArray,nalpha); - - // Now lest allocate all the image memory on the GPU, so we can use it later. If we have made our numbers correctly - // in the previous section this should leave enough space for the textures. 
- size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); - float** dimage=(float**)malloc(deviceCount*sizeof(float*)); - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); - } - - //If it is the first time, lets make sure our image is zeroed. - int nStreamDevice=2; - int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); - - } - } - - - - - // Kernel auxiliary variables - Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); - float* projSinCosArrayHost; - cudaMallocHost((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); - - - // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); - - // Auxiliary Host page-locked memory for fast and asycnornous memcpy. - - // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop - // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution - // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, - // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. 
- unsigned long long proj_linear_idx_start; - unsigned int proj_split_overlap_number; - unsigned int current_proj_split_size,current_proj_overlap_split_size; - size_t num_bytes_img_curr; - size_t img_linear_idx_start; - float** partial_projection; - size_t* proj_split_size; - - - - for(unsigned int img_slice=0;img_slice=proj_split_size[proj_block_split]) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. - if(currProjNumber_global>=nalpha) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. - - Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, /*offDetec,*/source; - float sinalpha,cosalpha; - - geoArray[img_slice*deviceCount+dev].alpha=-alphas[currProjNumber_global*3];//we got 3 angles now. - geoArray[img_slice*deviceCount+dev].theta=-alphas[currProjNumber_global*3+1]; - geoArray[img_slice*deviceCount+dev].psi =-alphas[currProjNumber_global*3+2]; - -// mexPrintf("%u %f \n",i,geoArray[img_slice*deviceCount+dev].alpha); -// mexPrintf("%u \n",currProjNumber_global); - - sinalpha=sin(geoArray[img_slice*deviceCount+dev].alpha); - cosalpha=cos(geoArray[img_slice*deviceCount+dev].alpha); - - projSinCosArrayHost[5*j]=sinalpha; // 2*j because we have 2 float (sin or cos angle) values per projection - projSinCosArrayHost[5*j+1]=cosalpha; - projSinCosArrayHost[5*j+2]=geo.COR[currProjNumber_global]; - projSinCosArrayHost[5*j+3]=geo.DSD[currProjNumber_global]; - projSinCosArrayHost[5*j+4]=geo.DSO[currProjNumber_global]; - - computeDeltasCube(geoArray[img_slice*deviceCount+dev],currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); - - offOrig.x=geo.offOrigX[currProjNumber_global]; - offOrig.y=geo.offOrigY[currProjNumber_global]; - offOrig.z=geoArray[img_slice*deviceCount+dev].offOrigZ[currProjNumber_global]; - - projParamsArrayHost[6*j]=deltaX; // 6*j because we have 6 Point3D values per 
projection - projParamsArrayHost[6*j+1]=deltaY; - projParamsArrayHost[6*j+2]=deltaZ; - projParamsArrayHost[6*j+3]=xyzOrigin; - projParamsArrayHost[6*j+4]=offOrig; - projParamsArrayHost[6*j+5]=source; - } // END for (preparing params for kernel call) - - // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArrayDev, projSinCosArrayHost, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); - - kernelPixelBackprojectionFDK<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); - } // END for - ////////////////////////////////////////////////////////////////////////////////////// - // END RB code, Main reconstruction loop: go through projections (rotation angles) and backproject - ////////////////////////////////////////////////////////////////////////////////////// - }// END for deviceCount - } // END sub-split of current projection chunk - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - - } // END projection splits - - - // Now we need to take the image out of the GPU - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - // We do not need to sycnronize because the array dealocators already do. 
- num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); - img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - cudaCheckErrors("Main loop fail"); - } - - } // end image splits - - ///////// Cleaning: - - - bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; - for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) - if (!two_buffers_used && i==1) - break; - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); - } - } - cudaCheckErrors("cudadestroy textures result fail"); - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); - } - cudaFreeHost(projSinCosArrayHost); - cudaFreeHost(projParamsArrayHost); - free(partial_projection); - free(proj_split_size); - - freeGeoArray(split_image*deviceCount,geoArray); -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostUnregister(result); - } - if (isHostRegisterSupported){ - cudaHostUnregister(projections); - } -#endif - - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); - - cudaCheckErrors("cudaFree fail"); - - //cudaDeviceReset(); // For the Nvidia Visual Profiler - return 0; - -} // END voxel_backprojection -// - -void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsigned int* split_image, unsigned int * split_projections){ - - - // We don't know if the 
devices are being used. lets check that. and only use the amount of memory we need. - - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - - const int deviceCount = gpuids.GetLength(); - - // Compute how much memory each of the relevant memory pieces need - size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); - size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); - - - - - // Does everything fit in the GPU? - - if(mem_image/deviceCount+mem_proj*PROJ_PER_KERNEL*2(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); - - } - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[dev]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); - } - - //Array creation End - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); - } -} - -//______________________________________________________________________________ -// -// Function: createGeoArray -// -// Description: This code 
generates the geometries needed to split the image properly in -// cases where the entire image does not fit in the memory of the GPU -//______________________________________________________________________________ - -void createGeoArray(unsigned int image_splits, Geometry geo,Geometry* geoArray, unsigned int nangles){ - - - unsigned int splitsize=(geo.nVoxelZ+image_splits-1)/image_splits; - - for(unsigned int sp=0;spx; - auxPoint.y=point->y; - auxPoint.z=point->z; - - // calculate sin and cos of 3 angles (used multiple times) - double sin_alpha, cos_alpha, sin_theta, cos_theta, sin_psi, cos_psi; - sin_alpha = sin((double)geo.alpha); - cos_alpha = cos((double)geo.alpha); - sin_theta = sin((double)geo.theta); - cos_theta = cos((double)geo.theta); - sin_psi = sin((double)geo.psi); - cos_psi = cos((double)geo.psi); - - point->x = auxPoint.x*(cos_psi*cos_theta*cos_alpha-sin_psi*sin_alpha) - +auxPoint.y*(-cos_psi*cos_theta*sin_alpha-sin_psi*cos_alpha) - +auxPoint.z*cos_psi*sin_theta; - point->y = auxPoint.x*(sin_psi*cos_theta*cos_alpha+cos_psi*sin_alpha) - +auxPoint.y*(-sin_psi*cos_theta*sin_alpha+cos_psi*cos_alpha) - +auxPoint.z*sin_psi*sin_theta; - point->z =-auxPoint.x*sin_theta*cos_alpha - +auxPoint.y*sin_theta*sin_alpha - +auxPoint.z*cos_theta; -} - -void rollPitchYawT(Geometry geo,int i, Point3Ddouble* point){ - - Point3Ddouble auxPoint; - auxPoint.x=point->x; - auxPoint.y=point->y; - auxPoint.z=point->z; - - // calculate sin and cos of 3 angles (used multiple times) - double sin_dRoll, cos_dRoll, sin_dPitch, cos_dPitch, sin_dYaw, cos_dYaw; - sin_dRoll = sin((double)geo.dRoll[i]); - cos_dRoll = cos((double)geo.dRoll[i]); - sin_dPitch = sin((double)geo.dPitch[i]); - cos_dPitch = cos((double)geo.dPitch[i]); - sin_dYaw = sin((double)geo.dYaw[i]); - cos_dYaw = cos((double)geo.dYaw[i]); - - point->x=cos_dRoll*cos_dPitch*auxPoint.x - +sin_dRoll*cos_dPitch*auxPoint.y - -sin_dPitch*auxPoint.z; - - point->y=(cos_dRoll*sin_dPitch*sin_dYaw - sin_dRoll*cos_dYaw)*auxPoint.x - 
+(sin_dRoll*sin_dPitch*sin_dYaw + cos_dRoll*cos_dYaw)*auxPoint.y - +cos_dPitch*sin_dYaw*auxPoint.z; - - point->z=(cos_dRoll*sin_dPitch*cos_dYaw + sin_dRoll*sin_dYaw)*auxPoint.x - +(sin_dRoll*sin_dPitch*cos_dYaw - cos_dRoll*sin_dYaw)*auxPoint.y - +cos_dPitch*cos_dYaw*auxPoint.z; -} - -//______________________________________________________________________________ -// -// Function: computeDeltasCube -// -// Description: Computes relative increments for each projection (volume rotation). -// Increments get passed to the backprojection kernel. -//______________________________________________________________________________ - -void computeDeltasCube(Geometry geo,int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D* S) -{ - - // initialize points with double precision - Point3Ddouble P, Px,Py,Pz; - - // Get coords of Img(0,0,0) - P.x=-(geo.sVoxelX/2-geo.dVoxelX/2)+geo.offOrigX[i]; - P.y=-(geo.sVoxelY/2-geo.dVoxelY/2)+geo.offOrigY[i]; - P.z=-(geo.sVoxelZ/2-geo.dVoxelZ/2)+geo.offOrigZ[i]; - - // Get coords from next voxel in each direction - Px.x=P.x+geo.dVoxelX; Py.x=P.x; Pz.x=P.x; - Px.y=P.y; Py.y=P.y+geo.dVoxelY; Pz.y=P.y; - Px.z=P.z; Py.z=P.z; Pz.z=P.z+geo.dVoxelZ; - - // Rotate image around X axis (this is equivalent of rotating the source and detector) RZ RY RZ - eulerZYZT(geo,&P); - eulerZYZT(geo,&Px); - eulerZYZT(geo,&Py); - eulerZYZT(geo,&Pz); - - //detector offset - P.z =P.z-geo.offDetecV[i]; P.y =P.y-geo.offDetecU[i]; - Px.z =Px.z-geo.offDetecV[i]; Px.y =Px.y-geo.offDetecU[i]; - Py.z =Py.z-geo.offDetecV[i]; Py.y =Py.y-geo.offDetecU[i]; - Pz.z =Pz.z-geo.offDetecV[i]; Pz.y =Pz.y-geo.offDetecU[i]; - - //Detector Roll pitch Yaw - // - // first, we need to offset everything so (0,0,0) is the center of the detector - // Only X is required for that - P.x=P.x+(geo.DSD[i]-geo.DSO[i]); - Px.x=Px.x+(geo.DSD[i]-geo.DSO[i]); - Py.x=Py.x+(geo.DSD[i]-geo.DSO[i]); - Pz.x=Pz.x+(geo.DSD[i]-geo.DSO[i]); - rollPitchYawT(geo,i,&P); - 
rollPitchYawT(geo,i,&Px); - rollPitchYawT(geo,i,&Py); - rollPitchYawT(geo,i,&Pz); - - P.x=P.x-(geo.DSD[i]-geo.DSO[i]); - Px.x=Px.x-(geo.DSD[i]-geo.DSO[i]); - Py.x=Py.x-(geo.DSD[i]-geo.DSO[i]); - Pz.x=Pz.x-(geo.DSD[i]-geo.DSO[i]); - //Done for P, now source - Point3Ddouble source; - source.x=geo.DSD[i]; //already offseted for rotation - source.y=-geo.offDetecU[i]; - source.z=-geo.offDetecV[i]; - rollPitchYawT(geo,i,&source); - - source.x=source.x-(geo.DSD[i]-geo.DSO[i]);// source.y=source.y-auxOff.y; source.z=source.z-auxOff.z; - -// mexPrintf("%f,%f,%f\n",source.x,source.y,source.z); - // Scale coords so detector pixels are 1x1 - - P.z =P.z /geo.dDetecV; P.y =P.y/geo.dDetecU; - Px.z=Px.z/geo.dDetecV; Px.y=Px.y/geo.dDetecU; - Py.z=Py.z/geo.dDetecV; Py.y=Py.y/geo.dDetecU; - Pz.z=Pz.z/geo.dDetecV; Pz.y=Pz.y/geo.dDetecU; - - source.z=source.z/geo.dDetecV; source.y=source.y/geo.dDetecU; - - // get deltas of the changes in voxels - deltaX->x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; - deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; - deltaZ->x=Pz.x-P.x; deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; - - // cast the results from the double precision calculations back to float - *xyzorigin=P.to_float(); - *S=source.to_float(); -} - -void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ - size_t memfree; - size_t memtotal; - const int deviceCount = gpuids.GetLength(); - - for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); - if(dev==0) *mem_GPU_global=memfree; - if(memfree -#include -#include -#include "voxel_backprojection2.hpp" -#include "TIGRE_common.hpp" -#include -#include "GpuIds.hpp" - -// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ - } \ 
-} while (0) - - -#define MAXTREADS 1024 - /*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * *--->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ - -// this definitionmust go here. -void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate); - -__global__ void matrixConstantMultiply(const Geometry geo,float* image,float constant){ - size_t idx = threadIdx.x + blockIdx.x * blockDim.x; - for(; idx=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) - return; - - // We'll keep a local auxiliary array of values of a column of voxels that this thread will update - float voxelColumn[VOXELS_PER_THREAD]; - - // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then - // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes - - unsigned long colIdx; -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. 
- } // END copy 3D volume voxels to local array - - // Now iterate through projections -#pragma unroll - for(unsigned long projNumber=0; projNumber=totalNoOfProjections) - break; - - Point3D deltaX = projParamsArray2Dev[7*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaY = projParamsArray2Dev[7*projNumber+1]; - Point3D deltaZ = projParamsArray2Dev[7*projNumber+2]; - Point3D xyzOrigin = projParamsArray2Dev[7*projNumber+3]; - Point3D xyzOffset = projParamsArray2Dev[7*projNumber+4]; - Point3D uv0Offset = projParamsArray2Dev[7*projNumber+5]; - Point3D S = projParamsArray2Dev[7*projNumber+6]; - - float sinalpha = projSinCosArray2Dev[5*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection - float cosalpha = projSinCosArray2Dev[5*projNumber+1]; - float COR = projSinCosArray2Dev[5*projNumber+2]; - float DSD = projSinCosArray2Dev[5*projNumber+3]; - float DSO = projSinCosArray2Dev[5*projNumber+4]; - // Precomputations for the weights: - //Real coords of Source - // We already have S.x (geo.DSO), and S.y and S.z are always zero. we just need to rotate - Point3D realS; - realS.x= DSO*cosalpha; - realS.y=-DSO*sinalpha; - realS.z=0; - - - Point3D realvoxel_init; - realvoxel_init.x=-geo.sVoxelX/2+geo.dVoxelX/2+xyzOffset.x; - realvoxel_init.y=-geo.sVoxelY/2+geo.dVoxelY/2+xyzOffset.y; - realvoxel_init.z=-geo.sVoxelZ/2+geo.dVoxelZ/2+xyzOffset.z; - // Real XYZ coordinates of Detector. - Point3D realD, realDaux; - // We know the index of the detector (u,v). Start from there. - realDaux.x=-(DSD-DSO); - - // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. 
- Point3D P; - P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); - P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-COR/geo.dDetecU; - P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); - - // This is the vector defining the line from the source to the Voxel - float vectX,vectY,vectZ; - vectX=(P.x -S.x); - vectY=(P.y -S.y); - vectZ=(P.z -S.z); - - // Get the coordinates in the detector UV where the mid point of the voxel is projected. - float t=__fdividef(DSO-DSD-S.x,vectX); - float y,z; - y=vectY*t+S.y; - z=vectZ*t+S.z; - float u,v; - u=y+(float)geo.nDetecU*0.5f; - v=z+(float)geo.nDetecV*0.5f; -#if IS_FOR_MATLAB_TIGRE - float sample=tex3D(tex, v, u ,indAlpha+0.5f); -#else - float sample=tex3D(tex, u, v ,indAlpha+0.5f); -#endif - float weight=0; - // - // - // - // IMPORTANT: The weights are almost 50% of the computational time. Is there a way of speeding this up?? - // - //Real coordinates of Voxel. Instead of reverting the transformation, its less math (faster) to compute it from the indexes. - Point3D realvoxel; - - realvoxel.x=realvoxel_init.x+indX*geo.dVoxelX; - realvoxel.y=realvoxel_init.y+indY*geo.dVoxelY; - realvoxel.z=realvoxel_init.z+indZ*geo.dVoxelZ; - - - - realDaux.y=(-geo.sDetecU+geo.dDetecU)*0.5f + u*geo.dDetecU +uv0Offset.x; - realD.z =(-geo.sDetecV+geo.dDetecV)*0.5f + v*geo.dDetecV +uv0Offset.y; - //rotate the detector - realD.x= realDaux.x*cosalpha + realDaux.y*sinalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) - realD.y=-realDaux.x*sinalpha + realDaux.y*cosalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) - float L,lsq; - - L = __fsqrt_rd( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. - lsq = (realS.x-realvoxel.x)*(realS.x-realvoxel.x) - + (realS.y-realvoxel.y)*(realS.y-realvoxel.y) - + (realS.z-realvoxel.z)*(realS.z-realvoxel.z); - - weight=__fdividef(L*L*L,(DSD*lsq)); -// weight=1; - // Get Value in the computed (U,V) and multiply by the corresponding weight. 
- // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) - voxelColumn[colIdx]+=sample* weight; - } // END iterating through column of voxels - - } // END iterating through multiple projections - - // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. - // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. - // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is - // better for avoiding memory congestion. 
- } // END copy updated voxels from local array to our 3D volume - -} // END kernelPixelBackprojectionFDK - - - - -//______________________________________________________________________________ -// -// Function: voxel_backprojection -// -// Description: Main host function for FDK backprojection (invokes the kernel) -//______________________________________________________________________________ - -int voxel_backprojection2(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids){ - - - - - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","There are no available device(s) that support CUDA\n"); - } - - - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("Atb:Voxel_backprojection2:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); - } - - int dev; - - - // Split the CT problem - unsigned int split_image; - unsigned int split_projections; - splitCTbackprojection(gpuids,geo,nalpha,&split_image,&split_projections); - - - // Create the arrays for the geometry. The main difference is that geo.offZ has been tuned for the - // image slices. The rest of the Geometry is the same - Geometry* geoArray=(Geometry*)malloc(split_image*deviceCount*sizeof(Geometry)); - createGeoArray(split_image*deviceCount,geo,geoArray,nalpha); - - // Now lest allocate all the image memory on the GPU, so we can use it later. 
If we have made our numbers correctly - // in the previous section this should leave enough space for the textures. - size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); - float** dimage=(float**)malloc(deviceCount*sizeof(float*)); - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); - } - - - //Pagelock memory for synchronous copy. - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to - // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. - if (isHostRegisterSupported & split_image>1){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); - } - if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); - - - - - - //If it is the first time, lets make sure our image is zeroed. 
- int nStreamDevice=2; - int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); - - } - } - - // Kernel auxiliary variables - Point3D* projParamsArray2Host; - cudaMallocHost((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); - float* projSinCosArray2Host; - cudaMallocHost((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); - - // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); - - - - unsigned int proj_split_overlap_number; - // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop - // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution - // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, - // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. - unsigned long long proj_linear_idx_start; - unsigned int current_proj_split_size,current_proj_overlap_split_size; - size_t num_bytes_img_curr; - size_t img_linear_idx_start; - float** partial_projection; - size_t* proj_split_size; - - for(unsigned int img_slice=0;img_slice=proj_split_size[proj_block_split]) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. - if(currProjNumber_global>=nalpha) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. 
- - Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, offDetec,source; - float sinalpha,cosalpha; - - geoArray[img_slice*deviceCount+dev].alpha=-alphas[currProjNumber_global*3];//we got 3 angles now. - geoArray[img_slice*deviceCount+dev].theta=-alphas[currProjNumber_global*3+1]; - geoArray[img_slice*deviceCount+dev].psi =-alphas[currProjNumber_global*3+2]; - - sinalpha=sin(geoArray[img_slice*deviceCount+dev].alpha); - cosalpha=cos(geoArray[img_slice*deviceCount+dev].alpha); - - projSinCosArray2Host[5*j]=sinalpha; // 2*j because we have 2 float (sin or cos angle) values per projection - projSinCosArray2Host[5*j+1]=cosalpha; - projSinCosArray2Host[5*j+2]=geo.COR[currProjNumber_global]; - projSinCosArray2Host[5*j+3]=geo.DSD[currProjNumber_global]; - projSinCosArray2Host[5*j+4]=geo.DSO[currProjNumber_global]; - - computeDeltasCube(geoArray[img_slice*deviceCount+dev],currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); - - offOrig.x=geo.offOrigX[currProjNumber_global]; - offOrig.y=geo.offOrigY[currProjNumber_global]; - offOrig.z=geoArray[img_slice*deviceCount+dev].offOrigZ[currProjNumber_global]; - - offDetec.x=geo.offDetecU[currProjNumber_global]; - offDetec.y=geo.offDetecV[currProjNumber_global]; - offDetec.z=0;//unused - - projParamsArray2Host[7*j] =deltaX; // 7*j because we have 7 Point3D values per projection - projParamsArray2Host[7*j+1]=deltaY; - projParamsArray2Host[7*j+2]=deltaZ; - projParamsArray2Host[7*j+3]=xyzOrigin; - projParamsArray2Host[7*j+4]=offOrig; - projParamsArray2Host[7*j+5]=offDetec; - projParamsArray2Host[7*j+6]=source; - - } // END for (preparing params for kernel call) - - // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArray2Dev, projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaMemcpyToSymbolAsync(projParamsArray2Dev, projParamsArray2Host, 
sizeof(Point3D)*7*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); - kernelPixelBackprojection<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); - - } // END for - ////////////////////////////////////////////////////////////////////////////////////// - // END RB code, Main reconstruction loop: go through projections (rotation angles) and backproject - ////////////////////////////////////////////////////////////////////////////////////// - } - } // END sub-split of current projection chunk - - } // END projection splits - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - matrixConstantMultiply<<<60,MAXTREADS,0,stream[dev*nStreamDevice]>>>( geoArray[img_slice*deviceCount+dev],dimage[dev],geo.dVoxelX*geo.dVoxelY*geo.dVoxelZ/(geo.dDetecU*geo.dDetecV)); - } - - // Now we need to take the image out of the GPU - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); - - num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); - img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); - } - } // end image splits - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - - - // Clean the GPU - bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; - for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) - if (!two_buffers_used && i==1) - break; for (dev = 0; dev < deviceCount; 
dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); - } - } - free(d_cuArrTex); - free(texProj); - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); - } - free(dimage); - - cudaFreeHost(projSinCosArray2Host); - cudaFreeHost(projParamsArray2Host); - free(partial_projection); - free(proj_split_size); - - freeGeoArray(split_image*deviceCount,geoArray); -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & split_image>1){ - cudaHostUnregister(result); - } - if (isHostRegisterSupported){ - cudaHostUnregister(projections); - } -#endif - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); - - cudaCheckErrors("cudaFree fail"); - -// cudaDeviceReset(); // For the Nvidia Visual Profiler - return 0; - -} // END voxel_backprojection - - - - - -void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ - //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - int num_devices = gpuids.GetLength(); -#if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); -#else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); -#endif - if (allocate){ - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); - - } - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[dev]; - 
copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); - } - - //Array creation End - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); - } -} -#ifndef BACKPROJECTION_HPP -void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsigned int* split_image, unsigned int * split_projections){ - - - // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. - - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - const int deviceCount = gpuids.GetLength(); - - // Compute how much memory each of the relevant memory pieces need - size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); - size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); - - - - - // Does everything fit in the GPU? 
- - if(mem_image/deviceCount+mem_proj*PROJ_PER_KERNEL*2x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; - deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; - deltaZ->x=Pz.x-P.x; deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; - - - *xyzorigin=P.to_float(); - *S=source.to_float(); -} // END computeDeltasCube - -void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ - size_t memfree; - size_t memtotal; - const int gpuids.GetLength(); - - for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); - if(dev==0) *mem_GPU_global=memfree; - if(memfree -#include -#include -#include "voxel_backprojection.hpp" -#include "voxel_backprojection_parallel.hpp" - -#include "TIGRE_common.hpp" -#include - -// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ - } \ -} while (0) - - -#define MAXTREADS 1024 - /*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * *--->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ -void CreateTextureParallel( float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool allocate); - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// The optimal values of two constants obtained by RB on NVIDIA Quadro K2200 (4 GB RAM, 640 CUDA cores) for 512^3 volume and 512^3 projections (512 proj, each 512 x 512) were: -// PROJ_PER_KERNEL = 32 or 16 (very similar times) -// VOXELS_PER_THREAD = 8 -// Speedup of the entire FDK backprojection (not only kernel run, also memcpy etc.) was nearly 4x relative to the original (single projection, single voxel per thread) code. -// (e.g. 16.2 s vs. ~62 s). - -const int PROJ_PER_KERNEL = 32; // Number of 2D projections to be analyzed by a single thread. This can be tweaked to see what works best. 32 was the optimal value in the paper by Zinsser and Keck. -const int VOXELS_PER_THREAD = 8; // Number of voxels to be computed by s single thread. Can be tweaked to see what works best. 4 was the optimal value in the paper by Zinsser and Keck. - -// We have PROJ_PER_KERNEL projections and we need 6 parameters for each projection: -// deltaX, deltaY, deltaZ, xyzOrigin, offOrig, offDetec -// So we need to keep PROJ_PER_KERNEL*6 values in our deltas array FOR EACH CALL to our main kernel -// (they will be updated in the main loop before each kernel call). 
- -__constant__ Point3D projParamsArrayDevParallel[6*PROJ_PER_KERNEL]; // Dev means it is on device - -// We also need a corresponding array on the host side to be filled before each kernel call, then copied to the device (array in constant memory above) -// Point3D projParamsArrayHostParallel[6*PROJ_PER_KERNEL]; // Host means it is host memory - -// Now we also need to store sinAlpha and cosAlpha for each projection (two floats per projection) -__constant__ float projSinCosArrayDevParallel[3*PROJ_PER_KERNEL]; - -// float projSinCosArrayHostParallel[3*PROJ_PER_KERNEL]; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// END RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - - -//______________________________________________________________________________ -// -// Function: kernelPixelBackprojectionFDK -// -// Description: Main FDK backprojection kernel -//______________________________________________________________________________ - -__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,cudaTextureObject_t tex) -{ - - // Old kernel call signature: - // kernelPixelBackprojectionFDK<<>>(geo,dimage,i,deltaX,deltaY,deltaZ,xyzOrigin,offOrig,offDetec,sinalpha,cosalpha); - // We just read in most of the params from the constant memory instead of getting them from the param list. - // This is because we now have MANY params, since single kernel processes more than one projection! 
- /* __global__ void kernelPixelBackprojectionFDK(const Geometry geo, - * float* image, - * const int indAlpha, - * const Point3D deltaX , - * const Point3D deltaY, - * const Point3D deltaZ, - * const Point3D xyzOrigin, - * const Point3D xyzOffset, - * const Point3D uv0Offset, - * const float sinalpha, - * const float cosalpha){ - */ - unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long indX = blockIdx.x * blockDim.x + threadIdx.x; - // unsigned long startIndZ = blockIdx.z * blockDim.z + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle - unsigned long long startIndZ = blockIdx.z * VOXELS_PER_THREAD + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle - //Make sure we don't go out of bounds - if (indX>=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) - return; - - // We'll keep a local auxiliary array of values of a column of voxels that this thread will update - float voxelColumn[VOXELS_PER_THREAD]; - - // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then - // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes - - unsigned long colIdx; - - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. 
- } // END copy 3D volume voxels to local array - - // Now iterate through projections - for(unsigned long projNumber=0; projNumber=totalNoOfProjections) - break; - - Point3D deltaX = projParamsArrayDevParallel[6*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaY = projParamsArrayDevParallel[6*projNumber+1]; - Point3D deltaZ = projParamsArrayDevParallel[6*projNumber+2]; - Point3D xyzOrigin = projParamsArrayDevParallel[6*projNumber+3]; - Point3D xyzOffset = projParamsArrayDevParallel[6*projNumber+4]; - Point3D S = projParamsArrayDevParallel[6*projNumber+5]; - - float DSD = projSinCosArrayDevParallel[3*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection - float DSO = projSinCosArrayDevParallel[3*projNumber+1]; - float COR = projSinCosArrayDevParallel[3*projNumber+2]; - - // Geometric trasnformations: - //Source, scaled XYZ coordinates - - // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. - Point3D P; - S.x=DSO; - P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); - P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-COR/geo.dDetecU; - P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); - S.y=P.y;S.z=P.z; - - // This is the vector defining the line from the source to the Voxel - float vectX,vectY,vectZ; - vectX=(P.x -S.x); - vectY=(P.y -S.y); - vectZ=(P.z -S.z); - - // Get the coordinates in the detector UV where the mid point of the voxel is projected. - float t=(DSO-DSD /*-DOD*/ - S.x)/vectX; - float y,z; - y=vectY*t+S.y; - z=vectZ*t+S.z; - float u,v; - u=y+geo.nDetecU/2.0f-0.5f; - v=z+geo.nDetecV/2.0f-0.5f; - - - - // Get Value in the computed (U,V) and multiply by the corresponding weight. 
- // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) -#if IS_FOR_MATLAB_TIGRE - voxelColumn[colIdx]+=tex3D(tex, v+0.5f, u+0.5f ,indAlpha+0.5f); -#else - voxelColumn[colIdx]+=tex3D(tex, u+0.5f, v+0.5f ,indAlpha+0.5f); -#endif - - } // END iterating through column of voxels - - } // END iterating through multiple projections - - // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. - // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. - // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is - // better for avoiding memory congestion. 
- } // END copy updated voxels from local array to our 3D volume - -} // END kernelPixelBackprojectionFDK - - - - -//______________________________________________________________________________ -// -// Function: voxel_backprojection_parallel -// -// Description: Main host function for FDK backprojection (invokes the kernel) -//______________________________________________________________________________ - -int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) -{ - if (gpuids.GetLength() == 0) { - cudaSetDevice(0); - } else { - cudaSetDevice(gpuids[0]); - } - - /* - * Allocate texture memory on the device - */ - // copy data to CUDA memory - //If it is the first time, lets make sure our image is zeroed. - int nStreamDevice=2; - int nStreams=nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; - - for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i]); - - - } - //Pagelock memory for synchronous copy. - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
- int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - if (isHostRegisterSupported){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); - - - // Allocate result image memory - size_t num_bytes = geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ * sizeof(float); - float* dimage; - cudaMalloc((void**)&dimage, num_bytes); - cudaMemset(dimage,0,num_bytes); - cudaCheckErrors("cudaMalloc fail"); - - - Point3D* projParamsArrayHostParallel; - cudaMallocHost((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); - float* projSinCosArrayHostParallel; - cudaMallocHost((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); - - - // Texture buffer objects - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(2*sizeof(cudaArray*)); - - - - unsigned int proj_split_overlap_number; - unsigned int split_projections=1; - // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop - // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution - // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, - // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. - unsigned long long proj_linear_idx_start; - unsigned int current_proj_split_size,current_proj_overlap_split_size; - size_t num_bytes_img_curr; - size_t img_linear_idx_start; - - - current_proj_split_size=nalpha; - // We are going to split it in the same amount of kernels we need to execute. 
- proj_split_overlap_number=(current_proj_split_size+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL; - - - // Create pointer to pointers of projections and precompute their location and size. - - float ** partial_projection=(float**)malloc(current_proj_split_size*sizeof(float*)); - size_t * proj_split_size=(size_t*)malloc(current_proj_split_size*sizeof(size_t*)); - - for(unsigned int proj_block_split=0; proj_block_split=proj_split_size[proj_block_split]) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. - - if(currProjNumber_global>=nalpha) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. - - Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, /*offDetec,*/source; - float sinalpha,cosalpha; - - geo.alpha=-alphas[currProjNumber_global*3]; - geo.theta=-alphas[currProjNumber_global*3+1]; - geo.psi =-alphas[currProjNumber_global*3+2]; - - //sinalpha=sin(geo.alpha); -// cosalpha=cos(geo.alpha); - - projSinCosArrayHostParallel[3*j]=geo.DSD[currProjNumber_global]; // 3*j because we have 3 float (sin or cos angle) values per projection - projSinCosArrayHostParallel[3*j+1]=geo.DSO[currProjNumber_global]; - projSinCosArrayHostParallel[3*j+2]=geo.COR[currProjNumber_global]; - - //computeDeltasCubeParallel(geo,geo.alpha,currProjNumber,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); - computeDeltasCubeParallel(geo,currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); - - offOrig.x=geo.offOrigX[currProjNumber_global]; - offOrig.y=geo.offOrigY[currProjNumber_global]; - - - projParamsArrayHostParallel[6*j]=deltaX; // 6*j because we have 6 Point3D values per projection - projParamsArrayHostParallel[6*j+1]=deltaY; - projParamsArrayHostParallel[6*j+2]=deltaZ; - projParamsArrayHostParallel[6*j+3]=xyzOrigin; - projParamsArrayHostParallel[6*j+4]=offOrig; - projParamsArrayHostParallel[6*j+5]=source; - } // END 
for (preparing params for kernel call) - - // Copy the prepared parameter arrays to constant memory to make it available for the kernel - - cudaMemcpyToSymbolAsync(projSinCosArrayDevParallel, projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projParamsArrayDevParallel, projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); - - kernelPixelBackprojection_parallel<<>>(geo,dimage,i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)]); - } // END for - - ////////////////////////////////////////////////////////////////////////////////////// - // END Main reconstruction loop: go through projections (rotation angles) and backproject - ////////////////////////////////////////////////////////////////////////////////////// - } - cudaDeviceSynchronize(); - cudaMemcpy(result, dimage, num_bytes, cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy result fail"); - - free(partial_projection); - free(proj_split_size); - - bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; - for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) - if (!two_buffers_used && i==1) - break; - cudaDestroyTextureObject(texProj[i]); - cudaFreeArray(d_cuArrTex[i]); - } - free(texProj); - - free(d_cuArrTex); - cudaFreeHost(projSinCosArrayHostParallel); - cudaFreeHost(projParamsArrayHostParallel); - - cudaFree(dimage); - if (isHostRegisterSupported){ - cudaHostUnregister(projections); - } - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); - -// cudaDeviceReset(); - return 0; - -} // END voxel_backprojection - -void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D *S) -{ - - Point3Ddouble P, Px,Py,Pz; - // Get coords of Img(0,0,0) - 
P.x=-(geo.sVoxelX/2-geo.dVoxelX/2)+geo.offOrigX[i]; - P.y=-(geo.sVoxelY/2-geo.dVoxelY/2)+geo.offOrigY[i]; - P.z=-(geo.sVoxelZ/2-geo.dVoxelZ/2)+geo.offOrigZ[i]; - - // Get coors from next voxel in each direction - Px.x=P.x+geo.dVoxelX; Py.x=P.x; Pz.x=P.x; - Px.y=P.y; Py.y=P.y+geo.dVoxelY; Pz.y=P.y; - Px.z=P.z; Py.z=P.z; Pz.z=P.z+geo.dVoxelZ; - - - - // Rotate image around X axis (this is equivalent of rotating the source and detector) RZ RY RZ - eulerZYZT(geo,&P); - eulerZYZT(geo,&Px); - eulerZYZT(geo,&Py); - eulerZYZT(geo,&Pz); - - //detector offset - P.z =P.z-geo.offDetecV[i]; P.y =P.y-geo.offDetecU[i]; - Px.z =Px.z-geo.offDetecV[i]; Px.y =Px.y-geo.offDetecU[i]; - Py.z =Py.z-geo.offDetecV[i]; Py.y =Py.y-geo.offDetecU[i]; - Pz.z =Pz.z-geo.offDetecV[i]; Pz.y =Pz.y-geo.offDetecU[i]; - - //Detector Roll pitch Yaw - // - // - // first, we need to offset everything so (0,0,0) is the center of the detector - // Only X is required for that - P.x=P.x+(geo.DSD[i]-geo.DSO[i]); - Px.x=Px.x+(geo.DSD[i]-geo.DSO[i]); - Py.x=Py.x+(geo.DSD[i]-geo.DSO[i]); - Pz.x=Pz.x+(geo.DSD[i]-geo.DSO[i]); - - rollPitchYawT(geo,i,&P); - rollPitchYawT(geo,i,&Px); - rollPitchYawT(geo,i,&Py); - rollPitchYawT(geo,i,&Pz); - - P.x=P.x-(geo.DSD[i]-geo.DSO[i]); - Px.x=Px.x-(geo.DSD[i]-geo.DSO[i]); - Py.x=Py.x-(geo.DSD[i]-geo.DSO[i]); - Pz.x=Pz.x-(geo.DSD[i]-geo.DSO[i]); - - - Point3Ddouble source; - source.x=0; - source.y=-geo.offDetecU[i]; - source.z=-geo.offDetecV[i]; - - rollPitchYawT(geo,i,&source); - source.x=source.x-(geo.DSD[i]-geo.DSO[i]); - - P.z =P.z /geo.dDetecV; P.y =P.y/geo.dDetecU; - Px.z=Px.z/geo.dDetecV; Px.y=Px.y/geo.dDetecU; - Py.z=Py.z/geo.dDetecV; Py.y=Py.y/geo.dDetecU; - Pz.z=Pz.z/geo.dDetecV; Pz.y=Pz.y/geo.dDetecU; - - source.z=source.z/geo.dDetecV; source.y=source.y/geo.dDetecU; - - // get deltas of the changes in voxels - deltaX->x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; - deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; - deltaZ->x=Pz.x-P.x; 
deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; - - - // cast the results from the double precision calculations back to float - *xyzorigin=P.to_float(); - *S=source.to_float(); - - -} // END computeDeltasCube -void CreateTextureParallel(float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool alloc) -{ - //cudaArray Descriptor -#if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); -#else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); -#endif - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - if (alloc){ - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); - cudaCheckErrors("Texture memory allocation fail"); - } - cudaMemcpy3DParms copyParams = {0}; - - - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[0]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[0+1]); - cudaCheckErrors("Texture memory data copy fail"); - //Array creation End - - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); - cudaCheckErrors("Texture object creation fail"); - -} \ No newline at end of file diff --git a/Common/CUDA/voxel_backprojection_parallel.hpp.prehip 
b/Common/CUDA/voxel_backprojection_parallel.hpp.prehip deleted file mode 100644 index 92b72023..00000000 --- a/Common/CUDA/voxel_backprojection_parallel.hpp.prehip +++ /dev/null @@ -1,57 +0,0 @@ -/*------------------------------------------------------------------------- - * - * Header CUDA function for backrpojection for parallel beam - * - * - * CODE by Ander Biguri - * Optimized and modified by RB - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ -#include "types_TIGRE.hpp" -#include "GpuIds.hpp" - - -#ifndef BACKPROJECTION_PARALLEL_HPP -#define BACKPROJECTION_PARALLEL_HPP - -int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas,int nalpha, const GpuIds& gpuids); -void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D *S); -void createGeoArrayParallel(unsigned int image_splits, Geometry geo,Geometry* geoArray, unsigned int nangles); -// void computeDeltasCube(Geometry geo, float alpha,int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ); -#endif \ No newline at end of file diff --git a/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip b/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip deleted file mode 100644 index e38db7d9..00000000 --- a/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip +++ /dev/null @@ -1,126 +0,0 @@ -/*------------------------------------------------------------------------- - * - * MATLAB MEX functions for Random Number Generator. Check inputs and parses - * MATLAB data to C++ data. 
- * - * - * CODE by Tomoyuki SADAKANE - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - -#include -#include -#include -#include -#include -#include -#include -#include -/** - * MEX gateway - * AddNoise(Im, mu, sigma, "gpuids", gpuids); - * poissrnd(Im)+randn(size(Im)).*sigma + mu; - */ - -void mexFunction(int nlhs, mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ - size_t uiLen = 0; - float fGaussMu = 0; - float fGaussSigma = 0; - - GpuIds gpuids; - if (nrhs==5) { - size_t iM = mxGetM(prhs[4]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:RNG:unknown","5th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[4]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "CBCT:MEX:RNG:unknown","5th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[4]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } else { - int iGpuCount = GetGpuCount(); - int* piDev = (int*)malloc(iGpuCount * sizeof(int)); - for (int iI = 0; iI < iGpuCount; ++iI) { - piDev[iI] = iI; - } - gpuids.SetIds(iGpuCount, piDev); - free(piDev); piDev = 0; - } - if (nrhs < 3) { - mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "At least three input argumet required."); - } else if (nrhs==3 || nrhs==5){ - size_t mrows = mxGetM(prhs[1]); - size_t ncols = mxGetN(prhs[1]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "2nd parameter should be 1x1"); - } - mrows = mxGetM(prhs[2]); - ncols = mxGetN(prhs[2]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "3rd parameter should be 1x1"); - } - fGaussMu = (float)mxGetScalar(prhs[1]); - fGaussSigma = (float)mxGetScalar(prhs[2]); - } else if (nrhs>4) { - mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "Too many input arguments"); - } - /////////////// First input argumet. - // First input should be an array, whose elements are lambda. 
- mxArray const * const image = prhs[0]; - float* pfLambdas = static_cast(mxGetData(image)); - mwSize const numDims = mxGetNumberOfDimensions(image); // get dim of image - const mwSize *size_img= mxGetDimensions(image); //get size of image - uiLen = size_img[0]; // calculate the total length - for (int iI = 1; iI < numDims; ++iI) { - uiLen *= size_img[iI]; - } - ////////////// - //prepare outputs - // Allocte output image - plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); - float *imgout =(float*) mxGetPr(plhs[0]); - // call CUDA rng - poisson_gaussian_1d(pfLambdas, uiLen, fGaussMu, fGaussSigma, imgout, gpuids); -} diff --git a/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip deleted file mode 100644 index da78bfce..00000000 --- a/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip +++ /dev/null @@ -1,367 +0,0 @@ - -/*------------------------------------------------------------------------- - * - * MATLAB MEX gateway for backprojection - * - * This file gets the data from MATLAB, checks it for errors and then - * parses it to C and calls the relevant C/CUDA functions. - * - * CODE by Ander Biguri - * - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. 
- * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - - - - -/** - * MEX gateway - * - * This function takes data from MATLAB and passes it to the MEX code. - * It checks and casts the inputs and prepares teh outputs for MATLAB. - * - * - */ - -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]){ - - //Check amount of inputs - if (nrhs != 5) { - mexErrMsgIdAndTxt("CBCT:MEX:Atb:InvalidInput", "Wrong number of inputs provided"); - } - //////////////////////////// - // 5th argument is array of GPU-IDs. 
- GpuIds gpuids; - { - size_t iM = mxGetM(prhs[4]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:Atb:unknown","5th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[4]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "CBCT:MEX:Atb:unknown","5th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[4]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } - - /* - ** 4th argument is matched or un matched. - */ - bool pseudo_matched=false; // Caled krylov, because I designed it for krylov case.... - /* copy the string data from prhs[0] into a C string input_ buf. */ - char *krylov = mxArrayToString(prhs[3]); - if (!strcmp(krylov,"matched")) // if its 0, they are the same - pseudo_matched=true; - - /* - ** Third argument: angle of projection. - */ - size_t mrows,nangles; - - mrows = mxGetM(prhs[2]); - nangles = mxGetN(prhs[2]); - - - mxArray const * const ptrangles=prhs[2]; - - - double const * const anglesM= static_cast(mxGetData(ptrangles)); - // just copy paste the data to a float array - float * angles= (float*)malloc(nangles*mrows*sizeof(float)); - for (int i=0;i1) && !(numDims==2 && nangles==1) ){ - mexErrMsgIdAndTxt("CBCT:MEX:Atb:InvalidInput", "Projection data is not the right size"); - } - if( !mxIsSingle(prhs[0])) { - mexErrMsgIdAndTxt("CBCT:MEX:Ax:InvalidInput", - "Input image must be a single noncomplex array."); - } - // Now that input is ok, parse it to C data types. - // NOTE: while Number of dimensions is the size of the matrix in Matlab, the data is 1D row-wise mayor. 
- - // We need a float image, and, unfortunately, the only way of casting it is by value -// const mwSize *size_proj= mxGetDimensions(image); //get size of image -// mrows = mxGetM(image); -// nangles = mxGetN(image); -// size_t size_proj2; -// if (nangles==1) -// size_proj2=1; -// else -// size_proj2=size_proj[2]; - - - float * projections= static_cast(mxGetData(image)); - - - - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - /** - * Second input: Geometry structure - */ - mxArray * geometryMex=(mxArray*)prhs[1]; - - // IMPORTANT-> Make sure Matlab creates the struct in this order. - const char *fieldnames[14]; - fieldnames[0] = "nVoxel"; - fieldnames[1] = "sVoxel"; - fieldnames[2] = "dVoxel"; - fieldnames[3] = "nDetector"; - fieldnames[4] = "sDetector"; - fieldnames[5] = "dDetector"; - fieldnames[6] = "DSD"; - fieldnames[7] = "DSO"; - fieldnames[8] = "offOrigin"; - fieldnames[9] = "offDetector"; - fieldnames[10]= "accuracy"; - fieldnames[11]= "mode"; - fieldnames[12]= "COR"; - fieldnames[13]= "rotDetector"; - // Make sure input is structure - - mxArray *tmp; - - // Now we know that all the input struct is good! Parse it from mxArrays to - // C structures that MEX can understand. 
- - double * nVoxel, *nDetec; //we need to cast these to int - double * sVoxel, *dVoxel,*sDetec,*dDetec, *DSO, *DSD,*offOrig,*offDetec; - double *acc, *COR,*rotDetector; - const char* mode; - bool coneBeam=true; - Geometry geo; - int c; - geo.unitX=1;geo.unitY=1;geo.unitZ=1; - for(int ifield=0; ifield<14; ifield++) { - tmp=mxGetField(geometryMex,0,fieldnames[ifield]); - if(tmp==NULL){ - //tofix - continue; - } - switch(ifield){ - case 0: - nVoxel=(double *)mxGetData(tmp); - // copy data to MEX memory - geo.nVoxelX=(int)nVoxel[0]; - geo.nVoxelY=(int)nVoxel[1]; - geo.nVoxelZ=(int)nVoxel[2]; - break; - case 1: - sVoxel=(double *)mxGetData(tmp); - geo.sVoxelX=(float)sVoxel[0]; - geo.sVoxelY=(float)sVoxel[1]; - geo.sVoxelZ=(float)sVoxel[2]; - break; - case 2: - dVoxel=(double *)mxGetData(tmp); - geo.dVoxelX=(float)dVoxel[0]; - geo.dVoxelY=(float)dVoxel[1]; - geo.dVoxelZ=(float)dVoxel[2]; - break; - case 3: - nDetec=(double *)mxGetData(tmp); - geo.nDetecU=(int)nDetec[0]; - geo.nDetecV=(int)nDetec[1]; - break; - case 4: - sDetec=(double *)mxGetData(tmp); - geo.sDetecU=(float)sDetec[0]; - geo.sDetecV=(float)sDetec[1]; - break; - case 5: - dDetec=(double *)mxGetData(tmp); - geo.dDetecU=(float)dDetec[0]; - geo.dDetecV=(float)dDetec[1]; - break; - case 6: - geo.DSD=(float*)malloc(nangles * sizeof(float)); - DSD=(double *)mxGetData(tmp); - for (int i=0;i -#include -#include -#include -#include -#include -#include -#include -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ -///////// First check if the amount of inputs is right. 
- int maxIter; - float alpha; - GpuIds gpuids; - if (nrhs==5) { - size_t iM = mxGetM(prhs[4]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[4]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[4]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } else { - int iGpuCount = GetGpuCount(); - int* piDev = (int*)malloc(iGpuCount * sizeof(int)); - for (int iI = 0; iI < iGpuCount; ++iI) { - piDev[iI] = iI; - } - gpuids.SetIds(iGpuCount, piDev); - free(piDev); piDev = 0; - } - if (nrhs==1){ - maxIter=100; - alpha=15.0f; - } else if (nrhs==2){ - mexErrMsgIdAndTxt("err", "Only 1 POCS hyperparameter inputted"); - } else if (nrhs==4 || nrhs==5){ - size_t mrows = mxGetM(prhs[1]); - size_t ncols = mxGetN(prhs[1]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1"); - } - mrows = mxGetM(prhs[2]); - ncols = mxGetN(prhs[2]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1"); - } - alpha= (float)(mxGetScalar(prhs[1])); - maxIter=(int)floor(mxGetScalar(prhs[2])+0.5); - } else { - mexErrMsgIdAndTxt("err", "Too many input arguments"); - } - float delta=(float)(mxGetScalar(prhs[3])); -////////////////////////// First input. - // First input should be x from (Ax=b), or the image. - mxArray const * const image = prhs[0]; - mwSize const numDims = mxGetNumberOfDimensions(image); - mwSize third_dim = 1; - - // Now that input is ok, parse it to C data types. 
- float * img = static_cast(mxGetData(image)); - const mwSize *size_img= mxGetDimensions(image); //get size of image - - // Image should be dim 3 - if (numDims==3){ - third_dim = size_img[2]; - } - - // Allocte output image - plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); - float *imgout =(float*) mxGetPr(plhs[0]); - // call C function with the CUDA denoising - - const long imageSize[3]={size_img[0], size_img[1], third_dim }; - - aw_pocs_tv(img,imgout, alpha, imageSize, maxIter, delta, gpuids); - - //prepareotputs -} diff --git a/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip deleted file mode 100644 index 3c6f3670..00000000 --- a/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip +++ /dev/null @@ -1,338 +0,0 @@ -/*------------------------------------------------------------------------- - * - * MATLAB MEX gateway for projection - * - * This file gets the data from MATLAB, checks it for errors and then - * parses it to C and calls the relevant C/CUDA functions. - * - * CODE by Ander Biguri - * - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * MEX gateway - */ - - - -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ -// clock_t begin, end; -// begin = clock(); - - - //Check amount of inputs - if (nrhs != 5) { - mexErrMsgIdAndTxt("CBCT:MEX:Ax:InvalidInput", "Invalid number of inputs to MEX file."); - } - //////////////////////////// - // 5th argument is array of GPU-IDs. 
- GpuIds gpuids; - { - size_t iM = mxGetM(prhs[4]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","5th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[4]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","5th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[4]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } - //////////////////////////// - // 4th argument is interpolated or ray-voxel/Siddon - bool rayvoxel=false; - if ( mxIsChar(prhs[3]) != 1) - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:InvalidInput","4rd input should be a string"); - - /* copy the string data from prhs[0] into a C string input_ buf. */ - char *krylov = mxArrayToString(prhs[3]); - if (strcmp(krylov,"interpolated") && strcmp(krylov,"Siddon") && strcmp(krylov,"ray-voxel")) - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:InvalidInput","4rd input should be either 'interpolated' or 'Siddon'"); - else - // If its not ray-voxel, its "interpolated" - if (strcmp(krylov,"Siddon") == 0 || strcmp(krylov,"ray-voxel") == 0) //strcmp returs 0 if they are equal - rayvoxel=true; - ///////////////////////// 3rd argument: angle of projection. - - size_t mrows = mxGetM(prhs[2]); - size_t nangles = mxGetN(prhs[2]); - - mxArray const * const ptrangles=prhs[2]; - - - double const * const anglesM= static_cast(mxGetData(ptrangles)); - // just copy paste the data to a float array - float * angles= (float*)malloc(nangles*mrows*sizeof(float)); - for (int i=0;i(mxGetData(image)); - // We need a float image, and, unfortunately, the only way of casting it is by value - const mwSize *size_img= mxGetDimensions(image); //get size of image - - - - ///////////////////// Second input argument, - // Geometry structure that has all the needed geometric data. - - - mxArray * geometryMex=(mxArray*)prhs[1]; - - // IMPORTANT-> Make sure Matlab creates the struct in this order. 
- const char *fieldnames[14]; - fieldnames[0] = "nVoxel"; - fieldnames[1] = "sVoxel"; - fieldnames[2] = "dVoxel"; - fieldnames[3] = "nDetector"; - fieldnames[4] = "sDetector"; - fieldnames[5] = "dDetector"; - fieldnames[6] = "DSD"; - fieldnames[7] = "DSO"; - fieldnames[8] = "offOrigin"; - fieldnames[9] = "offDetector"; - fieldnames[10]= "accuracy"; - fieldnames[11]= "mode"; - fieldnames[12]= "COR"; - fieldnames[13]= "rotDetector"; - - // Now we know that all the input struct is good! Parse it from mxArrays to - // C structures that MEX can understand. - double * nVoxel, *nDetec; //we need to cast these to int - double * sVoxel, *dVoxel,*sDetec,*dDetec, *DSO, *DSD; - double *offOrig,*offDetec,*rotDetector; - double * acc, *COR; - const char* mode; - int c; - mxArray *tmp; - Geometry geo; - geo.unitX=1;geo.unitY=1;geo.unitZ=1; - bool coneBeam=true; -// mexPrintf("%d \n",nfields); - for(int ifield=0; ifield<14; ifield++) { - tmp=mxGetField(geometryMex,0,fieldnames[ifield]); - if(tmp==NULL){ - //tofix - continue; - } - switch(ifield){ - case 0: - nVoxel=(double *)mxGetData(tmp); - // copy data to MEX memory - geo.nVoxelX=(int)nVoxel[0]; - geo.nVoxelY=(int)nVoxel[1]; - geo.nVoxelZ=(int)nVoxel[2]; - break; - case 1: - sVoxel=(double *)mxGetData(tmp); - geo.sVoxelX=(float)sVoxel[0]; - geo.sVoxelY=(float)sVoxel[1]; - geo.sVoxelZ=(float)sVoxel[2]; - break; - case 2: - dVoxel=(double *)mxGetData(tmp); - geo.dVoxelX=(float)dVoxel[0]; - geo.dVoxelY=(float)dVoxel[1]; - geo.dVoxelZ=(float)dVoxel[2]; - break; - case 3: - nDetec=(double *)mxGetData(tmp); - geo.nDetecU=(int)nDetec[0]; - geo.nDetecV=(int)nDetec[1]; - break; - case 4: - sDetec=(double *)mxGetData(tmp); - geo.sDetecU=(float)sDetec[0]; - geo.sDetecV=(float)sDetec[1]; - break; - case 5: - dDetec=(double *)mxGetData(tmp); - geo.dDetecU=(float)dDetec[0]; - geo.dDetecV=(float)dDetec[1]; - break; - case 6: - geo.DSD=(float*)malloc(nangles * sizeof(float)); - DSD=(double *)mxGetData(tmp); - for (int i=0;i -#include -#include 
-#include -#include -#include -#include -#include -// #include -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ -///////// First check if the amount of imputs is rigth. - int maxIter; - float alpha; - float ratio; - GpuIds gpuids; - if (nrhs<5) - mexErrMsgIdAndTxt("TIGRE:minPICCS", "At least 2 inputs needed: Image and prior image"); - if (nrhs>6){ - mexErrMsgIdAndTxt("TIGRE:minPICCS", "Too many imput argumets"); - } - if (nrhs==6){ - size_t mrows = mxGetM(prhs[2]); - size_t ncols = mxGetN(prhs[2]); - if (mrows!=1 || ncols !=1) - mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); - mrows = mxGetM(prhs[3]); - ncols = mxGetN(prhs[3]); - if (mrows!=1 || ncols !=1) - mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); - mrows = mxGetM(prhs[4]); - ncols = mxGetN(prhs[4]); - if (mrows!=1 || ncols !=1) - mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); - alpha= (float)(mxGetScalar(prhs[2])); - maxIter=(int)floor(mxGetScalar(prhs[3])+0.5); - ratio= (float)(mxGetScalar(prhs[4])); - - size_t uiGpuCount = mxGetN(prhs[5]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "TIGRE:minPICCS","6th parameter must be a row vector"); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[5]); - gpuids.SetIds(uiGpuCount, piGpuIds); - }else{ - int iGpuCount = GetGpuCount(); - int* piDev = (int*)malloc(iGpuCount * sizeof(int)); - for (int iI = 0; iI < iGpuCount; ++iI) { - piDev[iI] = iI; - } - gpuids.SetIds(iGpuCount, piDev); - free(piDev); piDev = 0; - } - if (nrhs==2){ - maxIter=100; - alpha=15.0f; - ratio=0.5; - } - - -////////////////////////// First input. - // First input should be x from (Ax=b), or the image. 
- mxArray const * const image = prhs[0]; - mwSize const numDims = mxGetNumberOfDimensions(image); - if (numDims!=3){ - mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image is not 3D"); - } - mxArray const * const prior_mex = prhs[1]; - mwSize const numDims_prior = mxGetNumberOfDimensions(image); - if (numDims_prior!=3){ - mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image is not 3D"); - } - if(numDims_prior!=numDims) - mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image and prior are not the same size"); - // Image should be dim 3 - - // Now that input is ok, parse it to C data types. - float const * const img = static_cast(mxGetData(image)); - float const * const prior = static_cast(mxGetData(prior_mex)); - const mwSize *size_img= mxGetDimensions(image); //get size of image - - - // Allocte output image - const long imageSize[3]={size_img[0] ,size_img[1],size_img[2] }; - plhs[0] = mxCreateNumericArray(3,size_img, mxSINGLE_CLASS, mxREAL); - float *imgout =(float*) mxGetPr(plhs[0]); - - - piccs_tv(img,prior,imgout, alpha,ratio, imageSize, maxIter,gpuids); - - - -} \ No newline at end of file diff --git a/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip b/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip deleted file mode 100644 index da60446c..00000000 --- a/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip +++ /dev/null @@ -1,132 +0,0 @@ -/* -/*------------------------------------------------------------------------- - * - * MATLAB MEX gateway for Total variation minimization via Steepest descend - * - * This file gets the data from MATLAB, checks it for errors and then - * parses it to C and calls the relevant C/CUDA functions. - * - * CODE by Ander Biguri - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - - -#include -#include -#include -#include -#include -#include -#include -#include -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ -///////// First check if the amount of inputs is right. 
- int maxIter; - float alpha; - GpuIds gpuids; - if (nrhs==4) { - size_t iM = mxGetM(prhs[3]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[3]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "TIGRE:minTV","4th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[3]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } else { - int iGpuCount = GetGpuCount(); - int* piDev = (int*)malloc(iGpuCount * sizeof(int)); - for (int iI = 0; iI < iGpuCount; ++iI) { - piDev[iI] = iI; - } - gpuids.SetIds(iGpuCount, piDev); - free(piDev); piDev = 0; - } - if (nrhs==1){ - maxIter=100; - alpha=15.0f; - } else if (nrhs==2){ - mexErrMsgIdAndTxt("minTV:mex", "Only 1 POCS hyperparameter inputted"); - } else if (nrhs==3 || nrhs==4){ - size_t mrows = mxGetM(prhs[1]); - size_t ncols = mxGetN(prhs[1]); - if (mrows!=1 || ncols !=1) - mexErrMsgIdAndTxt("minTV:mex", "POCS parameters should be 1x1"); - mrows = mxGetM(prhs[2]); - ncols = mxGetN(prhs[2]); - if (mrows!=1 || ncols !=1) - mexErrMsgIdAndTxt("minTV:mex", "POCS parameters should be 1x1"); - alpha= (float)(mxGetScalar(prhs[1])); - maxIter=(int)floor(mxGetScalar(prhs[2])+0.5); - } else { - mexErrMsgIdAndTxt("minTV:mex", "Too many input arguments"); - } - -////////////////////////// First input. - // First input should be x from (Ax=b), or the image. - mxArray const * const image = prhs[0]; - mwSize const numDims = mxGetNumberOfDimensions(image); - mwSize third_dim = 1; - - - // Now that input is ok, parse it to C data types. 
- float * img = static_cast(mxGetData(image)); - const mwSize *size_img = mxGetDimensions(image); //get size of image - - // Image should be dim 3 - if (numDims==3){ - third_dim = size_img[2]; - } - - // Allocte output image - const long imageSize[3]={size_img[0] ,size_img[1], third_dim }; - plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); - float *imgout =(float*) mxGetPr(plhs[0]); - - pocs_tv(img,imgout, alpha, imageSize, maxIter, gpuids); -} diff --git a/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip deleted file mode 100644 index 1142a5f7..00000000 --- a/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip +++ /dev/null @@ -1,124 +0,0 @@ -/*-------------------------------------------------------------------------- --------------------------------------------------------------------------- - This file is part of the TIGRE Toolbox - - Copyright (c) 2015, University of Bath and - CERN-European Organization for Nuclear Research - All rights reserved. - - License: Open Source under BSD. - See the full license at - https://github.com/CERN/TIGRE/blob/master/LICENSE - - Contact: tigre.toolbox@gmail.com - Codes: https://github.com/CERN/TIGRE/ - Coded by: Stefanie Kaser, Benjamin Kirchmayer ---------------------------------------------------------------------------*/ - -#include "mex.h" -#include "CUDA/improvedForwardProjections.hpp" -#include -#include -#include - - -void mexFunction(int nlhs, mxArray *plhs[], int nrhs,const mxArray *prhs[]){ - - if (nrhs =! 
7){ - mexErrMsgIdAndTxt("CS Projections:", "Check Number of Input arguments!"); - } - - float *posIn, *posOut, *dirIn, *dirOut; - float *Wepl, *pixelSize, *detectorDistanceIn, *detectorDistanceOut, *initEnergy; - - //Load parameters - posIn = (float *)(mxGetPr(prhs[0])); - posOut = (float *)mxGetPr(prhs[1]); - dirIn = (float *)mxGetPr(prhs[2]); - dirOut = (float *)mxGetPr(prhs[3]); - Wepl = (float*) mxGetPr(prhs[4]); - initEnergy = (float*) mxGetPr(prhs[5]); - - //Get Number of Protons contained in the root files - int numOfProtons = (int) mxGetM(prhs[4]); - - mxArray * geometryMex=(mxArray*)prhs[6]; - - const char *fieldnames_geo[7]; - fieldnames_geo[0] = "dDetector"; - fieldnames_geo[1] = "DSD"; - fieldnames_geo[2] = "DSID"; - fieldnames_geo[3] = "DSO"; - fieldnames_geo[4] = "hull"; - fieldnames_geo[5] = "sDetector"; - fieldnames_geo[6] = "mode"; - - double * pix0, *dsd0, *dsid0, *hull0, *det0, *dso0; - float pix[2], dsd, dsid, dso, hull[4], det[2]; - const char* mode; - bool coneBeam = true; - mxArray *tmp; - for (int ifield=0; ifield<7; ifield++){ - tmp=mxGetField(geometryMex,0,fieldnames_geo[ifield]); - switch(ifield){ - case 0: - pix0 =(double *)mxGetData(tmp); - pix[0] = (float)pix0[0]; - pix[1] = (float)pix0[1]; - break; - case 1: - dsd0 =(double *)mxGetData(tmp); - dsd = (float)dsd0[0]; - break; - case 2: - dsid0 =(double *)mxGetData(tmp); - dsid = (float)dsid0[0]; - break; - case 3: - dso0 =(double *)mxGetData(tmp); - dso = (float)dso0[0]; - break; - case 4: - hull0 =(double *)mxGetData(tmp); - hull[0] = (float)hull0[0]; - hull[1] = (float)hull0[1]; - hull[2] = (float)hull0[2]; - hull[3] = (float)hull0[3]; - break; - case 5: - det0 =(double *)mxGetData(tmp); - det[0] = (float)det0[0]; - det[1] = (float)det0[1]; - break; - case 6: - mode=""; - mode=mxArrayToString(tmp); - if (!strcmp(mode,"parallel")) - coneBeam=false; - break; - } - } - - - if (hull[3] == 0){std::cout << "Info: Calculation of optimized proton radiographies will be performed without object 
hull!" << std::endl;} - - if (hull[2] > 6.28318530717958648){std::cout << "Info: Hull rotation angle exceeds 2 Pi. Please check the input! Continuing with calculation..." << std::endl;} - - mwSize outSize[2]; - outSize[0] = int(det[1]/pix[1]); - outSize[1] = int(det[0]/pix[0]); - plhs[0] = mxCreateNumericArray(2, outSize, mxSINGLE_CLASS, mxREAL); - float *outProjections = (float*)mxGetPr(plhs[0]); - - //For Calculation 2 historgrams are needed - // - if(coneBeam == false){ - std::cout << "Info: Parallel geometry selected..." << std::endl; - ParticleProjections(outProjections, posIn, posOut, dirIn, dirOut, Wepl, numOfProtons, int(det[0]/pix[0]), int(det[1]/pix[1]), pix, dsid-dso, dsd-dso, *initEnergy, hull); - } - else{ - std::cout << "Info: Cone beam geometry selected..." << std::endl; - ParticleProjectionsCone(outProjections, posIn, posOut, dirIn, dirOut, Wepl, numOfProtons, int(det[0]/pix[0]), int(det[1]/pix[1]), pix, dsid-dso, dsd-dso, -1*dso, *initEnergy, hull); - } - -} diff --git a/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip b/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip deleted file mode 100644 index f905bcbd..00000000 --- a/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip +++ /dev/null @@ -1,147 +0,0 @@ -/*------------------------------------------------------------------------- - * - * MATLAB MEX functions for TV image denoising. Check inputs and parses - * MATLAB data to C++ data. - * - * - * CODE by Ander Biguri - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. 
Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - - - -#include -#include -#include -#include -#include -#include -#include -#include -/** - * MEX gateway - */ -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ - int maxIter; - float lambda; - GpuIds gpuids; - if (nrhs==4) { - size_t iM = mxGetM(prhs[3]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[3]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[3]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } else { - int iGpuCount = GetGpuCount(); - int* piDev = (int*)malloc(iGpuCount * sizeof(int)); - for (int iI = 0; iI < iGpuCount; ++iI) { - piDev[iI] = iI; - } - gpuids.SetIds(iGpuCount, piDev); - free(piDev); piDev = 0; - } - if (nrhs == 0) { - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "At least one input argumet required."); - } else if (nrhs==1){ - maxIter=100; - lambda=15.0f; - } else if (nrhs==2){ - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Only 1 TV hyperparameter inputted"); - } else if (nrhs==3 || nrhs==4){ - size_t mrows = mxGetM(prhs[1]); - size_t ncols = mxGetN(prhs[1]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "TV parameters should be 1x1"); - } - mrows = mxGetM(prhs[2]); - ncols = mxGetN(prhs[2]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "TV parameters should be 1x1"); - } - lambda= (float)(mxGetScalar(prhs[1])); - maxIter=(int)round(mxGetScalar(prhs[2])); - } else if (nrhs>4) { - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Too many input arguments"); - } - ////////////////////////// First input. 
- // First input should be x from (Ax=b), or the image. - mxArray const * const image = prhs[0]; - mwSize const numDims = mxGetNumberOfDimensions(image); - - // Image should be dim 3 - if (numDims!=3){ - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Image is not 3D"); - } - // Now that input is ok, parse it to C data types. - float * img = static_cast(mxGetData(image)); - // We need a float image, and, unfortunately, the only way of casting it is by value - const mwSize *size_img= mxGetDimensions(image); //get size of image - - ////////////// - //prepareotputs - plhs[0] = mxCreateNumericArray(3,size_img, mxSINGLE_CLASS, mxREAL); - float *imgout =(float*) mxGetPr(plhs[0]); - // Allocte output image - // call C function with the CUDA denoising - const float spacing[3]={1,1,1}; - const long imageSize[3]={size_img[0] ,size_img[1],size_img[2] }; - - tvdenoising(img,imgout, lambda, spacing, imageSize, maxIter, gpuids); - - - -// memcpy(mxImgout,imgout,size_img[0] *size_img[1] *size_img[2]*sizeof(float)); - //free memory -// free(img); -// free(imgout); - - -} From f610f8c9c6627df9e42585cf04dc95d87c01e781 Mon Sep 17 00:00:00 2001 From: purepani Date: Wed, 19 Mar 2025 19:38:13 -0500 Subject: [PATCH 3/3] Successful compilation of HIP --- Common/CUDA/GD_AwTV.cu | 4 +- Common/CUDA/GD_TV.cu | 4 +- Common/CUDA/RandomNumberGenerator.cu | 3 +- Common/CUDA/Siddon_projection.cu | 10 +- Common/CUDA/Siddon_projection_parallel.cu | 2 +- Common/CUDA/ray_interpolated_projection.cu | 4 +- .../ray_interpolated_projection_parallel.cu | 4 +- Common/CUDA/voxel_backprojection.cu | 4 +- Common/CUDA/voxel_backprojection2.cu | 4 +- Common/CUDA/voxel_backprojection_parallel.cu | 4 +- Python/setup.py | 102 +++++++++--------- 11 files changed, 75 insertions(+), 70 deletions(-) diff --git a/Common/CUDA/GD_AwTV.cu b/Common/CUDA/GD_AwTV.cu index 03956111..e899b196 100644 --- a/Common/CUDA/GD_AwTV.cu +++ b/Common/CUDA/GD_AwTV.cu @@ -542,7 +542,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const 
long* image_size, int ma size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; hipStreamSynchronize(stream[dev*nStream_device+1]); - reduceNorm2 << > >(d_norm2[dev], d_norm2aux[dev], total_pixels); + reduceNorm2 <<>>(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ @@ -553,7 +553,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; if (dimgridRed > 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); + reduceSum <<<1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device]>>>(d_norm2aux[dev], d_norm2[dev], dimgridRed); hipStreamSynchronize(stream[dev*nStream_device]); hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } diff --git a/Common/CUDA/GD_TV.cu b/Common/CUDA/GD_TV.cu index 4086e951..9777bbc2 100644 --- a/Common/CUDA/GD_TV.cu +++ b/Common/CUDA/GD_TV.cu @@ -526,7 +526,7 @@ do { \ size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; hipStreamSynchronize(stream[dev*nStream_device+1]); - reduceNorm2 << > >(d_norm2[dev], d_norm2aux[dev], total_pixels); + reduceNorm2 <<>>(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ @@ -537,7 +537,7 @@ do { \ size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; if (dimgridRed > 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); + reduceSum <<<1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device]>>>(d_norm2aux[dev], d_norm2[dev], dimgridRed); hipStreamSynchronize(stream[dev*nStream_device]); hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } diff --git a/Common/CUDA/RandomNumberGenerator.cu 
b/Common/CUDA/RandomNumberGenerator.cu index 5910b407..e4e7c283 100644 --- a/Common/CUDA/RandomNumberGenerator.cu +++ b/Common/CUDA/RandomNumberGenerator.cu @@ -48,7 +48,8 @@ #include #include #include -#include +#include +#include #include "gpuUtils.hpp" #include "RandomNumberGenerator.hpp" diff --git a/Common/CUDA/Siddon_projection.cu b/Common/CUDA/Siddon_projection.cu index 8e551626..94b9eb1d 100644 --- a/Common/CUDA/Siddon_projection.cu +++ b/Common/CUDA/Siddon_projection.cu @@ -230,16 +230,16 @@ __global__ void kernelPixelDetector( Geometry geo, float ac=am; //eq (28), unit anlges float axu,ayu,azu; - axu=__frcp_rd(fabsf(ray.x)); - ayu=__frcp_rd(fabsf(ray.y)); - azu=__frcp_rd(fabsf(ray.z)); + axu=__frcp_rn(fabsf(ray.x)); + ayu=__frcp_rn(fabsf(ray.y)); + azu=__frcp_rn(fabsf(ray.z)); // eq(29), direction of update float iu,ju,ku; iu=(source.x< pixel1D.x)? 1.0f : -1.0f; ju=(source.y< pixel1D.y)? 1.0f : -1.0f; ku=(source.z< pixel1D.z)? 1.0f : -1.0f; - float maxlength=__fsqrt_rd(ray.x*ray.x*geo.dVoxelX*geo.dVoxelX+ray.y*ray.y*geo.dVoxelY*geo.dVoxelY+ray.z*ray.z*geo.dVoxelZ*geo.dVoxelZ); + float maxlength=__fsqrt_rn(ray.x*ray.x*geo.dVoxelX*geo.dVoxelX+ray.y*ray.y*geo.dVoxelY*geo.dVoxelY+ray.z*ray.z*geo.dVoxelZ*geo.dVoxelZ); float sum=0.0f; unsigned long Np=(imax-imin+1)+(jmax-jmin+1)+(kmax-kmin+1); // Number of intersections // Go iterating over the line, intersection by intersection. 
If double point, no worries, 0 will be computed @@ -601,7 +601,7 @@ void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipA //hipArray Descriptor hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } for (unsigned int dev = 0; dev < num_devices; dev++){ diff --git a/Common/CUDA/Siddon_projection_parallel.cu b/Common/CUDA/Siddon_projection_parallel.cu index 65e04a92..a6c50130 100644 --- a/Common/CUDA/Siddon_projection_parallel.cu +++ b/Common/CUDA/Siddon_projection_parallel.cu @@ -491,7 +491,7 @@ void CreateTextureParallel(float* image,Geometry geo,hipArray** d_cuArrTex, hipT //hipArray Descriptor hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); hipMemcpy3DParms copyParams = {0}; diff --git a/Common/CUDA/ray_interpolated_projection.cu b/Common/CUDA/ray_interpolated_projection.cu index 8ab4a7e7..cfafb99b 100644 --- a/Common/CUDA/ray_interpolated_projection.cu +++ b/Common/CUDA/ray_interpolated_projection.cu @@ -162,7 +162,7 @@ template P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); // Length is the ray length in normalized space - float length=__fsqrt_rd((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); + float length=__fsqrt_rn((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); //now legth is an integer of Nsamples that are required on this line length=ceilf(__fdividef(length,geo.accuracy));//Divide the directional vector by an integer vectX=__fdividef(P.x -source.x,length); @@ -561,7 +561,7 @@ void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry ge hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - 
hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); cudaCheckErrors("Texture memory allocation fail"); } diff --git a/Common/CUDA/ray_interpolated_projection_parallel.cu b/Common/CUDA/ray_interpolated_projection_parallel.cu index 4793821f..45cd3984 100644 --- a/Common/CUDA/ray_interpolated_projection_parallel.cu +++ b/Common/CUDA/ray_interpolated_projection_parallel.cu @@ -419,7 +419,7 @@ void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex //hipArray Descriptor hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); hipMemcpy3DParms copyParams = {0}; @@ -447,4 +447,4 @@ void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex texDescr.readMode = hipReadModeElementType; hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); -} \ No newline at end of file +} diff --git a/Common/CUDA/voxel_backprojection.cu b/Common/CUDA/voxel_backprojection.cu index 8fb9df3c..b525d23c 100644 --- a/Common/CUDA/voxel_backprojection.cu +++ b/Common/CUDA/voxel_backprojection.cu @@ -247,7 +247,7 @@ __global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,co weight=__fdividef(DSO+realy*sinalpha-realx*cosalpha,DSO); - weight=__frcp_rd(weight*weight); + weight=__frcp_rn(weight*weight); // Get Value in the computed (U,V) and multiply by the corresponding weight. // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) 
@@ -680,7 +680,7 @@ void CreateTexture(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipA //hipArray Descriptor hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } diff --git a/Common/CUDA/voxel_backprojection2.cu b/Common/CUDA/voxel_backprojection2.cu index 43091e78..814422a8 100644 --- a/Common/CUDA/voxel_backprojection2.cu +++ b/Common/CUDA/voxel_backprojection2.cu @@ -272,7 +272,7 @@ __global__ void kernelPixelBackprojection(const Geometry geo, float* image,const realD.y=-realDaux.x*sinalpha + realDaux.y*cosalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) float L,lsq; - L = __fsqrt_rd( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. + L = __fsqrt_rn( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. lsq = (realS.x-realvoxel.x)*(realS.x-realvoxel.x) + (realS.y-realvoxel.y)*(realS.y-realvoxel.y) + (realS.z-realvoxel.z)*(realS.z-realvoxel.z); @@ -665,7 +665,7 @@ void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,hip //hipArray Descriptor hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } diff --git a/Common/CUDA/voxel_backprojection_parallel.cu b/Common/CUDA/voxel_backprojection_parallel.cu index 58ab9f38..055d27d2 100644 --- a/Common/CUDA/voxel_backprojection_parallel.cu +++ b/Common/CUDA/voxel_backprojection_parallel.cu @@ -595,7 +595,7 @@ void CreateTextureParallel(float* projectiondata,Geometry geo,hipArray** d_cuArr hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array if (alloc){ - hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); 
cudaCheckErrors("Texture memory allocation fail"); } hipMemcpy3DParms copyParams = {0}; @@ -625,4 +625,4 @@ void CreateTextureParallel(float* projectiondata,Geometry geo,hipArray** d_cuArr hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); cudaCheckErrors("Texture object creation fail"); -} \ No newline at end of file +} diff --git a/Python/setup.py b/Python/setup.py index 40bc3b3f..eee483e0 100644 --- a/Python/setup.py +++ b/Python/setup.py @@ -20,7 +20,7 @@ if "--no_pinned_memory" in sys.argv[2:] : no_pinned=True sys.argv.pop(sys.argv.index("--no_pinned_memory")) - + if no_pinned: define_macros.append(("NO_PINNED_MEMORY",None)) @@ -48,10 +48,10 @@ ] COMPUTE_CAPABILITY_ARGS = [ - "-gencode=arch=compute_70,code=compute_70", # allows forward compiling - "--ptxas-options=-v", + #"-gencode=arch=compute_70,code=compute_70", # allows forward compiling + #"--ptxas-options=-v", "-c", - "--default-stream=per-thread", + #"--default-stream=per-thread", ] @@ -65,13 +65,14 @@ def get_cuda_version(cuda_home): return version_str.split(" ")[2][:4] else: version_str = subprocess.check_output( - [os.path.join(cuda_home, "bin", "nvcc"), "--version"] + [os.path.join(cuda_home, "bin", "hipcc"), "--version"] ) version_str = str(version_str).replace("\n", "").replace("\r", "") idx = version_str.find("release") return version_str[idx + len("release ") : idx + len("release ") + 4] except: - raise RuntimeError("Cannot read cuda version file") + pass + #raise RuntimeError("Cannot read cuda version file") def locate_cuda(): @@ -81,16 +82,16 @@ def locate_cuda(): and values giving the absolute path to each directory. Starts by looking for the CUDA_HOME or CUDA_PATH env variable. If not found, everything - is based on finding 'nvcc' in the PATH. + is based on finding 'hipcc' in the PATH. 
""" # Guess #1 - cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or os.environ.get("HIP_PATH") if cuda_home is None: # Guess #2 try: which = "where" if IS_WINDOWS else "which" - nvcc = subprocess.check_output([which, "nvcc"]).decode().rstrip("\r\n") - cuda_home = os.path.dirname(os.path.dirname(nvcc)) + hipcc = subprocess.check_output([which, "hipcc"]).decode().rstrip("\r\n") + cuda_home = os.path.dirname(os.path.dirname(hipcc)) except subprocess.CalledProcessError: # Guess #3 if IS_WINDOWS: @@ -124,24 +125,24 @@ def _is_cuda_file(path): CUDA, CUDA_VERSION = locate_cuda() -cuda_version = 11.0 -try: - cuda_version = float(CUDA_VERSION) -except ValueError: - cuda_list = re.findall('\d+', CUDA_VERSION) - cuda_version = float( str(cuda_list[0] + '.' + cuda_list[1])) - -# Insert CUDA arguments depedning on the version -for item in CC_COMPATIBILITY_TABLE: - support_begin = item[2] - support_end = item[3] - if cuda_version < support_begin: - continue - if cuda_version >= support_end: - continue - str_arg = f"-gencode=arch=compute_{item[0]},code=sm_{item[1]}" - COMPUTE_CAPABILITY_ARGS.insert(0, str_arg) - +#cuda_version = 11.0 +#try: + #cuda_version = float(CUDA_VERSION) +#except ValueError: + #cuda_list = re.findall("\d+", CUDA_VERSION) + #cuda_version = float(str(cuda_list[0] + "." + cuda_list[1])) +# +## Insert CUDA arguments depedning on the version +#for item in CC_COMPATIBILITY_TABLE: + #support_begin = item[2] + #support_end = item[3] + #if cuda_version < support_begin: + ##continue + #if cuda_version >= support_end: + #continue + #str_arg = f"-gencode=arch=compute_{item[0]},code=sm_{item[1]}" + #COMPUTE_CAPABILITY_ARGS.insert(0, str_arg) +# # Obtain the numpy include directory. This logic works across numpy versions. 
try: NUMPY_INCLUDE = numpy.get_include() @@ -153,10 +154,10 @@ def _is_cuda_file(path): COMMON_NVCC_FLAGS = [ - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - "--expt-relaxed-constexpr", + #"-D__CUDA_NO_HALF_OPERATORS__", + #"-D__CUDA_NO_HALF_CONVERSIONS__", + #"-D__CUDA_NO_HALF2_OPERATORS__", + #"--expt-relaxed-constexpr", ] @@ -211,18 +212,20 @@ def build_extensions(self): def unix_wrap_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): # Copy before we make any modifications. cflags = copy.deepcopy(extra_postargs) + cflags.append("-D__HIP_PLATFORM_AMD__") try: original_compiler = self.compiler.compiler_so if _is_cuda_file(src): - nvcc = _join_cuda_home("bin", "nvcc") - if not isinstance(nvcc, list): - nvcc = [nvcc] - self.compiler.set_executable("compiler_so", nvcc) + hipcc = _join_cuda_home("bin", "hipcc") + if not isinstance(hipcc, list): + hipcc = [hipcc] + self.compiler.set_executable("compiler_so", hipcc) + self.compiler.set_executable("compiler", hipcc) if isinstance(cflags, dict): - cflags = cflags["nvcc"] + cflags = cflags["hipcc"] cflags = ( COMMON_NVCC_FLAGS - + ["--compiler-options", "'-fPIC'"] + + ["-fPIC"] + cflags + COMPUTE_CAPABILITY_ARGS ) @@ -237,6 +240,7 @@ def unix_wrap_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): finally: # Put the original compiler back in place. 
self.compiler.set_executable("compiler_so", original_compiler) + self.compiler.set_executable("compiler", original_compiler) def win_wrap_compile( sources, @@ -269,9 +273,9 @@ def spawn(cmd, cflags): src = src_list[0] obj = obj_list[0] if _is_cuda_file(src): - nvcc = _join_cuda_home("bin", "nvcc") + hipcc = _join_cuda_home("bin", "hipcc") if isinstance(cflags, dict): - cflags = cflags["nvcc"] + cflags = cflags["hipcc"] elif not isinstance(cflags, list): cflags = [] @@ -287,7 +291,7 @@ def spawn(cmd, cflags): elif len(macro) == 1: cflags += ["--undefine-macro", macro[0]] - cmd = [nvcc, "-c", src, "-o", obj] + include_list + cflags + cmd = [hipcc, "-c", src, "-o", obj] + include_list + cflags elif isinstance(cflags, dict): cflags = COMMON_MSVC_FLAGS # + self.cflags['cxx'] cmd += cflags @@ -372,7 +376,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -395,7 +399,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -416,7 +420,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -437,7 +441,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", 
runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -458,7 +462,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -475,7 +479,7 @@ def include_headers(filename_list, sdist=False): sdist=sys.argv[1] == "sdist", ), library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -496,10 +500,10 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64", "hiprand"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, - include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], + include_dirs=[NUMPY_INCLUDE, CUDA["include"],"../Common/CUDA/"], )