diff --git a/Common/CUDA/GD_AwTV.cu b/Common/CUDA/GD_AwTV.cu index d98c13c1..e899b196 100644 --- a/Common/CUDA/GD_AwTV.cu +++ b/Common/CUDA/GD_AwTV.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for Steepest descend in POCS-type algorithms. @@ -61,11 +62,11 @@ #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("CBCT:CUDA:GD_TV",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("CBCT:CUDA:GD_TV",hipGetErrorString(__err));\ } \ } while (0) @@ -378,16 +379,16 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // allocate memory in each GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); - cudaMemset( d_image[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); - cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); - cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); - cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); - cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); + hipMalloc((void**)&d_image[dev] , mem_img_each_GPU); + hipMemset( d_image[dev],0 , mem_img_each_GPU); + hipMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); + hipMemset( d_dimgTV[dev],0 , mem_img_each_GPU); + hipMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); + hipMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); + hipMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); + hipMemset( d_norm2aux[dev],0 , mem_auxiliary); cudaCheckErrors("Malloc error"); @@ -397,7 +398,7 @@ void aw_pocs_tv(float* img,float* 
dst,float alpha,const long* image_size, int ma if(splits>1){ mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + hipHostMalloc((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); } @@ -406,12 +407,12 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -420,12 +421,12 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // Create streams int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; 
++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } cudaCheckErrors("Stream creation fail"); @@ -437,7 +438,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma double totalsum; float sum_curr_spl; float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + hipHostMalloc((void**)&sumnorm2,deviceCount*sizeof(float)); unsigned int curr_slices; unsigned long long curr_pixels; @@ -476,28 +477,28 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma if(i==0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // if we need to split and its not the first iteration, then we need to copy from Host memory the previosu result. 
if (splits>1 & i>0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } cudaCheckErrors("Memcpy failure on multi split"); @@ -509,7 +510,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (curr_slices+buffer_length*2+blockGrad.z-1)/blockGrad.z); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); + hipStreamSynchronize(stream[dev*nStream_device+1]); + reduceNorm2 <<>>(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + reduceSum <<<1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device]>>>(d_norm2aux[dev], d_norm2[dev], dimgridRed); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } else { 
- cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Reduction error"); @@ -586,7 +587,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Scalar operations error"); //SUBSTRACT GRADIENT ////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), hipMemcpyHostToDevice); } } }else{ // We need to take it out :( for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); + hipHostUnregister(img); + hipHostUnregister(dst); } for (int i = 
0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; cudaCheckErrors("Memory free"); -// cudaDeviceReset(); +// hipDeviceReset(); } void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ @@ -697,8 +698,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ size_t memtotal; const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree1){ mexWarnMsgIdAndTxt("minimizeTV:GD_TV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + hipHostMalloc((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); } @@ -390,12 +391,12 @@ do { \ // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -404,12 +405,12 @@ do { \ // Create streams int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } cudaCheckErrors("Stream creation fail"); @@ -421,7 +422,7 @@ do { \ double totalsum; float sum_curr_spl; float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + hipHostMalloc((void**)&sumnorm2,deviceCount*sizeof(float)); unsigned int curr_slices; unsigned long long curr_pixels; @@ -460,28 +461,28 @@ do { \ if(i==0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], 
img+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // if we need to split and its not the first iteration, then we need to copy from Host memory the previosu result. if (splits>1 & i>0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } cudaCheckErrors("Memcpy failure on multi split"); @@ -493,7 +494,7 @@ do { \ dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (curr_slices+buffer_length*2+blockGrad.z-1)/blockGrad.z); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); + hipStreamSynchronize(stream[dev*nStream_device+1]); + reduceNorm2 <<>>(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), 
cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + reduceSum <<<1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device]>>>(d_norm2aux[dev], d_norm2[dev], dimgridRed); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } else { - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Reduction error"); @@ -570,7 +571,7 @@ do { \ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Scalar operations error"); //SUBSTRACT GRADIENT ////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost); + 
hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), hipMemcpyHostToDevice); } } }else{ // We need to take it out :( for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); + hipHostUnregister(img); + hipHostUnregister(dst); } for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Memory free"); - cudaDeviceReset(); + hipDeviceReset(); } void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ @@ -686,8 +687,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ size_t memtotal; int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree #include -#include +#include GpuIds::~GpuIds() { free(m_piDeviceIds); m_piDeviceIds = nullptr; @@ -52,12 +52,12 @@ void GpuIds::SetAllGpus(int iTotalDeviceCount) { bool GpuIds::AreEqualDevices() const { int deviceCount = this->GetLength(); - const int devicenamelength = 256; // The length 256 is fixed by spec of cudaDeviceProp::name + const int devicenamelength = 256; // The length 256 is fixed by spec of hipDeviceProp_t::name char devicename[devicenamelength]; - cudaDeviceProp deviceProp; + hipDeviceProp_t deviceProp; for (int dev = 0; dev < deviceCount; dev++) { - // cudaSetDevice(m_piDeviceIds[dev]); - cudaGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); + // hipSetDevice(m_piDeviceIds[dev]); + hipGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); if (dev>0) { if (strcmp(devicename, deviceProp.name) != 0) { return false; diff --git a/Common/CUDA/PICCS.cu b/Common/CUDA/PICCS.cu index 481ede08..e447b375 100644 --- 
a/Common/CUDA/PICCS.cu +++ b/Common/CUDA/PICCS.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for Steepest descend in POCS-type algorithms. @@ -60,10 +61,10 @@ Codes : https://github.com/CERN/TIGRE #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("ERROR in: %s \n",msg);\ - mexErrMsgIdAndTxt("err",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("err",hipGetErrorString(__err));\ } \ } while (0) @@ -263,9 +264,9 @@ do { \ bool isnan_cuda(float* vec, size_t size){ bool*d_nan; bool h_nan; - cudaMalloc((void **)&d_nan, sizeof (bool)); + hipMalloc((void **)&d_nan, sizeof (bool)); isnan_device<<<60,MAXTHREADS>>>(vec,size,d_nan); - cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost); + hipMemcpy(&h_nan, d_nan, sizeof(bool), hipMemcpyDeviceToHost); return h_nan; } @@ -281,24 +282,24 @@ bool isnan_cuda(float* vec, size_t size){ float *d_image,*d_prior,*d_dpiccsTV, *d_dimgTV,*d_aux_small,*d_aux_image, *d_norm2; // memory for image - cudaMalloc(&d_image, mem_size); - cudaMalloc(&d_prior, mem_size); + hipMalloc(&d_image, mem_size); + hipMalloc(&d_prior, mem_size); cudaCheckErrors("Malloc Image error"); - cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice); - cudaMemcpy(d_prior, prior, mem_size, cudaMemcpyHostToDevice); + hipMemcpy(d_image, img, mem_size, hipMemcpyHostToDevice); + hipMemcpy(d_prior, prior, mem_size, hipMemcpyHostToDevice); cudaCheckErrors("Memory Malloc and Memset: SRC"); // memory for df - cudaMalloc(&d_dimgTV, mem_size); - cudaMalloc(&d_dpiccsTV, mem_size); + hipMalloc(&d_dimgTV, mem_size); + hipMalloc(&d_dpiccsTV, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); - cudaMalloc(&d_norm2, mem_size); + hipMalloc(&d_norm2, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); - 
cudaMalloc(&d_aux_image, mem_size); + hipMalloc(&d_aux_image, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); // memory for L2norm auxiliar - cudaMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); + hipMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); cudaCheckErrors("Memory Malloc and Memset: NORMAux"); @@ -315,64 +316,64 @@ bool isnan_cuda(float* vec, size_t size){ for(unsigned int i=0;i>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Gradient"); // mexPrintf("Gradient is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,(1-ratio), total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); substractArrays<<<60,MAXTHREADS>>>(d_aux_image,d_prior, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Substraction error"); gradientTV<<>>(d_aux_image,d_dpiccsTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Gradient"); // mexPrintf("Gradient piccs is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dpiccsTV,ratio, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); // mexPrintf("Multiplication is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); addArrays<<<60,MAXTHREADS>>>(d_dimgTV,d_dpiccsTV,total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); //NOMRALIZE via reduction //mexPrintf("Pre-norm2 is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? 
"true" : "false"); - cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice); + hipMemcpy(d_norm2, d_dimgTV, mem_size, hipMemcpyDeviceToDevice); cudaCheckErrors("Copy from gradient call error"); reduceNorm2 << > >(d_norm2, d_aux_small, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("reduce1"); if (dimgridRed > 1) { reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_aux_small, d_norm2, dimgridRed); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("reduce2"); - cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); + hipMemcpy(&sumnorm2, d_norm2, sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy"); } else { - cudaMemcpy(&sumnorm2, d_aux_small, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); + hipMemcpy(&sumnorm2, d_aux_small, sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy"); } // mexPrintf("alpha/sqrt(sumnorm2): %f\n",alpha/sqrt(sumnorm2)); //MULTIPLY HYPERPARAMETER sqrt(sumnorm2) multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha/sqrt(sumnorm2), total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); //SUBSTRACT GRADIENT substractArrays <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Substraction error"); // mexPrintf("Final update is nan: %s\n",isnan_cuda(d_image,total_pixels) ? 
"true" : "false"); // mexPrintf("\n"); @@ -381,18 +382,18 @@ bool isnan_cuda(float* vec, size_t size){ cudaCheckErrors("TV minimization"); - cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost); + hipMemcpy(dst, d_image, mem_size, hipMemcpyDeviceToHost); cudaCheckErrors("Copy result back"); - cudaFree(d_image); - cudaFree(d_dpiccsTV); - cudaFree(d_aux_image); - cudaFree(d_aux_small); - cudaFree(d_prior); - cudaFree(d_norm2); + hipFree(d_image); + hipFree(d_dpiccsTV); + hipFree(d_aux_image); + hipFree(d_aux_small); + hipFree(d_prior); + hipFree(d_norm2); cudaCheckErrors("Memory free"); - cudaDeviceReset(); + hipDeviceReset(); } diff --git a/Common/CUDA/RandomNumberGenerator.cu b/Common/CUDA/RandomNumberGenerator.cu index d7d1224a..e4e7c283 100644 --- a/Common/CUDA/RandomNumberGenerator.cu +++ b/Common/CUDA/RandomNumberGenerator.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for random number generator @@ -45,40 +46,41 @@ #include #include -#include -#include -#include +#include +#include +#include +#include #include "gpuUtils.hpp" #include "RandomNumberGenerator.hpp" #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("RandomNumberGenerator:",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("RandomNumberGenerator:",hipGetErrorString(__err));\ } \ } while (0) -__global__ void setup_kernel(curandState *state) { +__global__ void setup_kernel(hiprandState *state) { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Each thread gets same seed, a different sequence number, no offset */ - curand_init(1234, idx, 0, &state[idx]); + hiprand_init(1234, idx, 0, &state[idx]); } -__global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t 
uiLen, float* pfOut) { +__global__ void GeneratePoisson(hiprandState *state, const float* pfIn, size_t uiLen, float* pfOut) { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; + hiprandState localState = state[idx]; int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); for (int iI = 0; iI < iIter; ++iI) { size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; if (uiPos < uiLen) { /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]); pfOut[uiPos] = (float)uiPoisson; } } @@ -86,7 +88,7 @@ __global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t ui state[idx] = localState; } -__global__ void GeneratePoissonAddGaussian(curandState *state, +__global__ void GeneratePoissonAddGaussian(hiprandState *state, const float* pfIn, size_t uiLen, float fGaussMu, @@ -95,15 +97,15 @@ __global__ void GeneratePoissonAddGaussian(curandState *state, { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; + hiprandState localState = state[idx]; int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); for (int iI = 0; iI < iIter; ++iI) { size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; if (uiPos < uiLen) { /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]); /* Gaussian */ - float fNormal = curand_normal(&localState) * fGaussSigma + fGaussMu; + float fNormal = hiprand_normal(&localState) * fGaussSigma + fGaussMu; pfOut[uiPos] = fNormal + (float)uiPoisson; } } @@ -127,31 +129,31 @@ void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpu // printf("poisson_1d(pfIn = %p, uiLen = %zd, pfOut = %p)\n", pfIn, uiLen, pfOut); float* d_pfIn = nullptr; float* 
d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - cudaCheckErrors("poisson_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_1d fail cudaMemcpy 1"); + hipMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail hipMalloc 1"); + hipMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail hipMalloc 2"); + hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice); + cudaCheckErrors("poisson_1d fail hipMemcpy 1"); // float fMin, fMax; // GetMinMax(pfIn, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; + hiprandState *curandStates = nullptr; const int kiBlockDim = 1024; // Threads per Block const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; - cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); - cudaCheckErrors("poisson_1d fail cudaMalloc 3"); + hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState)); + cudaCheckErrors("poisson_1d fail hipMalloc 3"); setup_kernel<<>>(curandStates); GeneratePoisson<<>>(curandStates, d_pfIn, uiLen, d_pfOut); - cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_1d fail cudaMemcpy 2"); + hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("poisson_1d fail hipMemcpy 2"); // GetMinMax(pfOut, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; + hipFree(d_pfIn); d_pfIn = nullptr; + hipFree(d_pfOut); d_pfOut = nullptr; + hipFree(curandStates); curandStates = nullptr; } void poisson_gaussian_1d(const float* pfIn, @@ -164,30 +166,30 
@@ void poisson_gaussian_1d(const float* pfIn, // printf("poisson_gaussian_1d(pfIn = %p, uiLen = %zd, fGaussMu = %+f, fGaussSigma = %f, pfOut = %p)\n", pfIn, uiLen, fGaussMu, fGaussSigma, pfOut); float* d_pfIn = nullptr; float* d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 1"); + hipMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 1"); + hipMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 2"); + hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice); + cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 1"); // float fMin, fMax; // GetMinMax(pfIn, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; + hiprandState *curandStates = nullptr; const int kiBlockDim = 64; // Threads per Block const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; - cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 3"); + hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 3"); setup_kernel<<<kiGridDim, kiBlockDim>>>(curandStates); GeneratePoissonAddGaussian<<<kiGridDim, kiBlockDim>>>(curandStates, d_pfIn, uiLen, fGaussMu, fGaussSigma, d_pfOut); - cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 2"); + hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 2"); // GetMinMax(pfOut, uiLen, fMin, fMax); // printf("fMin,
fMax = %f, %f\n", fMin, fMax); - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; + hipFree(d_pfIn); d_pfIn = nullptr; + hipFree(d_pfOut); d_pfOut = nullptr; + hipFree(curandStates); curandStates = nullptr; } diff --git a/Common/CUDA/Siddon_projection.cu b/Common/CUDA/Siddon_projection.cu index 2a025f8c..94b9eb1d 100644 --- a/Common/CUDA/Siddon_projection.cu +++ b/Common/CUDA/Siddon_projection.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for ray-voxel intersection based projection @@ -48,18 +49,18 @@ */ #include -#include -#include +#include +#include #include "Siddon_projection.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("Ax:Siddon_projection",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("Ax:Siddon_projection",hipGetErrorString(__err));\ } \ } while (0) @@ -94,7 +95,7 @@ do { \ * **/ - void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc); + void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool alloc); __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device @@ -111,7 +112,7 @@ __global__ void kernelPixelDetector( Geometry geo, float* detector, const int currProjSetNumber, const int totalNoOfProjections, - cudaTextureObject_t tex){ + hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; @@ -229,16 +230,16 @@ __global__ void kernelPixelDetector( Geometry geo, float ac=am; //eq (28), unit anlges float axu,ayu,azu; - 
axu=__frcp_rd(fabsf(ray.x)); - ayu=__frcp_rd(fabsf(ray.y)); - azu=__frcp_rd(fabsf(ray.z)); + axu=__frcp_rn(fabsf(ray.x)); + ayu=__frcp_rn(fabsf(ray.y)); + azu=__frcp_rn(fabsf(ray.z)); // eq(29), direction of update float iu,ju,ku; iu=(source.x< pixel1D.x)? 1.0f : -1.0f; ju=(source.y< pixel1D.y)? 1.0f : -1.0f; ku=(source.z< pixel1D.z)? 1.0f : -1.0f; - float maxlength=__fsqrt_rd(ray.x*ray.x*geo.dVoxelX*geo.dVoxelX+ray.y*ray.y*geo.dVoxelY*geo.dVoxelY+ray.z*ray.z*geo.dVoxelZ*geo.dVoxelZ); + float maxlength=__fsqrt_rn(ray.x*ray.x*geo.dVoxelX*geo.dVoxelX+ray.y*ray.y*geo.dVoxelY*geo.dVoxelY+ray.z*ray.z*geo.dVoxelZ*geo.dVoxelZ); float sum=0.0f; unsigned long Np=(imax-imin+1)+(jmax-jmin+1)+(kmax-kmin+1); // Number of intersections // Go iterating over the line, intersection by intersection. If double point, no worries, 0 will be computed @@ -311,10 +312,10 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * if (!fits_in_memory){ dProjection_accum=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); - cudaMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); + hipMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); + hipMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); cudaCheckErrors("cudaMallocauxiliarty projections fail"); } } @@ -323,12 +324,12 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // This is happening regarthless if the image fits on memory float** dProjection=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); - cudaMemset(dProjection[dev*2+i] ,0,num_bytes_proj); - cudaCheckErrors("cudaMalloc projections fail"); + 
hipMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); + hipMemset(dProjection[dev*2+i] ,0,num_bytes_proj); + cudaCheckErrors("hipMalloc projections fail"); } } @@ -338,13 +339,13 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } #endif cudaCheckErrors("Error pinning memory"); @@ -354,18 +355,18 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // auxiliary variables Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); cudaCheckErrors("Error allocating auxiliary constant memory"); // Create Streams for overlapping memcopy and compute int nStreams=deviceCount*2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - 
cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i+dev*2]); + hipStreamCreate(&stream[i+dev*2]); } } @@ -376,8 +377,8 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * unsigned int noOfKernelCalls = (nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. int projection_this_block; - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + hipTextureObject_t *texImg = new hipTextureObject_t[deviceCount]; + hipArray **d_cuArrTex = new hipArray*[deviceCount]; for (unsigned int sp=0;sp>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); } @@ -450,7 +451,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // 1) grab previous results and put them in the auxiliary variable dProjection_accum for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -463,12 +464,12 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else projection_this_block=PROJ_PER_BLOCK; - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + hipMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyHostToDevice,stream[dev*2+1]); } // 2) take the results from current compute call and add it to the code in execution. 
for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -481,7 +482,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else projection_this_block=PROJ_PER_BLOCK; - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + hipStreamSynchronize(stream[dev*2+1]); // wait until copy is finished vecAddInPlace<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); } } // end accumulation case, where the image needs to be split @@ -490,7 +491,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * if (i>0){ for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on previous set on this GPU proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; if (dev+1==deviceCount) { //is it the last device? @@ -510,13 +511,13 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else { projection_this_block=PROJ_PER_BLOCK; } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } } // Make sure Computation on kernels has finished before we launch the next batch. 
for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*2]); } } @@ -524,7 +525,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // We still have the last set of projections to get out of GPUs for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -533,106 +534,106 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. + hipDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } // Make sure everyone has done their bussiness before the next image split: - cudaDeviceSynchronize(); + hipDeviceSynchronize(); } // End image split loop. 
cudaCheckErrors("Main loop fail"); /////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texImg[dev]); + hipFreeArray(d_cuArrTex[dev]); } delete[] texImg; texImg = 0; delete[] d_cuArrTex; d_cuArrTex = 0; // Freeing Stage for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection[dev*2]); + hipFree(dProjection[dev*2+1]); } free(dProjection); if(!fits_in_memory){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection_accum[dev*2]); + hipFree(dProjection_accum[dev*2+1]); } free(dProjection_accum); } freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); + hipHostFree(projParamsArrayHost); for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostUnregister(img); + hipHostUnregister(img); } - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); #endif - //cudaDeviceReset(); + //hipDeviceReset(); return 0; } -void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc) +void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool alloc) { //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = 
make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); const unsigned int num_devices = gpuids.GetLength(); if(alloc){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params); } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = 
cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModePoint; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Texture object creation fail"); } @@ -842,8 +843,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree -#include -#include +#include +#include #include "Siddon_projection_parallel.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",hipGetErrorString(__err));\ } \ } while (0) // Declare the texture reference. 
-void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); +void CreateTextureParallel(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream); #define MAXTREADS 1024 @@ -105,7 +106,7 @@ __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is o __global__ void kernelPixelDetector_parallel( Geometry geo, - float* detector, const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex){ + float* detector, const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; @@ -282,23 +283,23 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa size_t num_bytes = (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)PROJ_PER_BLOCK* (size_t)sizeof(float); float** dProjection=(float **)malloc(2*sizeof(float *)); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[i], num_bytes); + cudaCheckErrors("hipMalloc projections fail"); } int nStreams=2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + hipTextureObject_t *texImg = 0; + hipArray **d_cuArrTex = 0; + texImg =(hipTextureObject_t*)malloc(1*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(1*sizeof(hipArray*)); CreateTextureParallel(img,geo,&d_cuArrTex[0], &texImg 
[0],stream); cudaCheckErrors("Texture allocation fail"); @@ -310,7 +311,7 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); // 16x16 gave the best performance empirically // Funnily that makes it compatible with most GPUs..... @@ -349,36 +350,36 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDev), projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelDetector_parallel<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); // copy result to host if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, hipMemcpyDeviceToHost,stream[1]); } - cudaDeviceSynchronize(); + hipDeviceSynchronize(); int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[1]); - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); + hipDestroyTextureObject(texImg[0]); + hipFreeArray(d_cuArrTex[0]); free(texImg); texImg = 0; free(d_cuArrTex); d_cuArrTex = 0; cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); + 
hipFree(dProjection[0]); + hipFree(dProjection[1]); free(dProjection); - cudaFreeHost(projParamsArrayHost); - cudaCheckErrors("cudaFree d_imagedata fail"); + hipHostFree(projParamsArrayHost); + cudaCheckErrors("hipFree d_imagedata fail"); for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); } -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } @@ -482,41 +483,41 @@ void computeDeltas_Siddon_parallel(Geometry geo, float angles,int i, Point3D* uv *source=S2; } -void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; +void CreateTextureParallel(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[1]); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); 
- texRes.resType = cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModePoint; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); } diff --git a/Common/CUDA/errors.hpp b/Common/CUDA/errors.hpp index 05518b20..16bece09 100644 --- a/Common/CUDA/errors.hpp +++ b/Common/CUDA/errors.hpp @@ -1,4 +1,4 @@ -#define CUDA_SUCCESS 0 +#define CUDA_SUCCESS 0 #define ERR_CUDA 1 #define ERR_NO_CAPABLE_DEVICES 2 diff --git a/Common/CUDA/gpuUtils.cu b/Common/CUDA/gpuUtils.cu index 8f2754e4..910b7a58 100644 --- a/Common/CUDA/gpuUtils.cu +++ b/Common/CUDA/gpuUtils.cu @@ -1,7 +1,7 @@ #include "gpuUtils.hpp" -#include -#include +#include +#include #include #include @@ -34,11 +34,11 @@ int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, cha return iCudaDeviceCount; } - cudaError_t err; - cudaDeviceProp propDevice; + hipError_t err; + hipDeviceProp_t propDevice; int nMatch = 0; for (int iId = 0; iId < iCudaDeviceCount; ++iId) { - err = cudaGetDeviceProperties(&propDevice, iId); + err = hipGetDeviceProperties(&propDevice, iId); iMessagePos += sprintf(pcMessage + iMessagePos, "propDevice.name = %s\n",
propDevice.name); if (strcmp(propDevice.name, kacGPUName) == 0) { piDeviceIds[nMatch] = iId; @@ -55,16 +55,16 @@ int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, cha void GetGpuName(int iDeviceId, char* pcName) { memset(pcName, 0, 128); - cudaError_t err; - cudaDeviceProp propDevice; + hipError_t err; + hipDeviceProp_t propDevice; int id = iDeviceId; - err = cudaGetDeviceProperties(&propDevice, id); + err = hipGetDeviceProperties(&propDevice, id); memcpy(pcName, propDevice.name, strlen(propDevice.name)*sizeof(char)); } int GetGpuCount() { int iCudaDeviceCount = 0; - cudaGetDeviceCount(&iCudaDeviceCount); + hipGetDeviceCount(&iCudaDeviceCount); return iCudaDeviceCount; } diff --git a/Common/CUDA/improvedForwardProjections.cu b/Common/CUDA/improvedForwardProjections.cu index 0f32be72..7c5fbddd 100644 --- a/Common/CUDA/improvedForwardProjections.cu +++ b/Common/CUDA/improvedForwardProjections.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * CUDA function for optimized proton CT radiographies * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) @@ -20,19 +21,19 @@ Coded by: Stefanie Kaser, Benjamin Kirchmayer --------------------------------------------------------------------------*/ -#include +#include #include "mex.h" -#include +#include #include "improvedForwardProjections.hpp" #include #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",hipGetErrorString(__err));\ } \ } while (0) @@ -937,43 +938,43 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po } //Allocate Memory on GPU - 
cudaMalloc( (void**) &dPosIn, sizeInputs ); - cudaMalloc( (void**) &dPosOut, sizeInputs ); - cudaMalloc( (void**) &ddirIn, sizeInputs ); - cudaMalloc( (void**) &ddirOut, sizeInputs ); - cudaMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); - cudaMalloc( (void**) &dhist1, detectorMem ); - cudaMalloc( (void**) &dhist2, detectorMem ); - cudaMalloc( (void**) &dnumEntries, sizeof(int)); - cudaMalloc( (void**) &ddetectorX, sizeof(int)); - cudaMalloc( (void**) &ddetectorY, sizeof(int)); - cudaMalloc( (void**) &dpixelSize, 2*sizeof(float)); - cudaMalloc( (void**) &dDetectDistIn, sizeof(float)); - cudaMalloc( (void**) &dDetectDistOut, sizeof(float)); - cudaMalloc( (void**) &dEin, sizeof(float)); - cudaMalloc( (void**) &dReject, sizeof(float)); - cudaMalloc( (void**) &dHull, 5*sizeof(float)); - cudaError_t _err_alloc = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err_alloc)); + hipMalloc( (void**) &dPosIn, sizeInputs ); + hipMalloc( (void**) &dPosOut, sizeInputs ); + hipMalloc( (void**) &ddirIn, sizeInputs ); + hipMalloc( (void**) &ddirOut, sizeInputs ); + hipMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); + hipMalloc( (void**) &dhist1, detectorMem ); + hipMalloc( (void**) &dhist2, detectorMem ); + hipMalloc( (void**) &dnumEntries, sizeof(int)); + hipMalloc( (void**) &ddetectorX, sizeof(int)); + hipMalloc( (void**) &ddetectorY, sizeof(int)); + hipMalloc( (void**) &dpixelSize, 2*sizeof(float)); + hipMalloc( (void**) &dDetectDistIn, sizeof(float)); + hipMalloc( (void**) &dDetectDistOut, sizeof(float)); + hipMalloc( (void**) &dEin, sizeof(float)); + hipMalloc( (void**) &dReject, sizeof(float)); + hipMalloc( (void**) &dHull, 5*sizeof(float)); + hipError_t _err_alloc = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err_alloc)); cudaCheckErrors("GPU Allocation failed!"); //Copy Arrays to GPU - cudaMemcpy(dPosIn, posIn,sizeInputs ,cudaMemcpyHostToDevice); - cudaMemcpy(dPosOut, posOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirIn, 
dirIn,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirOut, dirOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dnumEntries, &numOfEntries,sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorX, &detectSizeX, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorY, &detectSizeY, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(dpixelSize, pixelSize, 2*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dEin, &ein, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dReject, &reject, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dHull, ch_param, 5*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dhist1, hist1, detectorMem, cudaMemcpyHostToDevice); - cudaMemcpy(dhist2, hist2, detectorMem, cudaMemcpyHostToDevice); + hipMemcpy(dPosIn, posIn,sizeInputs ,hipMemcpyHostToDevice); + hipMemcpy(dPosOut, posOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirIn, dirIn,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirOut, dirOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dnumEntries, &numOfEntries,sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorX, &detectSizeX, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorY, &detectSizeY, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(dpixelSize, pixelSize, 2*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dEin, &ein, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dReject, &reject, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dHull, ch_param, 5*sizeof(float), 
hipMemcpyHostToDevice); + hipMemcpy(dhist1, hist1, detectorMem, hipMemcpyHostToDevice); + hipMemcpy(dhist2, hist2, detectorMem, hipMemcpyHostToDevice); cudaCheckErrors("Host to device transport failed!"); @@ -984,8 +985,8 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po ParticleKernel<<>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); + hipError_t _err = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err)); cudaCheckErrors("Kernel fail!"); //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); @@ -993,12 +994,12 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po //sumHist<<>>(dhist1, dhist2); //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + //hipMemcpy(outProjection, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist1, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist2, dhist2,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(&reject, dReject,sizeof(float) ,hipMemcpyDeviceToHost); + //hipError_t _errcp = hipGetLastError(); + //mexPrintf("%s \n", hipGetErrorString(_errcp)); cudaCheckErrors("Device to host transport failed!"); for(int j = 0; j -#include +#include +#include #include #ifndef improvedForwardProjections_H #define improvedForwardProjections_H diff --git a/Common/CUDA/improvedForwardProjections_cone.cu b/Common/CUDA/improvedForwardProjections_cone.cu index 7a4f6b46..d11657a9 100644 --- 
a/Common/CUDA/improvedForwardProjections_cone.cu +++ b/Common/CUDA/improvedForwardProjections_cone.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * CUDA function for optimized proton CT radiographies * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) @@ -21,19 +22,19 @@ --------------------------------------------------------------------------*/ -#include +#include #include "mex.h" -#include +#include #include "improvedForwardProjections.hpp" // #include // #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",hipGetErrorString(__err));\ } \ } while (0) @@ -1133,45 +1134,45 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float } //Allocate Memory on GPU - cudaMalloc( (void**) &dPosIn, sizeInputs ); - cudaMalloc( (void**) &dPosOut, sizeInputs ); - cudaMalloc( (void**) &ddirIn, sizeInputs ); - cudaMalloc( (void**) &ddirOut, sizeInputs ); - cudaMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); - cudaMalloc( (void**) &dhist1, detectorMem ); - cudaMalloc( (void**) &dhist2, detectorMem ); - cudaMalloc( (void**) &dnumEntries, sizeof(int)); - cudaMalloc( (void**) &ddetectorX, sizeof(int)); - cudaMalloc( (void**) &ddetectorY, sizeof(int)); - cudaMalloc( (void**) &dpixelSize, 2*sizeof(float)); - cudaMalloc( (void**) &dDetectDistIn, sizeof(float)); - cudaMalloc( (void**) &dDetectDistOut, sizeof(float)); - cudaMalloc( (void**) &dSourceDist, sizeof(float)); - cudaMalloc( (void**) &dEin, sizeof(float)); - cudaMalloc( (void**) &dReject, sizeof(float)); - cudaMalloc( (void**) &dHull, 5*sizeof(float)); - cudaError_t _err_alloc 
= cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err_alloc)); + hipMalloc( (void**) &dPosIn, sizeInputs ); + hipMalloc( (void**) &dPosOut, sizeInputs ); + hipMalloc( (void**) &ddirIn, sizeInputs ); + hipMalloc( (void**) &ddirOut, sizeInputs ); + hipMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); + hipMalloc( (void**) &dhist1, detectorMem ); + hipMalloc( (void**) &dhist2, detectorMem ); + hipMalloc( (void**) &dnumEntries, sizeof(int)); + hipMalloc( (void**) &ddetectorX, sizeof(int)); + hipMalloc( (void**) &ddetectorY, sizeof(int)); + hipMalloc( (void**) &dpixelSize, 2*sizeof(float)); + hipMalloc( (void**) &dDetectDistIn, sizeof(float)); + hipMalloc( (void**) &dDetectDistOut, sizeof(float)); + hipMalloc( (void**) &dSourceDist, sizeof(float)); + hipMalloc( (void**) &dEin, sizeof(float)); + hipMalloc( (void**) &dReject, sizeof(float)); + hipMalloc( (void**) &dHull, 5*sizeof(float)); + hipError_t _err_alloc = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err_alloc)); cudaCheckErrors("GPU Allocation failed!"); //Copy Arrays to GPU - cudaMemcpy(dPosIn, posIn,sizeInputs ,cudaMemcpyHostToDevice); - cudaMemcpy(dPosOut, posOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirIn, dirIn,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirOut, dirOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dnumEntries, &numOfEntries,sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorX, &detectSizeX, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorY, &detectSizeY, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(dpixelSize, pixelSize, 2*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dSourceDist, &sourcePos, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dEin, &ein, 
sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dReject, &reject, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dHull, ch_param, 5*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dhist1, hist1, detectorMem, cudaMemcpyHostToDevice); - cudaMemcpy(dhist2, hist2, detectorMem, cudaMemcpyHostToDevice); + hipMemcpy(dPosIn, posIn,sizeInputs ,hipMemcpyHostToDevice); + hipMemcpy(dPosOut, posOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirIn, dirIn,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirOut, dirOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dnumEntries, &numOfEntries,sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorX, &detectSizeX, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorY, &detectSizeY, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(dpixelSize, pixelSize, 2*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dSourceDist, &sourcePos, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dEin, &ein, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dReject, &reject, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dHull, ch_param, 5*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dhist1, hist1, detectorMem, hipMemcpyHostToDevice); + hipMemcpy(dhist2, hist2, detectorMem, hipMemcpyHostToDevice); cudaCheckErrors("Host to device transport failed!"); @@ -1182,8 +1183,8 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float ParticleKernelCone<<>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject, dSourceDist); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); + hipError_t _err = hipGetLastError(); 
+ mexPrintf("%s \n", hipGetErrorString(_err)); cudaCheckErrors("Kernel fail!"); //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); @@ -1191,12 +1192,12 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float //sumHist<<>>(dhist1, dhist2); //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + //hipMemcpy(outProjection, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist1, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist2, dhist2,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(&reject, dReject,sizeof(float) ,hipMemcpyDeviceToHost); + //hipError_t _errcp = hipGetLastError(); + //mexPrintf("%s \n", hipGetErrorString(_errcp)); cudaCheckErrors("Device to host transport failed!"); for(int j = 0; j -#include -#include +#include +#include #include "ray_interpolated_projection.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",hipGetErrorString(__err));\ } \ } while (0) @@ -100,7 +101,7 @@ do { \ * * **/ - void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate); + void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool allocate); 
__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device @@ -119,7 +120,7 @@ template float* detector, const int currProjSetNumber, const int totalNoOfProjections, - cudaTextureObject_t tex){ + hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; @@ -161,7 +162,7 @@ template P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); // Length is the ray length in normalized space - float length=__fsqrt_rd((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); + float length=__fsqrt_rn((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); //now legth is an integer of Nsamples that are required on this line length=ceilf(__fdividef(length,geo.accuracy));//Divide the directional vector by an integer vectX=__fdividef(P.x -source.x,length); @@ -255,10 +256,10 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c if (!fits_in_memory){ dProjection_accum=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); - cudaMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); + hipMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); + hipMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); cudaCheckErrors("cudaMallocauxiliarty projections fail"); } } @@ -267,12 +268,12 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c // This is happening regarthless if the image fits on memory float** dProjection=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; 
i < 2; ++i){ - cudaMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); - cudaMemset(dProjection[dev*2+i] ,0,num_bytes_proj); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); + hipMemset(dProjection[dev*2+i] ,0,num_bytes_proj); + cudaCheckErrors("hipMalloc projections fail"); } } @@ -284,34 +285,34 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
#ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & splits>1){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); #endif Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost = 0; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); float* projFloatsArrayHost = 0; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + hipHostMalloc((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); cudaCheckErrors("Error allocating auxiliary constant memory"); // Create Streams for overlapping memcopy and compute int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } @@ -324,8 +325,8 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + hipTextureObject_t *texImg = new hipTextureObject_t[deviceCount]; + hipArray **d_cuArrTex = new hipArray*[deviceCount]; for (unsigned int sp=0;sp=nangles) @@ -419,12 +420,12 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) else 
projection_this_block=PROJ_PER_BLOCK; - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + hipMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyHostToDevice,stream[dev*2+1]); } // 2) take the results from current compute call and add it to the code in execution. for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -436,7 +437,7 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) else projection_this_block=PROJ_PER_BLOCK; - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + hipStreamSynchronize(stream[dev*2+1]); // wait until copy is finished vecAddInPlaceInterp<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); } } // end accumulation case, where the image needs to be split @@ -446,7 +447,7 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c { for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on previous set on this GPU proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; if (dev+1==deviceCount) { //is it the last device? 
@@ -466,21 +467,21 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c else { projection_this_block=PROJ_PER_BLOCK; } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } } // Make sure Computation on kernels has finished before we launch the next batch. for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*2]); } } // End noOfKernelCalls (i) loop. // We still have the last set of projections to get out of GPUs for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -489,15 +490,15 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. + hipDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. 
cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } // Make sure everyone has done their bussiness before the next image split: for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // End image split loop. @@ -505,99 +506,99 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c /////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texImg[dev]); + hipFreeArray(d_cuArrTex[dev]); } delete[] texImg; texImg = 0; delete[] d_cuArrTex; d_cuArrTex = 0; // Freeing Stage for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection[dev*2]); + hipFree(dProjection[dev*2+1]); } free(dProjection); if(!fits_in_memory){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection_accum[dev*2]); + hipFree(dProjection_accum[dev*2+1]); } free(dProjection_accum); } freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); - cudaFreeHost(projFloatsArrayHost); + 
hipHostFree(projParamsArrayHost); + hipHostFree(projFloatsArrayHost); for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & splits>1){ - cudaHostUnregister(img); + hipHostUnregister(img); } #endif - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } -void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate) +void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool allocate) { const unsigned int num_devices = gpuids.GetLength(); //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); if(allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor + //hipArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); cudaCheckErrors("Texture memory allocation fail"); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaMemcpy3DParms copyParams = {0}; - cudaSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = 
extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params); //cudaCheckErrors("Texture memory data copy fail"); //Array creation End } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; if (geo.accuracy>1){ - texDescr.filterMode = cudaFilterModePoint; + texDescr.filterMode = hipFilterModePoint; geo.accuracy=1; } else{ - texDescr.filterMode = cudaFilterModeLinear; + texDescr.filterMode = hipFilterModeLinear; } - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); cudaCheckErrors("Texture object creation fail"); } } @@ -828,8 +829,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ size_t memtotal; int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree -#include 
-#include +#include +#include #include "ray_interpolated_projection_parallel.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",hipGetErrorString(__err));\ } \ } while (0) @@ -96,7 +97,7 @@ do { \ * * **/ -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); +void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream); __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device @@ -104,7 +105,7 @@ __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on __global__ void kernelPixelDetector_parallel_interpolated( Geometry geo, float* detector, - const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) + const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { // Point3D source , // Point3D deltaU, @@ -199,23 +200,23 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul size_t num_bytes = geo.nDetecU*geo.nDetecV *PROJ_PER_BLOCK* sizeof(float); float** dProjection=(float **)malloc(2*sizeof(float *)); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[i], num_bytes); + cudaCheckErrors("hipMalloc projections fail"); } // allocate streams for memory and compute int nStreams=2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + 
hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + hipTextureObject_t *texImg = 0; + hipArray **d_cuArrTex = 0; + texImg =(hipTextureObject_t*)malloc(1*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(1*sizeof(hipArray*)); CreateTextureParallelInterp(img,geo,&d_cuArrTex[0], &texImg[0],stream); cudaCheckErrors("Texture allocation fail"); @@ -226,9 +227,9 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); float* projFloatsArrayHost; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + hipHostMalloc((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); // 16x16 gave the best performance empirically // Funnily that makes it compatible with most GPUs..... 
@@ -266,39 +267,39 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul projFloatsArrayHost[2*j+1]=floor(maxdist); } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDev), projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projFloatsArrayDev), projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelDetector_parallel_interpolated<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); // copy result to host if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, hipMemcpyDeviceToHost,stream[1]); } - cudaDeviceSynchronize(); + hipDeviceSynchronize(); int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[1]); - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); + hipDestroyTextureObject(texImg[0]); + hipFreeArray(d_cuArrTex[0]); free(texImg); texImg = 0; free(d_cuArrTex); d_cuArrTex = 0; cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); + hipFree(dProjection[0]); + hipFree(dProjection[1]); free(dProjection); - cudaFreeHost(projParamsArrayHost); - 
cudaFreeHost(projFloatsArrayHost); + hipHostFree(projParamsArrayHost); + hipHostFree(projFloatsArrayHost); - cudaCheckErrors("cudaFree d_imagedata fail"); + cudaCheckErrors("hipFree d_imagedata fail"); for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); } -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } @@ -410,40 +411,40 @@ void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* u *source=S; } -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; +void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[1]); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = 
cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); - -} \ No newline at end of file + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + +} diff --git a/Common/CUDA/tv_proximal.cu b/Common/CUDA/tv_proximal.cu index 32ae99c2..87d5407f 100644 --- a/Common/CUDA/tv_proximal.cu +++ b/Common/CUDA/tv_proximal.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * MATLAB MEX functions for TV image denoising. 
Check inputs and parses @@ -57,17 +58,17 @@ #include "tv_proximal.hpp" #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - cudaDeviceReset();\ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ + hipDeviceReset();\ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising",hipGetErrorString(__err));\ } \ } while (0) void cpy_from_host(float* device_array,float* host_array, unsigned long long bytes_device,unsigned long long offset_device,unsigned long long offset_host, unsigned long long pixels_per_slice, unsigned int buffer_length, - cudaStream_t stream, bool is_first_chunk, bool is_last_chunk,const long* image_size); + hipStream_t stream, bool is_first_chunk, bool is_last_chunk,const long* image_size); __global__ void multiplyArrayScalar(float* vec,float scalar,const size_t n) @@ -263,11 +264,11 @@ void cpy_from_host(float* device_array,float* host_array, // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif if (isHostRegisterSupported & splits>1){ - cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -282,21 +283,21 @@ void cpy_from_host(float* device_array,float* host_array, if (buffer_length0){ // U - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer_u, d_u[dev-1] +slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer_u, d_u[dev-1] +slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); + hipMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); + 
hipMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); + hipMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+1]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+2]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+3]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+4]); - cudaMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+4]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+1]); + hipMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+1]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+2]); + hipMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+2]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+3]); + hipMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+3]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+4]); + hipMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+4]); } @@ -567,22 +568,22 @@ void cpy_from_host(float* device_array,float* host_array, }else{ // Vopy all the U variable into the host. 
for(dev=0; dev1 && buffer_length1){ - cudaHostUnregister(src); - cudaHostUnregister(dst); + hipHostUnregister(src); + hipHostUnregister(dst); } for(dev=0; dev -#include -#include +#include +#include #include "voxel_backprojection.hpp" #include "TIGRE_common.hpp" #include @@ -55,10 +56,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -91,7 +92,7 @@ do { \ * **/ - void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, int nStreamDevice,bool allocate); + void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, int nStreamDevice,bool allocate); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -134,7 +135,7 @@ __constant__ float projSinCosArrayDev[5*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel //______________________________________________________________________________ -__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { // Old kernel call signature: @@ -246,7 +247,7 @@ __global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,co 
weight=__fdividef(DSO+realy*sinalpha-realx*cosalpha,DSO); - weight=__frcp_rd(weight*weight); + weight=__frcp_rn(weight*weight); // Get Value in the computed (U,V) and multiply by the corresponding weight. // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) @@ -323,16 +324,16 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
#ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } #endif cudaCheckErrors("Error pinning memory"); @@ -348,20 +349,20 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); float** dimage=(float**)malloc(deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); + hipSetDevice(gpuids[dev]); + hipMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("hipMalloc fail"); } //If it is the first time, lets make sure our image is zeroed. 
int nStreamDevice=2; int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); + hipStreamCreate(&stream[i+dev*nStreamDevice]); } } @@ -371,16 +372,16 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // Kernel auxiliary variables Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArrayHost; - cudaMallocHost((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + texProj =(hipTextureObject_t*)malloc(deviceCount*2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(deviceCount*2*sizeof(hipArray*)); // Auxiliary Host page-locked memory for fast and asycnornous memcpy. 
@@ -401,8 +402,8 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa for(unsigned int img_slice=0;img_slice>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); } // END for @@ -551,8 +552,8 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa } // END sub-split of current projection chunk for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // END projection splits @@ -560,15 +561,15 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // Now we need to take the image out of the GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); // We do not need to sycnronize because the array dealocators already do. num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + hipMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, hipMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); cudaCheckErrors("Main loop fail"); } @@ -582,38 +583,38 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa if (!two_buffers_used && i==1) break; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - 
cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texProj[i*deviceCount+dev]); + hipFreeArray(d_cuArrTex[i*deviceCount+dev]); } } cudaCheckErrors("cudadestroy textures result fail"); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); + hipSetDevice(gpuids[dev]); + hipFree(dimage[dev]); } - cudaFreeHost(projSinCosArrayHost); - cudaFreeHost(projParamsArrayHost); + hipHostFree(projSinCosArrayHost); + hipHostFree(projParamsArrayHost); free(partial_projection); free(proj_split_size); freeGeoArray(split_image*deviceCount,geoArray); #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostUnregister(result); + hipHostUnregister(result); } if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } #endif for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); - //cudaDeviceReset(); // For the Nvidia Visual Profiler + //hipDeviceReset(); // For the Nvidia Visual Profiler return 0; } // END voxel_backprojection @@ -664,52 +665,52 @@ void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsign } -void CreateTexture(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ +void CreateTexture(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent 
=make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif const unsigned int num_devices = gpuids.GetLength(); if (allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc<float>(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(&copyParams,stream[dev*nStreamDevice+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(&copyParams,stream[dev*nStreamDevice+1]); } //Array creation End for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - 
texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } } @@ -903,8 +904,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree -#include -#include +#include +#include #include "voxel_backprojection2.hpp" #include "TIGRE_common.hpp" #include @@ -55,10 +56,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -92,7 +93,7 @@ do { \ **/ // this definitionmust go here. 
-void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate); +void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate); __global__ void matrixConstantMultiply(const Geometry geo,float* image,float constant){ size_t idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -139,7 +140,7 @@ __constant__ float projSinCosArray2Dev[5*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel //______________________________________________________________________________ -__global__ void kernelPixelBackprojection(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +__global__ void kernelPixelBackprojection(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; @@ -271,7 +272,7 @@ __global__ void kernelPixelBackprojection(const Geometry geo, float* image,const realD.y=-realDaux.x*sinalpha + realDaux.y*cosalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) float L,lsq; - L = __fsqrt_rd( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. + L = __fsqrt_rn( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. 
lsq = (realS.x-realvoxel.x)*(realS.x-realvoxel.x) + (realS.y-realvoxel.y)*(realS.y-realvoxel.y) + (realS.z-realvoxel.z)*(realS.z-realvoxel.z); @@ -355,9 +356,9 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); float** dimage=(float**)malloc(deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); + hipSetDevice(gpuids[dev]); + hipMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("hipMalloc fail"); } @@ -366,15 +367,15 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
if (isHostRegisterSupported & split_image>1){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -385,27 +386,27 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float //If it is the first time, lets make sure our image is zeroed. int nStreamDevice=2; int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); + hipStreamCreate(&stream[i+dev*nStreamDevice]); } } // Kernel auxiliary variables Point3D* projParamsArray2Host; - cudaMallocHost((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArray2Host; - cudaMallocHost((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + 
texProj =(hipTextureObject_t*)malloc(deviceCount*2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(deviceCount*2*sizeof(hipArray*)); @@ -425,8 +426,8 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float // // Initialize the memory if its the first time. for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemset(dimage[dev],0,num_bytes_img); + hipSetDevice(gpuids[dev]); + hipMemset(dimage[dev],0,num_bytes_img); cudaCheckErrors("memset fail"); } @@ -478,8 +479,8 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float (proj_block_split<2)&!proj&!img_slice);// Only allocate if its the first 2 calls for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*nStreamDevice+1]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*nStreamDevice+1]); } for (dev = 0; dev < deviceCount; dev++){ @@ -489,7 +490,7 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float if(geoArray[img_slice*deviceCount+dev].nVoxelZ==0) break; - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); @@ -566,9 +567,9 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float } // END for (preparing params for kernel call) // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArray2Dev, projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaMemcpyToSymbolAsync(projParamsArray2Dev, projParamsArray2Host, sizeof(Point3D)*7*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projSinCosArray2Dev), projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[dev*nStreamDevice]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArray2Dev), 
projParamsArray2Host, sizeof(Point3D)*7*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[dev*nStreamDevice]); + hipStreamSynchronize(stream[dev*nStreamDevice]); kernelPixelBackprojection<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); } // END for @@ -581,24 +582,24 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float } // END projection splits for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); matrixConstantMultiply<<<60,MAXTREADS,0,stream[dev*nStreamDevice]>>>( geoArray[img_slice*deviceCount+dev],dimage[dev],geo.dVoxelX*geo.dVoxelY*geo.dVoxelZ/(geo.dDetecU*geo.dDetecV)); } // Now we need to take the image out of the GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*nStreamDevice]); num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + hipMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, hipMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); } } // end image splits for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } @@ -607,40 +608,40 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) if (!two_buffers_used && i==1) break; for (dev = 0; dev < 
deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texProj[i*deviceCount+dev]); + hipFreeArray(d_cuArrTex[i*deviceCount+dev]); } } free(d_cuArrTex); free(texProj); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); + hipSetDevice(gpuids[dev]); + hipFree(dimage[dev]); } free(dimage); - cudaFreeHost(projSinCosArray2Host); - cudaFreeHost(projParamsArray2Host); + hipHostFree(projSinCosArray2Host); + hipHostFree(projParamsArray2Host); free(partial_projection); free(proj_split_size); freeGeoArray(split_image*deviceCount,geoArray); #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & split_image>1){ - cudaHostUnregister(result); + hipHostUnregister(result); } if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } #endif for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); -// cudaDeviceReset(); // For the Nvidia Visual Profiler +// hipDeviceReset(); // For the Nvidia Visual Profiler return 0; } // END voxel_backprojection @@ -649,52 +650,52 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float -void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ +void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; int num_devices = gpuids.GetLength(); #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, 
geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif if (allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc<float>(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(&copyParams,stream[dev*nStreamDevice+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(&copyParams,stream[dev*nStreamDevice+1]); } //Array creation End for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); 
texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } } #ifndef BACKPROJECTION_HPP @@ -826,8 +827,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ const int gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree -#include -#include +#include +#include #include "voxel_backprojection.hpp" #include "voxel_backprojection_parallel.hpp" @@ -57,10 +58,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -92,7 +93,7 @@ do { \ * * **/ -void CreateTextureParallel( float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool allocate); +void CreateTextureParallel( float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, bool allocate); 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call @@ -135,7 +136,7 @@ __constant__ float projSinCosArrayDevParallel[3*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel //______________________________________________________________________________ -__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,cudaTextureObject_t tex) +__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,hipTextureObject_t tex) { // Old kernel call signature: @@ -286,9 +287,9 @@ __global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* im int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) { if (gpuids.GetLength() == 0) { - cudaSetDevice(0); + hipSetDevice(0); } else { - cudaSetDevice(gpuids[0]); + hipSetDevice(gpuids[0]); } /* @@ -298,10 +299,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re //If it is the first time, lets make sure our image is zeroed. int nStreamDevice=2; int nStreams=nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } @@ -310,10 +311,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif if (isHostRegisterSupported){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -321,22 +322,22 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // Allocate result image memory size_t num_bytes = geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ * sizeof(float); float* dimage; - cudaMalloc((void**)&dimage, num_bytes); - cudaMemset(dimage,0,num_bytes); - cudaCheckErrors("cudaMalloc fail"); + hipMalloc((void**)&dimage, num_bytes); + hipMemset(dimage,0,num_bytes); + cudaCheckErrors("hipMalloc fail"); Point3D* projParamsArrayHostParallel; - cudaMallocHost((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArrayHostParallel; - cudaMallocHost((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); // Texture buffer objects - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + texProj =(hipTextureObject_t*)malloc(2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(2*sizeof(hipArray*)); @@ -389,7 +390,7 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re (proj_block_split<2));// 
Only allocate if its the first 2 calls - cudaStreamSynchronize(stream[0+1]); + hipStreamSynchronize(stream[0+1]); @@ -464,9 +465,9 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArrayDevParallel, projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projParamsArrayDevParallel, projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projSinCosArrayDevParallel), projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDevParallel), projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelBackprojection_parallel<<>>(geo,dimage,i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)]); } // END for @@ -475,9 +476,9 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // END Main reconstruction loop: go through projections (rotation angles) and backproject ////////////////////////////////////////////////////////////////////////////////////// } - cudaDeviceSynchronize(); - cudaMemcpy(result, dimage, num_bytes, cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy result fail"); + hipDeviceSynchronize(); + hipMemcpy(result, dimage, num_bytes, hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy result fail"); free(partial_projection); free(proj_split_size); @@ -486,23 +487,23 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) if (!two_buffers_used && i==1) break; - cudaDestroyTextureObject(texProj[i]); - 
cudaFreeArray(d_cuArrTex[i]); + hipDestroyTextureObject(texProj[i]); + hipFreeArray(d_cuArrTex[i]); } free(texProj); free(d_cuArrTex); - cudaFreeHost(projSinCosArrayHostParallel); - cudaFreeHost(projParamsArrayHostParallel); + hipHostFree(projSinCosArrayHostParallel); + hipHostFree(projParamsArrayHostParallel); - cudaFree(dimage); + hipFree(dimage); if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } // END voxel_backprojection @@ -583,45 +584,45 @@ void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* } // END computeDeltasCube -void CreateTextureParallel(float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool alloc) +void CreateTextureParallel(float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, bool alloc) { - //cudaArray Descriptor + //hipArray Descriptor #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array if (alloc){ - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); cudaCheckErrors("Texture memory allocation fail"); } - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, 
extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[0+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[0+1]); cudaCheckErrors("Texture memory data copy fail"); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); cudaCheckErrors("Texture object creation fail"); -} \ No newline at end of file +} diff --git a/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip b/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip new file mode 100644 index 00000000..650a9815 --- /dev/null +++ b/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip @@ -0,0 +1,21 @@ +#include +#include +#include +#include + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray 
const *prhs[]) +{ + if (nrhs != 0) { + mexErrMsgIdAndTxt("MATLAB:getGpuCount_mex", "No input requred."); + return; + } + if (nlhs != 1) { + mexErrMsgIdAndTxt("MATLAB:getGpuCount_mex", "Too many output arguments. Returns one integer."); + return; + } + int iCount = GetGpuCount(); + size_t dims[2] = {1,1}; + plhs[0] = mxCreateNumericArray(2, dims, mxUINT32_CLASS, mxREAL); + *((int*)mxGetData(plhs[0])) = iCount; +} diff --git a/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip b/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip new file mode 100644 index 00000000..c56ca29b --- /dev/null +++ b/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip @@ -0,0 +1,29 @@ +#include +#include + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + // Usage: name = getGpuName_mex(int iId) + if (nrhs != 1) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:invalidNumInputs", "One input required."); + return; + } else if(nlhs > 1) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:maxlhs", "Too many output arguments."); + return; + } + + int iId = 0; + if (mxIsDouble(prhs[0])) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:inputNotInt", "Input must be an integer."); + return; + } else { + iId = *((int*)mxGetData(prhs[0])); + } + int iCount = GetGpuCount(); + char* pcName = (char*)mxCalloc(128, sizeof(char)); + if (iId < iCount) { + GetGpuName(iId, pcName); + } + plhs[0] = mxCreateString(pcName); +} diff --git a/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip b/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip new file mode 100644 index 00000000..670c2d3e --- /dev/null +++ b/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip @@ -0,0 +1,28 @@ +#define _CRT_SECURE_NO_WARNINGS + +#include +#include + +// Purpose: To fast read .xim files +// Method: based on ReadXim.m by Fredrik Nordström 2015 +// Date: 2017.07 +// Author: Yi Du, yi.du@hotmail.com + +#ifndef STR_XIM +#define STR_XIM +//struct XimPara +typedef struct XimPara +{ + char FileName[256]; + int ImgWidth; // Image 
Width + int ImgHeight; // Image Height + int PixelNO; + + int BytesPerPixel; // Determine how to read the data + int Compression_Indicator; // Data number in Rec Image Matrix + + double GantryRtn; // Gantry rotation angle + int KVNormChamber; // KV norm chamber reading, date: 2022-05-23 +}XimPara; +#endif + diff --git a/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip b/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip new file mode 100644 index 00000000..453c4278 --- /dev/null +++ b/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip @@ -0,0 +1,357 @@ +#define _CRT_SECURE_NO_WARNINGS + +#include "io64.h" +#include +#include +#include +#include +#include +#include +//**** C data types are defined in tmwtypes.h +#include +#include "mex.h" +#include +#include "matrix.h" +#include "XimPara.hpp" + +#define GET_BIT(x,bit) ((x & (1 << bit)) >>bit) + +// Purpose: To fast read .xim files +// Method: based on ReadXim.m by Fredrik Nordström 2015 +// Date: 2017.07 +// Author: Yi Du, yi.du@hotmail.com + + +int cReadXim(char *XimFullFile, XimPara *XimStr, int *XimImg); + +void mexFunction( + int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + //check input variable + if (mxIsChar(prhs[0]) != 1) + mexErrMsgIdAndTxt( "MATLAB:revord:inputNotString", + "Input must be a string."); + + // .xim filename + char *filename; + filename = mxArrayToString(prhs[0]); + //mexPrintf("%s\n", filename); + + // file open + FILE *fid = fopen(filename, "rb"); + if(fid == NULL) + { + mexErrMsgIdAndTxt("%s fopen failed.\n", filename); + //getchar(); + //exit(1); + } + + // Parameter structure + XimPara *para = new XimPara[1]; + + // file pointer position + //fpos_t position = {0}; + + // Skip useless information + // 8 * sizeof(char) + sizeof(int32_t); + long int position = 8*sizeof(char) + sizeof(int32_T); + fseek ( fid , position , SEEK_SET ); +// setFilePos(fid, (fpos_t*) &position); + // Read ImgWidth & ImgHeight (int32) + fread(&(para->ImgWidth), sizeof(int32_T), 1, fid); 
+ fread(&(para->ImgHeight), sizeof(int32_T), 1, fid); + fclose(fid); + + para->PixelNO = para->ImgWidth * para->ImgHeight; + + int *frame; + plhs[0] = mxCreateNumericMatrix(para->ImgWidth, para->ImgHeight, mxINT32_CLASS, mxREAL); + frame = (int*)mxGetPr(plhs[0]); + + // empty file return + if (para->PixelNO == 0) + { + plhs[1] = mxCreateDoubleScalar(10000); + mexPrintf("%s is an empty file\n", filename); + return; + } + + /******* Kernel Function *********/ + cReadXim(filename, para, frame); + + /**** KVSourceRtn is the only parameter-of-interest to return ****/ + // KVSourceRtn = GantryRtn + 90 deg; + double KVSourceRtn = para->GantryRtn + 90; + plhs[1] = mxCreateDoubleScalar(KVSourceRtn); + + double NormChamberReading = para->KVNormChamber * 1.0; + plhs[2] = mxCreateDoubleScalar(NormChamberReading); + +} + +/************* Kernel Funtion to read .xim ***************/ +// Kernel function +int cReadXim(char *XimFullFile, + XimPara *XimStr, + int *XimImg) +{ + // Read the .xim file name + +// char *ptr = strrchr(XimFullFile, '\\'); +// sprintf(XimStr->FileName, "%s", ptr + 1); + + // ****** Open .xim File Pointer ***********// + FILE *fid = fopen(XimFullFile, "rb"); + + // Syntax Parsing + if (fid == NULL) + { + mexErrMsgIdAndTxt("Error: file %s doesn't exist, at all\n", XimFullFile); + //getchar(); + //exit(1); + } + + // ******* Stage 1: Portal Image Data ****// + // Skip useless information + fseek(fid, 8 * sizeof(char) + sizeof(int32_T), SEEK_CUR); + + // Read ImgWidth & ImgHeight + fread(&(XimStr->ImgWidth), sizeof(int32_T), 1, fid); + fread(&(XimStr->ImgHeight), sizeof(int32_T), 1, fid); + XimStr->PixelNO = (XimStr->ImgWidth)*(XimStr->ImgHeight); + + // Skip the useless information: bits_per_pixel + fseek(fid, sizeof(int32_T), SEEK_CUR); + + // Load .xim file compression parameters + fread(&(XimStr->BytesPerPixel), sizeof(int32_T), 1, fid); + fread(&(XimStr->Compression_Indicator), sizeof(int32_T), 1, fid); + + // Load .xim Pixel Data + if (1 == 
XimStr->Compression_Indicator) + { + int LookUpTableSize = 0; + fread(&LookUpTableSize, sizeof(int), 1, fid); + + int *LookUpTable = new int[XimStr->ImgHeight * XimStr->ImgWidth]; + memset(LookUpTable, 0, XimStr->ImgHeight * XimStr->ImgWidth * sizeof(int)); + + // Load the LookUpTable data + for (int ii = 0; ii < LookUpTableSize; ii++) + { + // Load in the 8-bit date + // Updated: 2021-11-05, Yi Du + uint8_T tmp =0; + fread(&tmp, 1, 1, fid); + int Bit2[4] = { 0 }; + Bit2[0] = GET_BIT(tmp,0) + GET_BIT(tmp,1) *2; + Bit2[1] = GET_BIT(tmp,2) + GET_BIT(tmp,3) *2; + Bit2[2] = GET_BIT(tmp,4) + GET_BIT(tmp,5) *2; + Bit2[3] = GET_BIT(tmp,6) + GET_BIT(tmp,7) *2; + + // extract the lookup_table data + for (int jj = 0; jj < 4; jj++) + { + LookUpTable[ii * 4 + jj] = Bit2[jj]; + } + + /** Old Code with bug + int Bit2[4] = { 0 }; + + // extract the lookup_table data + for (int jj = 0; jj < 8; jj = jj +2) + { + Bit2[jj/2] = ((tmp & 1 << jj) != 0); + // It's 4, because 1 unsigned __int8 in tmp is represented by 4 ints in LookUpTable. + LookUpTable[ii * 4 + jj / 2] = Bit2[jj / 2]; + + //printf("Index = %d, LookUpTable = %d\n", ii * 4 + jj / 2, LookUpTable[ii * 4 + jj / 2]); + } + **/ + } + + // Skip compressed_pixel_buffer_size: passed + fseek(fid, sizeof(int32_T), SEEK_CUR); + + // Allocate memory for XimImg + fread(XimImg, sizeof(int32_T), (XimStr->ImgWidth) + 1, fid); + + // load the compressed pixel data + int delta = 0; + int LUT_Pos = 0; + + // Be very careful with all data types!!! 
+ int8_T tmp8 = 0; + int16_T tmp16 = 0; + int32_T tmp32 = 0; + + for (int ImgTag = XimStr->ImgWidth + 1; + ImgTag < (XimStr->ImgHeight) * (XimStr->ImgWidth); + ImgTag++) + { + if (0 == LookUpTable[LUT_Pos]) + { + fread(&tmp8, sizeof(int8_T), 1, fid); + delta = int(tmp8); + } + else if (1 == LookUpTable[LUT_Pos]) + { + fread(&tmp16, sizeof(int16_T), 1, fid); + delta = int(tmp16); + } + else + { + fread(&tmp32, sizeof(int32_T), 1, fid); + delta = int(tmp32); + } + + XimImg[ImgTag] = delta + XimImg[ImgTag - 1] + + XimImg[ImgTag - XimStr->ImgWidth] + - XimImg[ImgTag - XimStr->ImgWidth - 1]; + + LUT_Pos = LUT_Pos + 1; + } + + // Skip uncompressed_pixel_buffer_size + fseek(fid, sizeof(int32_T), SEEK_CUR); + + } + else + { + // Be careful: the code block for uncompressed pixel data readout hasn't been tested yet. + // Date: 2017-09-12 + int BufferSize = 0; + fread(&BufferSize, sizeof(int), 1, fid); + + switch (XimStr->BytesPerPixel) + { + case 1: + { + uint8_t *buffer8 = new uint8_t[XimStr->ImgWidth * XimStr->ImgHeight]; + memset(buffer8, 0, sizeof(uint8_t)* XimStr->ImgWidth * XimStr->ImgHeight); + fread(buffer8, sizeof(uint8_t), BufferSize, fid); + for (int ii = 0; ii < XimStr->ImgWidth * XimStr->ImgHeight;ii++) + { + XimImg[ii] = int(buffer8[ii]); + } + break; + } + case 2: + { + uint16_t *buffer16 = new uint16_t[XimStr->ImgWidth * XimStr->ImgHeight]; + memset(buffer16, 0, sizeof(uint16_t)* XimStr->ImgWidth * XimStr->ImgHeight); + fread(buffer16, sizeof(uint16_t), BufferSize / 2, fid); + for (int ii = 0; ii < XimStr->ImgWidth * XimStr->ImgHeight; ii++) + { + XimImg[ii] = int(buffer16[ii]); + } + break; + } + default: + { + fread(XimImg, sizeof(int), BufferSize / 4, fid); + break; + } + } + } + + + // ******* Stage 2: load the gantry angle from the residual property data ****// + // Skip histogram + int tmp = 0; + fread(&tmp, sizeof(int), 1, fid); + if (tmp > 0) + { + fseek(fid, tmp* sizeof(int), SEEK_CUR); + } + + // Decode .xim properties + int nProperties = 0; + 
fread(&nProperties, sizeof(int), 1, fid); + // Property structure is not NULL + if (nProperties > 0) + { + int pName_len = 0; + // Only load the property name rather than the content + char pName[128] = { 0 }; + int pType = 0; + for (int ii = 0; ii < nProperties; ii++) + { + // load property name length + fread(&pName_len, sizeof(int), 1, fid); + // load property name + fread(pName, sizeof(char)* pName_len, 1, fid); + // load property data type + fread(&pType, sizeof(int), 1, fid); + + //printf("%s\n", pName); + + // extract the Gantry Rotation Angle + if (!strcmp(pName, "GantryRtn")) + { + fread(&(XimStr->GantryRtn), sizeof(double), 1, fid); +// continue; + } + else if(!strcmp(pName, "KVNormChamber")) + { + //printf("KVNormChamber"); + fread(&(XimStr->KVNormChamber), sizeof(int), 1, fid); + break; + } + else + { + switch (pType) + { + case 0: + { + fseek(fid, sizeof(int), SEEK_CUR); + break; + } + case 1: + { + fseek(fid, sizeof(double), SEEK_CUR); + break; + } + case 2: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(char) * skiplen, SEEK_CUR); + break; + } + case 4: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(double) * skiplen /8, SEEK_CUR); + break; + } + case 5: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(int) * skiplen /4, SEEK_CUR); + break; + } + break; + } + } + // reset all the temporary variables + pName_len = 0; + memset(pName, 0, 128*sizeof(char)); + pType = 0; + } + + } + + // ********* END of XIM Reading: Close the File Pointer******* // + if (fclose(fid)) + { + printf("The file `crt_fopen.c' was not closed\n"); + getchar(); + exit(1); + } + + return 1; +} diff --git a/Python/setup.py b/Python/setup.py index 40bc3b3f..eee483e0 100644 --- a/Python/setup.py +++ b/Python/setup.py @@ -20,7 +20,7 @@ if "--no_pinned_memory" in sys.argv[2:] : no_pinned=True sys.argv.pop(sys.argv.index("--no_pinned_memory")) - + if no_pinned: 
define_macros.append(("NO_PINNED_MEMORY",None)) @@ -48,10 +48,10 @@ ] COMPUTE_CAPABILITY_ARGS = [ - "-gencode=arch=compute_70,code=compute_70", # allows forward compiling - "--ptxas-options=-v", + #"-gencode=arch=compute_70,code=compute_70", # allows forward compiling + #"--ptxas-options=-v", "-c", - "--default-stream=per-thread", + #"--default-stream=per-thread", ] @@ -65,13 +65,14 @@ def get_cuda_version(cuda_home): return version_str.split(" ")[2][:4] else: version_str = subprocess.check_output( - [os.path.join(cuda_home, "bin", "nvcc"), "--version"] + [os.path.join(cuda_home, "bin", "hipcc"), "--version"] ) version_str = str(version_str).replace("\n", "").replace("\r", "") idx = version_str.find("release") return version_str[idx + len("release ") : idx + len("release ") + 4] except: - raise RuntimeError("Cannot read cuda version file") + pass + #raise RuntimeError("Cannot read cuda version file") def locate_cuda(): @@ -81,16 +82,16 @@ def locate_cuda(): and values giving the absolute path to each directory. Starts by looking for the CUDA_HOME or CUDA_PATH env variable. If not found, everything - is based on finding 'nvcc' in the PATH. + is based on finding 'hipcc' in the PATH. 
""" # Guess #1 - cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or os.environ.get("HIP_PATH") if cuda_home is None: # Guess #2 try: which = "where" if IS_WINDOWS else "which" - nvcc = subprocess.check_output([which, "nvcc"]).decode().rstrip("\r\n") - cuda_home = os.path.dirname(os.path.dirname(nvcc)) + hipcc = subprocess.check_output([which, "hipcc"]).decode().rstrip("\r\n") + cuda_home = os.path.dirname(os.path.dirname(hipcc)) except subprocess.CalledProcessError: # Guess #3 if IS_WINDOWS: @@ -124,24 +125,24 @@ def _is_cuda_file(path): CUDA, CUDA_VERSION = locate_cuda() -cuda_version = 11.0 -try: - cuda_version = float(CUDA_VERSION) -except ValueError: - cuda_list = re.findall('\d+', CUDA_VERSION) - cuda_version = float( str(cuda_list[0] + '.' + cuda_list[1])) - -# Insert CUDA arguments depedning on the version -for item in CC_COMPATIBILITY_TABLE: - support_begin = item[2] - support_end = item[3] - if cuda_version < support_begin: - continue - if cuda_version >= support_end: - continue - str_arg = f"-gencode=arch=compute_{item[0]},code=sm_{item[1]}" - COMPUTE_CAPABILITY_ARGS.insert(0, str_arg) - +#cuda_version = 11.0 +#try: + #cuda_version = float(CUDA_VERSION) +#except ValueError: + #cuda_list = re.findall("\d+", CUDA_VERSION) + #cuda_version = float(str(cuda_list[0] + "." + cuda_list[1])) +# +## Insert CUDA arguments depedning on the version +#for item in CC_COMPATIBILITY_TABLE: + #support_begin = item[2] + #support_end = item[3] + #if cuda_version < support_begin: + ##continue + #if cuda_version >= support_end: + #continue + #str_arg = f"-gencode=arch=compute_{item[0]},code=sm_{item[1]}" + #COMPUTE_CAPABILITY_ARGS.insert(0, str_arg) +# # Obtain the numpy include directory. This logic works across numpy versions. 
try: NUMPY_INCLUDE = numpy.get_include() @@ -153,10 +154,10 @@ def _is_cuda_file(path): COMMON_NVCC_FLAGS = [ - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - "--expt-relaxed-constexpr", + #"-D__CUDA_NO_HALF_OPERATORS__", + #"-D__CUDA_NO_HALF_CONVERSIONS__", + #"-D__CUDA_NO_HALF2_OPERATORS__", + #"--expt-relaxed-constexpr", ] @@ -211,18 +212,20 @@ def build_extensions(self): def unix_wrap_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): # Copy before we make any modifications. cflags = copy.deepcopy(extra_postargs) + cflags.append("-D__HIP_PLATFORM_AMD__") try: original_compiler = self.compiler.compiler_so if _is_cuda_file(src): - nvcc = _join_cuda_home("bin", "nvcc") - if not isinstance(nvcc, list): - nvcc = [nvcc] - self.compiler.set_executable("compiler_so", nvcc) + hipcc = _join_cuda_home("bin", "hipcc") + if not isinstance(hipcc, list): + hipcc = [hipcc] + self.compiler.set_executable("compiler_so", hipcc) + self.compiler.set_executable("compiler", hipcc) if isinstance(cflags, dict): - cflags = cflags["nvcc"] + cflags = cflags["hipcc"] cflags = ( COMMON_NVCC_FLAGS - + ["--compiler-options", "'-fPIC'"] + + ["-fPIC"] + cflags + COMPUTE_CAPABILITY_ARGS ) @@ -237,6 +240,7 @@ def unix_wrap_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): finally: # Put the original compiler back in place. 
self.compiler.set_executable("compiler_so", original_compiler) + self.compiler.set_executable("compiler", original_compiler) def win_wrap_compile( sources, @@ -269,9 +273,9 @@ def spawn(cmd, cflags): src = src_list[0] obj = obj_list[0] if _is_cuda_file(src): - nvcc = _join_cuda_home("bin", "nvcc") + hipcc = _join_cuda_home("bin", "hipcc") if isinstance(cflags, dict): - cflags = cflags["nvcc"] + cflags = cflags["hipcc"] elif not isinstance(cflags, list): cflags = [] @@ -287,7 +291,7 @@ def spawn(cmd, cflags): elif len(macro) == 1: cflags += ["--undefine-macro", macro[0]] - cmd = [nvcc, "-c", src, "-o", obj] + include_list + cflags + cmd = [hipcc, "-c", src, "-o", obj] + include_list + cflags elif isinstance(cflags, dict): cflags = COMMON_MSVC_FLAGS # + self.cflags['cxx'] cmd += cflags @@ -372,7 +376,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -395,7 +399,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -416,7 +420,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -437,7 +441,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", 
runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -458,7 +462,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -475,7 +479,7 @@ def include_headers(filename_list, sdist=False): sdist=sys.argv[1] == "sdist", ), library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -496,10 +500,10 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64", "hiprand"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, - include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], + include_dirs=[NUMPY_INCLUDE, CUDA["include"],"../Common/CUDA/"], )