From 650a939e66a7c838ddb3bb859ec5864f42c5c96d Mon Sep 17 00:00:00 2001 From: purepani Date: Mon, 10 Mar 2025 19:13:52 -0500 Subject: [PATCH 1/3] Generates HIP from CUDA --- Common/CUDA/GD_AwTV.cu | 151 +- Common/CUDA/GD_AwTV.cu.prehip | 713 ++++++++++ Common/CUDA/GD_AwTV.hpp.prehip | 62 + Common/CUDA/GD_TV.cu | 155 +-- Common/CUDA/GD_TV.cu.prehip | 702 ++++++++++ Common/CUDA/GD_TV.hpp.prehip | 61 + Common/CUDA/GpuIds.cpp | 10 +- Common/CUDA/GpuIds.cpp.prehip | 70 + Common/CUDA/GpuIds.hpp.prehip | 17 + Common/CUDA/PICCS.cu | 77 +- Common/CUDA/PICCS.cu.prehip | 398 ++++++ Common/CUDA/PICCS.hpp.prehip | 61 + Common/CUDA/RandomNumberGenerator.cu | 89 +- Common/CUDA/RandomNumberGenerator.cu.prehip | 193 +++ Common/CUDA/RandomNumberGenerator.hpp.prehip | 49 + Common/CUDA/Siddon_projection.cu | 157 +-- Common/CUDA/Siddon_projection.cu.prehip | 859 ++++++++++++ Common/CUDA/Siddon_projection.hpp.prehip | 66 + Common/CUDA/Siddon_projection_parallel.cu | 99 +- .../CUDA/Siddon_projection_parallel.cu.prehip | 540 ++++++++ .../Siddon_projection_parallel.hpp.prehip | 65 + Common/CUDA/TIGRE_common.cpp.prehip | 20 + Common/CUDA/TIGRE_common.hpp.prehip | 24 + Common/CUDA/errors.hpp | 2 +- Common/CUDA/errors.hpp.prehip | 10 + Common/CUDA/gpuUtils.cu | 18 +- Common/CUDA/gpuUtils.cu.prehip | 70 + Common/CUDA/gpuUtils.hpp.prehip | 18 + Common/CUDA/improvedForwardProjections.cu | 127 +- .../CUDA/improvedForwardProjections.cu.prehip | 1032 ++++++++++++++ Common/CUDA/improvedForwardProjections.hpp | 5 +- .../improvedForwardProjections.hpp.prehip | 263 ++++ .../CUDA/improvedForwardProjections_cone.cu | 131 +- .../improvedForwardProjections_cone.cu.prehip | 1230 +++++++++++++++++ Common/CUDA/projection.cpp.prehip | 35 + Common/CUDA/projection.hpp.prehip | 9 + Common/CUDA/ray_interpolated_projection.cu | 165 +-- .../ray_interpolated_projection.cu.prehip | 843 +++++++++++ .../ray_interpolated_projection.hpp.prehip | 66 + .../ray_interpolated_projection_parallel.cu | 105 +- 
...interpolated_projection_parallel.cu.prehip | 449 ++++++ ...nterpolated_projection_parallel.hpp.prehip | 65 + Common/CUDA/tv_proximal.cu | 241 ++-- Common/CUDA/tv_proximal.cu.prehip | 693 ++++++++++ Common/CUDA/tv_proximal.hpp.prehip | 57 + Common/CUDA/types_TIGRE.hpp.prehip | 109 ++ Common/CUDA/voxel_backprojection.cu | 149 +- Common/CUDA/voxel_backprojection.cu.prehip | 920 ++++++++++++ Common/CUDA/voxel_backprojection.hpp.prehip | 59 + Common/CUDA/voxel_backprojection2.cu | 149 +- Common/CUDA/voxel_backprojection2.cu.prehip | 844 +++++++++++ Common/CUDA/voxel_backprojection2.hpp.prehip | 64 + Common/CUDA/voxel_backprojection_parallel.cu | 117 +- .../voxel_backprojection_parallel.cu.prehip | 627 +++++++++ .../voxel_backprojection_parallel.hpp.prehip | 57 + .../Utilities/GPU/getGpuCount_mex.cpp.prehip | 21 + .../Utilities/GPU/getGpuName_mex.cpp.prehip | 29 + .../IO/VarianCBCT/XimPara.hpp.prehip | 28 + .../IO/VarianCBCT/mexReadXim.cpp.prehip | 357 +++++ .../cuda_interface/AddNoise.cpp.prehip | 126 ++ .../cuda_interface/Atb_mex.cpp.prehip | 367 +++++ .../cuda_interface/AwminTV.cpp.prehip | 137 ++ .../cuda_interface/Ax_mex.cpp.prehip | 338 +++++ .../cuda_interface/minPICCS.cpp.prehip | 147 ++ .../Utilities/cuda_interface/minTV.cpp.prehip | 132 ++ .../pCTCubicSpline_mex.cpp.prehip | 124 ++ .../cuda_interface/tvDenoise.cpp.prehip | 147 ++ 67 files changed, 14354 insertions(+), 966 deletions(-) create mode 100644 Common/CUDA/GD_AwTV.cu.prehip create mode 100644 Common/CUDA/GD_AwTV.hpp.prehip create mode 100644 Common/CUDA/GD_TV.cu.prehip create mode 100644 Common/CUDA/GD_TV.hpp.prehip create mode 100644 Common/CUDA/GpuIds.cpp.prehip create mode 100644 Common/CUDA/GpuIds.hpp.prehip create mode 100644 Common/CUDA/PICCS.cu.prehip create mode 100644 Common/CUDA/PICCS.hpp.prehip create mode 100644 Common/CUDA/RandomNumberGenerator.cu.prehip create mode 100644 Common/CUDA/RandomNumberGenerator.hpp.prehip create mode 100644 Common/CUDA/Siddon_projection.cu.prehip create mode 
100644 Common/CUDA/Siddon_projection.hpp.prehip create mode 100644 Common/CUDA/Siddon_projection_parallel.cu.prehip create mode 100644 Common/CUDA/Siddon_projection_parallel.hpp.prehip create mode 100644 Common/CUDA/TIGRE_common.cpp.prehip create mode 100644 Common/CUDA/TIGRE_common.hpp.prehip create mode 100644 Common/CUDA/errors.hpp.prehip create mode 100644 Common/CUDA/gpuUtils.cu.prehip create mode 100644 Common/CUDA/gpuUtils.hpp.prehip create mode 100644 Common/CUDA/improvedForwardProjections.cu.prehip create mode 100644 Common/CUDA/improvedForwardProjections.hpp.prehip create mode 100644 Common/CUDA/improvedForwardProjections_cone.cu.prehip create mode 100644 Common/CUDA/projection.cpp.prehip create mode 100644 Common/CUDA/projection.hpp.prehip create mode 100644 Common/CUDA/ray_interpolated_projection.cu.prehip create mode 100644 Common/CUDA/ray_interpolated_projection.hpp.prehip create mode 100644 Common/CUDA/ray_interpolated_projection_parallel.cu.prehip create mode 100644 Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip create mode 100644 Common/CUDA/tv_proximal.cu.prehip create mode 100644 Common/CUDA/tv_proximal.hpp.prehip create mode 100644 Common/CUDA/types_TIGRE.hpp.prehip create mode 100644 Common/CUDA/voxel_backprojection.cu.prehip create mode 100644 Common/CUDA/voxel_backprojection.hpp.prehip create mode 100644 Common/CUDA/voxel_backprojection2.cu.prehip create mode 100644 Common/CUDA/voxel_backprojection2.hpp.prehip create mode 100644 Common/CUDA/voxel_backprojection_parallel.cu.prehip create mode 100644 Common/CUDA/voxel_backprojection_parallel.hpp.prehip create mode 100644 MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip create mode 100644 MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip create mode 100644 MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip create mode 100644 MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip create mode 100644 
MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/AwminTV.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/minPICCS.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/minTV.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip create mode 100644 MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip diff --git a/Common/CUDA/GD_AwTV.cu b/Common/CUDA/GD_AwTV.cu index d98c13c1..03956111 100644 --- a/Common/CUDA/GD_AwTV.cu +++ b/Common/CUDA/GD_AwTV.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for Steepest descend in POCS-type algorithms. @@ -61,11 +62,11 @@ #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("CBCT:CUDA:GD_TV",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("CBCT:CUDA:GD_TV",hipGetErrorString(__err));\ } \ } while (0) @@ -378,16 +379,16 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // allocate memory in each GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); - cudaMemset( d_image[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); - cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); - cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); - cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); - cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); + hipMalloc((void**)&d_image[dev] , mem_img_each_GPU); + 
hipMemset( d_image[dev],0 , mem_img_each_GPU); + hipMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); + hipMemset( d_dimgTV[dev],0 , mem_img_each_GPU); + hipMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); + hipMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); + hipMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); + hipMemset( d_norm2aux[dev],0 , mem_auxiliary); cudaCheckErrors("Malloc error"); @@ -397,7 +398,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma if(splits>1){ mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + hipHostMalloc((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); } @@ -406,12 +407,12 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -420,12 +421,12 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma // Create streams int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } cudaCheckErrors("Stream creation fail"); @@ -437,7 +438,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma double totalsum; float sum_curr_spl; float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + hipHostMalloc((void**)&sumnorm2,deviceCount*sizeof(float)); unsigned int curr_slices; unsigned long long curr_pixels; @@ -476,28 +477,28 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma if(i==0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - 
cudaMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // if we need to split and its not the first iteration, then we need to copy from Host memory the previosu result. if (splits>1 & i>0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } cudaCheckErrors("Memcpy failure on multi split"); @@ -509,7 +510,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (curr_slices+buffer_length*2+blockGrad.z-1)/blockGrad.z); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - 
cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } else { - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Reduction error"); @@ -586,7 +587,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Scalar operations error"); //SUBSTRACT GRADIENT ////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, 
buffer_pixels*sizeof(float), hipMemcpyDeviceToHost); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), hipMemcpyHostToDevice); } } }else{ // We need to take it out :( for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); + hipHostUnregister(img); + hipHostUnregister(dst); } for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; cudaCheckErrors("Memory free"); -// cudaDeviceReset(); +// hipDeviceReset(); } void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ @@ -697,8 +698,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ size_t memtotal; const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) + return; + + + float df[3] ={0.f,0.f,0.f}; + float dfi[3]={0.f,0.f,0.f}; // dfi== \partial f_{i+1,j,k} + float dfj[3]={0.f,0.f,0.f}; + float dfk[3]={0.f,0.f,0.f}; + gradient(f,df ,z ,y ,x , depth,rows,cols); + gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); + gradient(f,dfj ,z ,y+1,x , depth,rows,cols); + gradient(f,dfk ,z+1,y ,x , depth,rows,cols); + float eps=0.00000001; //% avoid division by zero + + float wx=__expf(-(df[0]/delta)*(df[0]/delta)); + float wy=__expf(-(df[1]/delta)*(df[1]/delta)); + float wz=__expf(-(df[2]/delta)*(df[2]/delta)); + + float wxi=__expf(-(dfi[0]/delta)*(dfi[0]/delta)); + float wyi=__expf(-(dfi[1]/delta)*(dfi[1]/delta)); + float wzi=__expf(-(dfi[2]/delta)*(dfi[2]/delta)); + + float wxj=__expf(-(dfj[0]/delta)*(dfj[0]/delta)); + float wyj=__expf(-(dfj[1]/delta)*(dfj[1]/delta)); + float wzj=__expf(-(dfj[2]/delta)*(dfj[2]/delta)); + + float wxk=__expf(-(dfk[0]/delta)*(dfk[0]/delta)); + float wyk=__expf(-(dfk[1]/delta)*(dfk[1]/delta)); + float 
wzk=__expf(-(dfk[2]/delta)*(dfk[2]/delta)); + + + // this hsould do the trick I think + + dftv[idx]=(wx*df[0]+wy*df[1]+wz*df[2])/(sqrt(wx*df[0] *df[0] +wy*df[1] *df[1] +wz*df[2] *df[2])+eps) + -wzi*dfi[2]/(sqrt(wxi*dfi[0]*dfi[0]+wyi*dfi[1]*dfi[1]+wzi*dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. + -wyj*dfj[1]/(sqrt(wxj*dfj[0]*dfj[0]+wyj*dfj[1]*dfj[1]+wzj*dfj[2]*dfj[2]) +eps) + -wxk*dfk[0]/(sqrt(wxk*dfk[0]*dfk[0]+wyk*dfk[1]*dfk[1]+wzk*dfk[2]*dfk[2]) +eps); + + + return; + + } + + __device__ void warpReduce(volatile float *sdata, size_t tid) { + sdata[tid] += sdata[tid + 32]; + sdata[tid] += sdata[tid + 16]; + sdata[tid] += sdata[tid + 8]; + sdata[tid] += sdata[tid + 4]; + sdata[tid] += sdata[tid + 2]; + sdata[tid] += sdata[tid + 1]; + } + + __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ + extern __shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + float value=0; + while (i < n) { + value=g_idata[i]; //avoid reading twice + mySum += value*value; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + __syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + + __global__ void reduceSum(float *g_idata, float *g_odata, size_t n){ + extern 
__shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + // float value=0; + while (i < n) { + mySum += g_idata[i]; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + __syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + + + + +// main function +void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int maxIter,const float delta, const GpuIds& gpuids){ + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPUselect","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. 
If the code errors you might need to change the way GPU selection is performed."); + } + int dev; + + // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. + // check free memory + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + + + + // %5 of free memory should be enough, we have almost no variables in these kernels + size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; + size_t mem_slice_image = sizeof(float)* image_size[0] * image_size[1] ; + size_t mem_size_image = sizeof(float)* total_pixels; + size_t mem_auxiliary = sizeof(float)* (total_pixels + MAXTHREADS - 1) / MAXTHREADS; + + // Decide how are we handling the distribution of computation + size_t mem_img_each_GPU; + + unsigned int buffer_length=2; + //Does everything fit in the GPU? + unsigned int slices_per_split; + + // if it is a thin problem (no need to split), just use one GPU + if (image_size[2]<4){deviceCount=1;} + + unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. + if(mem_GPU_global> 3*mem_size_image+3*(deviceCount-1)*mem_slice_image*buffer_length+mem_auxiliary) { + // We only need to split if we have extra GPUs + slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; + mem_img_each_GPU=mem_slice_image*((slices_per_split+buffer_length*2)); + }else{ + // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all + size_t mem_free=mem_GPU_global-mem_auxiliary; + + splits=(unsigned int)(ceil(((float)(3*mem_size_image)/(float)(deviceCount))/mem_free)); + // Now, there is an overhead here, as each splits should have 2 slices more, to account for overlap of images. + // lets make sure these 2 slices fit, if they do not, add 1 to splits. 
+ slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + + // if the new stuff does not fit in the GPU, it means we are in the edge case where adding that extra slice will overflow memory + if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ + // one more split should do the job, as its an edge case. + splits++; + //recompute for later + slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + } + + + // How many EXTRA buffer slices should be able to fit in here??!?! + // Only do it if there are splits needed. + if(splits>1){ + mem_free=mem_GPU_global-(3*mem_img_each_GPU+mem_auxiliary); + unsigned int extra_buff=(mem_free/mem_slice_image); + buffer_length=(extra_buff/2)/3; // we need double whatever this results in, rounded down. + buffer_length=max(buffer_length,2);// minimum 2 + buffer_length=min(MAX_BUFFER,buffer_length); + + mem_img_each_GPU=mem_slice_image*(slices_per_split+buffer_length*2); + + }else{ + buffer_length=2; + } + + // Assert + if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ + mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPU","Assertion Failed. Logic behind splitting flawed! Please tell: ander.biguri@gmail.com\n"); + } + } + + + // Assert + + if ((slices_per_split+buffer_length*2)*image_size[0]*image_size[1]* sizeof(float)!= mem_img_each_GPU){ + mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPU","Assertion Failed. Memory needed calculation broken! 
Please tell: ander.biguri@gmail.com\n"); + } + + + + + + + float** d_image= (float**)malloc(deviceCount*sizeof(float*)); + float** d_dimgTV= (float**)malloc(deviceCount*sizeof(float*)); + float** d_norm2aux= (float**)malloc(deviceCount*sizeof(float*)); + float** d_norm2= (float**)malloc(deviceCount*sizeof(float*)); + + // allocate memory in each GPU + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + + cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); + cudaMemset( d_image[dev],0 , mem_img_each_GPU); + cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); + cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); + cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); + cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); + cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); + cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); + cudaCheckErrors("Malloc error"); + + + } + unsigned long long buffer_pixels=buffer_length*image_size[0]*image_size[1]; + float* buffer; + if(splits>1){ + mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); + }else{ + cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + } + + + + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // splits>2 is completely empirical observation + if (isHostRegisterSupported & splits>2){ + cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); + + + + // Create streams + int nStream_device=2; + int nStreams=deviceCount*nStream_device; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < nStream_device; ++i){ + cudaStreamCreate(&stream[i+dev*nStream_device]); + } + } + cudaCheckErrors("Stream creation fail"); + + + // For the reduction + + double totalsum_prev; + double totalsum; + float sum_curr_spl; + float * sumnorm2; + cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + + unsigned int curr_slices; + unsigned long long curr_pixels; + size_t linear_idx_start; + unsigned long long* offset_device=(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + unsigned long long* offset_host =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + unsigned long long* bytes_device =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + bool is_first_chunk; + bool is_last_chunk; + for(unsigned int i=0;i1){ + totalsum_prev=0; + } + for(unsigned int sp=0;sp1 & i>0){ + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + + + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + } + 
cudaCheckErrors("Memcpy failure on multi split"); + + for(unsigned int ib=0; (ib<(buffer_length-1)) && ((i+ib)>>(d_image[dev],d_dimgTV[dev],(long)(curr_slices+buffer_length*2-1), image_size[1],image_size[0],delta); + + } + + + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); + + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { + reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); + cudaStreamSynchronize(stream[dev*nStream_device]); + cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + } + else { + cudaStreamSynchronize(stream[dev*nStream_device]); + cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + } + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Reduction error"); + + + // Accumulate the norm accross devices + sum_curr_spl=0; + // this is CPU code + for (dev = 0; dev < deviceCount; dev++){ + sum_curr_spl+=sumnorm2[dev]; + } + sum_curr_spl+=0.0000001f; // avoid division by zero + + // If we have more than one splits, lets use the result from prior calls + if(i>0 && splits>1){ + // this is already stored: + //totalsum=totalsum_prev; + }else{ + totalsum=sum_curr_spl; + } + + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,(float)sqrt(totalsum),total_pixels); + //MULTIPLY HYPERPARAMETER + multiplyArrayScalar<<<60,MAXTHREADS,0,stream[dev*nStream_device]>>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); + } + for (dev = 0; dev < deviceCount; dev++){ + 
cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Scalar operations error"); + + //SUBSTRACT GRADIENT + ////////////////////////////////////////////// + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_image[dev]+buffer_pixels,d_dimgTV[dev]+buffer_pixels, total_pixels); + } + } + + // Synchronize mathematics, make sure bounding pixels are correct + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + + if(splits==1){ + for(dev=0; dev0){ + cudaSetDevice(gpuids[dev-1]); + cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); + cudaSetDevice(gpuids[dev]); + cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); + } + } + }else{ + + // We need to take it out :( + for(dev=0; dev2){ + cudaHostUnregister(img); + cudaHostUnregister(dst); + } + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]) ; + cudaCheckErrors("Memory free"); +// cudaDeviceReset(); + } + +void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ + size_t memfree; + size_t memtotal; + const int deviceCount = gpuids.GetLength(); + for (int dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemGetInfo(&memfree,&memtotal); + if(dev==0) *mem_GPU_global=memfree; + if(memfree1){ mexWarnMsgIdAndTxt("minimizeTV:GD_TV:Image_split","Your image can not be fully split between the available GPUs. 
The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + hipHostMalloc((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); } @@ -390,12 +391,12 @@ do { \ // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // splits>2 is completely empirical observation if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -404,12 +405,12 @@ do { \ // Create streams int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } cudaCheckErrors("Stream creation fail"); @@ -421,7 +422,7 @@ do { \ double totalsum; float sum_curr_spl; float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + 
hipHostMalloc((void**)&sumnorm2,deviceCount*sizeof(float)); unsigned int curr_slices; unsigned long long curr_pixels; @@ -460,28 +461,28 @@ do { \ if(i==0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], img+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // if we need to split and its not the first iteration, then we need to copy from Host memory the previosu result. if (splits>1 & i>0){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), hipMemcpyHostToDevice,stream[dev*nStream_device+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } cudaCheckErrors("Memcpy failure on multi split"); @@ -493,7 +494,7 @@ do { \ dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (curr_slices+buffer_length*2+blockGrad.z-1)/blockGrad.z); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); 
curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } else { - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + hipStreamSynchronize(stream[dev*nStream_device]); + hipMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Reduction error"); @@ -570,7 +571,7 @@ do { \ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Scalar operations error"); //SUBSTRACT GRADIENT ////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); curr_slices=((sp*deviceCount+dev+1)*slices_per_split0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), 
cudaMemcpyHostToDevice); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost); + hipSetDevice(gpuids[dev]); + hipMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), hipMemcpyHostToDevice); } } }else{ // We need to take it out :( for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); + hipHostUnregister(img); + hipHostUnregister(dst); } for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Memory free"); - cudaDeviceReset(); + hipDeviceReset(); } void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ @@ -686,8 +687,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ size_t memtotal; int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) + return; + + + float df[3] ={0.f,0.f,0.f}; + float dfi[3]={0.f,0.f,0.f}; // dfi== \partial f_{i+1,j,k} + float dfj[3]={0.f,0.f,0.f}; + float dfk[3]={0.f,0.f,0.f}; + gradient(f,df ,z ,y ,x , depth,rows,cols); + gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); + gradient(f,dfj ,z ,y+1,x , depth,rows,cols); + gradient(f,dfk ,z+1,y ,x , depth,rows,cols); + float eps=0.00000001; //% avoid division by zero + + dftv[idx]=(df[0]+df[1]+df[2])/(sqrt(df[0] *df[0] +df[1] *df[1] +df[2] *df[2])+eps) + -dfi[2]/(sqrt(dfi[0]*dfi[0]+dfi[1]*dfi[1]+dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. 
+ -dfj[1]/(sqrt(dfj[0]*dfj[0]+dfj[1]*dfj[1]+dfj[2]*dfj[2]) +eps) + -dfk[0]/(sqrt(dfk[0]*dfk[0]+dfk[1]*dfk[1]+dfk[2]*dfk[2]) +eps); + return; + + } + + __device__ void warpReduce(volatile float *sdata, size_t tid) { + sdata[tid] += sdata[tid + 32]; + sdata[tid] += sdata[tid + 16]; + sdata[tid] += sdata[tid + 8]; + sdata[tid] += sdata[tid + 4]; + sdata[tid] += sdata[tid + 2]; + sdata[tid] += sdata[tid + 1]; + } + + __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ + extern __shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + float value=0; + while (i < n) { + value=g_idata[i]; //avoid reading twice + mySum += value*value; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + __syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + + __global__ void reduceSum(float *g_idata, float *g_odata, size_t n){ + extern __shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + // float value=0; + while (i < n) { + mySum += g_idata[i]; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + 
__syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + + + + +// main function + void pocs_tv(float* img,float* dst,float alpha,const long* image_size, int maxIter, const GpuIds& gpuids){ + + + + + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("GD_TV:GPU","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("minimizeTV:GD_TV:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + + int dev; + + // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. 
+ + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + + + + // %5 of free memory should be enough, we have almost no variables in these kernels + size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; + size_t mem_slice_image = sizeof(float)* image_size[0] * image_size[1] ; + size_t mem_size_image = sizeof(float)* total_pixels; + size_t mem_auxiliary = sizeof(float)* (total_pixels + MAXTHREADS - 1) / MAXTHREADS; + + // Decide how are we handling the distribution of computation + size_t mem_img_each_GPU; + + unsigned int buffer_length=2; + //Does everything fit in the GPU? + unsigned int slices_per_split; + + // if it is a thin problem (no need to split), just use one GPU + if (image_size[2]<4){deviceCount=1;} + + unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. + if(mem_GPU_global> 3*mem_size_image+3*(deviceCount-1)*mem_slice_image*buffer_length+mem_auxiliary){ + // We only need to split if we have extra GPUs + slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; + mem_img_each_GPU=mem_slice_image*((slices_per_split+buffer_length*2)); + }else{ + // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all + size_t mem_free=mem_GPU_global-mem_auxiliary; + + splits=(unsigned int)(ceil(((float)(3*mem_size_image)/(float)(deviceCount))/mem_free)); + // Now, there is an overhead here, as each splits should have 2 slices more, to accoutn for overlap of images. + // lets make sure these 2 slices fit, if they do not, add 1 to splits. 
+ slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + + // if the new stuff does not fit in the GPU, it measn we are in the edge case where adding that extra slice will overflow memory + if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ + // one more split should do the job, as its an edge case. + splits++; + //recompute for later + slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + } + + + // How many EXTRA buffer slices should be able to fit in here??!?! + // Only do it if there are splits needed. + if(splits>1){ + mem_free=mem_GPU_global-(3*mem_img_each_GPU+mem_auxiliary); + unsigned int extra_buff=(mem_free/mem_slice_image); + buffer_length=(extra_buff/2)/3; // we need double whatever this results in, rounded down. + buffer_length=max(buffer_length,2);// minimum 2 + buffer_length=min(MAX_BUFFER,buffer_length); + + mem_img_each_GPU=mem_slice_image*(slices_per_split+buffer_length*2); + + }else{ + buffer_length=2; + } + + // Assert + if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ + mexErrMsgIdAndTxt("GD_TV:GPU","Assertion Failed. Logic behind splitting flawed! Please tell: ander.biguri@gmail.com\n"); + } + } + + + // Assert + + if ((slices_per_split+buffer_length*2)*image_size[0]*image_size[1]* sizeof(float)!= mem_img_each_GPU){ + mexErrMsgIdAndTxt("GD_TV:GPU","Assertion Failed. Memory needed calculation broken! 
Please tell: ander.biguri@gmail.com\n"); + } + + + + + + + float** d_image= (float**)malloc(deviceCount*sizeof(float*)); + float** d_dimgTV= (float**)malloc(deviceCount*sizeof(float*)); + float** d_norm2aux= (float**)malloc(deviceCount*sizeof(float*)); + float** d_norm2= (float**)malloc(deviceCount*sizeof(float*)); + + // allocate memory in each GPU + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + + cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); + cudaMemset( d_image[dev],0 , mem_img_each_GPU); + cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); + cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); + cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); + cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); + cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); + cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); + cudaCheckErrors("Malloc error"); + + + } + unsigned long long buffer_pixels=buffer_length*image_size[0]*image_size[1]; + float* buffer; + if(splits>1){ + mexWarnMsgIdAndTxt("minimizeTV:GD_TV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); + }else{ + cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); + } + + + + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // splits>2 is completely empirical observation + if (isHostRegisterSupported & splits>2){ + cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); + + + + // Create streams + int nStream_device=2; + int nStreams=deviceCount*nStream_device; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < nStream_device; ++i){ + cudaStreamCreate(&stream[i+dev*nStream_device]); + } + } + cudaCheckErrors("Stream creation fail"); + + + // For the reduction + + double totalsum_prev; + double totalsum; + float sum_curr_spl; + float * sumnorm2; + cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); + + unsigned int curr_slices; + unsigned long long curr_pixels; + size_t linear_idx_start; + unsigned long long* offset_device=(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + unsigned long long* offset_host =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + unsigned long long* bytes_device =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); + bool is_first_chunk; + bool is_last_chunk; + for(unsigned int i=0;i1){ + totalsum_prev=0; + } + for(unsigned int sp=0;sp1 & i>0){ + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); + + + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + } + 
cudaCheckErrors("Memcpy failure on multi split"); + + for(unsigned int ib=0; (ib<(buffer_length-1)) && ((i+ib)>>(d_image[dev],d_dimgTV[dev],(long)(curr_slices+buffer_length*2-1), image_size[1],image_size[0]); + + } + + + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); + + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { + reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); + cudaStreamSynchronize(stream[dev*nStream_device]); + cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + } + else { + cudaStreamSynchronize(stream[dev*nStream_device]); + cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); + } + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Reduction error"); + + + // Accumulate the norm accross devices + sum_curr_spl=0; + // this is CPU code + for (dev = 0; dev < deviceCount; dev++){ + sum_curr_spl+=sumnorm2[dev]; + } + sum_curr_spl+=0.0000001f; // avoid division by zero + + // If we have more than one splits, lets use the result from prior calls + if(i>0 && splits>1){ + // this is already stored: + //totalsum=totalsum_prev; + }else{ + totalsum=sum_curr_spl; + } + + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,(float)sqrt(totalsum),total_pixels); + //MULTIPLY HYPERPARAMETER + multiplyArrayScalar<<<60,MAXTHREADS,0,stream[dev*nStream_device]>>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); + } + for (dev = 0; dev < deviceCount; dev++){ + 
cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Scalar operations error"); + + //SUBSTRACT GRADIENT + ////////////////////////////////////////////// + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_image[dev]+buffer_pixels,d_dimgTV[dev]+buffer_pixels, total_pixels); + } + } + + // Synchronize mathematics, make sure bounding pixels are correct + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + + if(splits==1){ + for(dev=0; dev0){ + cudaSetDevice(gpuids[dev-1]); + cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); + cudaSetDevice(gpuids[dev]); + cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); + } + } + }else{ + + // We need to take it out :( + for(dev=0; dev2){ + cudaHostUnregister(img); + cudaHostUnregister(dst); + } + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]) ; + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Memory free"); + cudaDeviceReset(); + } + +void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ + size_t memfree; + size_t memtotal; + int deviceCount = gpuids.GetLength(); + for (int dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemGetInfo(&memfree,&memtotal); + if(dev==0) *mem_GPU_global=memfree; + if(memfree #include -#include +#include GpuIds::~GpuIds() { free(m_piDeviceIds); m_piDeviceIds = nullptr; @@ -52,12 +52,12 @@ void GpuIds::SetAllGpus(int iTotalDeviceCount) { bool GpuIds::AreEqualDevices() const { int deviceCount = this->GetLength(); - const int devicenamelength = 256; // The length 256 is fixed by spec of cudaDeviceProp::name + const int devicenamelength = 256; // The length 256 is fixed by spec of hipDeviceProp_t::name char 
devicename[devicenamelength]; - cudaDeviceProp deviceProp; + hipDeviceProp_t deviceProp; for (int dev = 0; dev < deviceCount; dev++) { - // cudaSetDevice(m_piDeviceIds[dev]); - cudaGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); + // hipSetDevice(m_piDeviceIds[dev]); + hipGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); if (dev>0) { if (strcmp(devicename, deviceProp.name) != 0) { return false; diff --git a/Common/CUDA/GpuIds.cpp.prehip b/Common/CUDA/GpuIds.cpp.prehip new file mode 100644 index 00000000..e9e622cc --- /dev/null +++ b/Common/CUDA/GpuIds.cpp.prehip @@ -0,0 +1,70 @@ +#include "GpuIds.hpp" +#include +#include +#include + +GpuIds::~GpuIds() { + free(m_piDeviceIds); m_piDeviceIds = nullptr; + m_iCount = 0; +} +GpuIds::GpuIds() : m_piDeviceIds (nullptr), m_iCount(0) { + +} +void GpuIds::SetIds(int iCount, int* piDeviceIds) { + if (iCount > 0 && piDeviceIds != 0) { + if (m_piDeviceIds) { + free(m_piDeviceIds); m_piDeviceIds = nullptr; + m_iCount = 0; + } + m_piDeviceIds = (int*)malloc(iCount * sizeof(int)); + if (m_piDeviceIds) { + for (int iI = 0; iI < iCount; ++iI) { + m_piDeviceIds[iI] = piDeviceIds[iI]; + } + m_iCount = iCount; + } + } +} + +int GpuIds::GetLength() const { + return m_iCount; +} +int& GpuIds::operator[](int iIndex){ + return m_piDeviceIds[iIndex]; +} +int GpuIds::operator[](int iIndex) const { + return m_piDeviceIds[iIndex]; +} + +void GpuIds::SetAllGpus(int iTotalDeviceCount) { + // Set all GPUs for compatibility + // Makeup valid GpuIds. 
+ int* aiIds = nullptr; + if (iTotalDeviceCount == 0) { + (int*)malloc(iTotalDeviceCount*sizeof(int)); + for (int iI = 0; iI < iTotalDeviceCount; ++iI) { + aiIds[iI] = iI; + } + } + SetIds(iTotalDeviceCount, aiIds); + free(aiIds); aiIds = 0; +} + +bool GpuIds::AreEqualDevices() const { + int deviceCount = this->GetLength(); + const int devicenamelength = 256; // The length 256 is fixed by spec of cudaDeviceProp::name + char devicename[devicenamelength]; + cudaDeviceProp deviceProp; + for (int dev = 0; dev < deviceCount; dev++) { + // cudaSetDevice(m_piDeviceIds[dev]); + cudaGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); + if (dev>0) { + if (strcmp(devicename, deviceProp.name) != 0) { + return false; + } + } + memset(devicename, 0, devicenamelength); + strcpy(devicename, deviceProp.name); + } + return true; +} diff --git a/Common/CUDA/GpuIds.hpp.prehip b/Common/CUDA/GpuIds.hpp.prehip new file mode 100644 index 00000000..e0223f86 --- /dev/null +++ b/Common/CUDA/GpuIds.hpp.prehip @@ -0,0 +1,17 @@ + +#ifndef GPUIDS_H +#define GPUIDS_H +struct GpuIds { + int* m_piDeviceIds; + int m_iCount; + ~GpuIds(); + GpuIds(); + void SetIds(int iCount, int* piDeviceIds); + int GetLength() const; + void SetAllGpus(int iTotalDeviceCount); + int& operator[](int iIndex); + int operator[](int iIndex) const; + bool AreEqualDevices() const; +}; +#endif + diff --git a/Common/CUDA/PICCS.cu b/Common/CUDA/PICCS.cu index 481ede08..e447b375 100644 --- a/Common/CUDA/PICCS.cu +++ b/Common/CUDA/PICCS.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for Steepest descend in POCS-type algorithms. 
@@ -60,10 +61,10 @@ Codes : https://github.com/CERN/TIGRE #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("ERROR in: %s \n",msg);\ - mexErrMsgIdAndTxt("err",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("err",hipGetErrorString(__err));\ } \ } while (0) @@ -263,9 +264,9 @@ do { \ bool isnan_cuda(float* vec, size_t size){ bool*d_nan; bool h_nan; - cudaMalloc((void **)&d_nan, sizeof (bool)); + hipMalloc((void **)&d_nan, sizeof (bool)); isnan_device<<<60,MAXTHREADS>>>(vec,size,d_nan); - cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost); + hipMemcpy(&h_nan, d_nan, sizeof(bool), hipMemcpyDeviceToHost); return h_nan; } @@ -281,24 +282,24 @@ bool isnan_cuda(float* vec, size_t size){ float *d_image,*d_prior,*d_dpiccsTV, *d_dimgTV,*d_aux_small,*d_aux_image, *d_norm2; // memory for image - cudaMalloc(&d_image, mem_size); - cudaMalloc(&d_prior, mem_size); + hipMalloc(&d_image, mem_size); + hipMalloc(&d_prior, mem_size); cudaCheckErrors("Malloc Image error"); - cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice); - cudaMemcpy(d_prior, prior, mem_size, cudaMemcpyHostToDevice); + hipMemcpy(d_image, img, mem_size, hipMemcpyHostToDevice); + hipMemcpy(d_prior, prior, mem_size, hipMemcpyHostToDevice); cudaCheckErrors("Memory Malloc and Memset: SRC"); // memory for df - cudaMalloc(&d_dimgTV, mem_size); - cudaMalloc(&d_dpiccsTV, mem_size); + hipMalloc(&d_dimgTV, mem_size); + hipMalloc(&d_dpiccsTV, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); - cudaMalloc(&d_norm2, mem_size); + hipMalloc(&d_norm2, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); - cudaMalloc(&d_aux_image, mem_size); + hipMalloc(&d_aux_image, mem_size); cudaCheckErrors("Memory Malloc and Memset: TV"); // memory for L2norm auxiliar - cudaMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); + 
hipMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); cudaCheckErrors("Memory Malloc and Memset: NORMAux"); @@ -315,64 +316,64 @@ bool isnan_cuda(float* vec, size_t size){ for(unsigned int i=0;i>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Gradient"); // mexPrintf("Gradient is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,(1-ratio), total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); substractArrays<<<60,MAXTHREADS>>>(d_aux_image,d_prior, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Substraction error"); gradientTV<<>>(d_aux_image,d_dpiccsTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Gradient"); // mexPrintf("Gradient piccs is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dpiccsTV,ratio, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); // mexPrintf("Multiplication is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); addArrays<<<60,MAXTHREADS>>>(d_dimgTV,d_dpiccsTV,total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); //NOMRALIZE via reduction //mexPrintf("Pre-norm2 is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? 
"true" : "false"); - cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice); + hipMemcpy(d_norm2, d_dimgTV, mem_size, hipMemcpyDeviceToDevice); cudaCheckErrors("Copy from gradient call error"); reduceNorm2 << > >(d_norm2, d_aux_small, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("reduce1"); if (dimgridRed > 1) { reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_aux_small, d_norm2, dimgridRed); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("reduce2"); - cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); + hipMemcpy(&sumnorm2, d_norm2, sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy"); } else { - cudaMemcpy(&sumnorm2, d_aux_small, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); + hipMemcpy(&sumnorm2, d_aux_small, sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy"); } // mexPrintf("alpha/sqrt(sumnorm2): %f\n",alpha/sqrt(sumnorm2)); //MULTIPLY HYPERPARAMETER sqrt(sumnorm2) multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha/sqrt(sumnorm2), total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Multiplication error"); //SUBSTRACT GRADIENT substractArrays <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); cudaCheckErrors("Substraction error"); // mexPrintf("Final update is nan: %s\n",isnan_cuda(d_image,total_pixels) ? 
"true" : "false"); // mexPrintf("\n"); @@ -381,18 +382,18 @@ bool isnan_cuda(float* vec, size_t size){ cudaCheckErrors("TV minimization"); - cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost); + hipMemcpy(dst, d_image, mem_size, hipMemcpyDeviceToHost); cudaCheckErrors("Copy result back"); - cudaFree(d_image); - cudaFree(d_dpiccsTV); - cudaFree(d_aux_image); - cudaFree(d_aux_small); - cudaFree(d_prior); - cudaFree(d_norm2); + hipFree(d_image); + hipFree(d_dpiccsTV); + hipFree(d_aux_image); + hipFree(d_aux_small); + hipFree(d_prior); + hipFree(d_norm2); cudaCheckErrors("Memory free"); - cudaDeviceReset(); + hipDeviceReset(); } diff --git a/Common/CUDA/PICCS.cu.prehip b/Common/CUDA/PICCS.cu.prehip new file mode 100644 index 00000000..481ede08 --- /dev/null +++ b/Common/CUDA/PICCS.cu.prehip @@ -0,0 +1,398 @@ +/*------------------------------------------------------------------------- + * + * CUDA functions for Steepest descend in POCS-type algorithms. + * + * This file will iteratively minimize by stepest descend the total variation + * of the input image, with the parameters given, using GPUs. + * + * CODE by Ander Biguri + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + + + + +#define MAXTHREADS 1024 + +#include "PICCS.hpp" + + + + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("ERROR in: %s \n",msg);\ + mexErrMsgIdAndTxt("err",cudaGetErrorString(__err));\ + } \ +} while (0) + +// CUDA kernels +//https://stackoverflow.com/questions/21332040/simple-cuda-kernel-optimization/21340927#21340927 + __global__ void divideArrayScalar(float* vec,float scalar,const size_t n) + { + unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x; + for(; i= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) + return; + + float df[3] ={0,0,0}; + float dfi[3]={0,0,0}; // dfi== \partial f_{i+1,j,k} + float dfj[3]={0,0,0}; + float dfk[3]={0,0,0}; + gradient(f,df ,z ,y ,x , 
depth,rows,cols); + gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); + gradient(f,dfj ,z ,y+1,x , depth,rows,cols); + gradient(f,dfk ,z+1,y ,x , depth,rows,cols); + float eps=0.000001; //% avoid division by zero + dftv[idx]=(df[0]+df[1]+df[2])/(sqrt(df[0] *df[0] +df[1] *df[1] +df[2] *df[2])+eps) + -dfi[2]/(sqrt(dfi[0]*dfi[0]+dfi[1]*dfi[1]+dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. + -dfj[1]/(sqrt(dfj[0]*dfj[0]+dfj[1]*dfj[1]+dfj[2]*dfj[2]) +eps) + -dfk[0]/(sqrt(dfk[0]*dfk[0]+dfk[1]*dfk[1]+dfk[2]*dfk[2]) +eps); + + } + + __device__ void warpReduce(volatile float *sdata, size_t tid) { + sdata[tid] += sdata[tid + 32]; + sdata[tid] += sdata[tid + 16]; + sdata[tid] += sdata[tid + 8]; + sdata[tid] += sdata[tid + 4]; + sdata[tid] += sdata[tid + 2]; + sdata[tid] += sdata[tid + 1]; + } + + __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ + extern __shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + float value=0; + while (i < n) { + value=g_idata[i]; //avoid reading twice + mySum += value*value; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + __syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + __global__ void reduceSum(float *g_idata, float 
*g_odata, size_t n){ + extern __shared__ volatile float sdata[]; + //http://stackoverflow.com/a/35133396/1485872 + size_t tid = threadIdx.x; + size_t i = blockIdx.x*blockDim.x + tid; + size_t gridSize = blockDim.x*gridDim.x; + float mySum = 0; + // float value=0; + while (i < n) { + mySum += g_idata[i]; + i += gridSize; + } + sdata[tid] = mySum; + __syncthreads(); + + if (tid < 512) + sdata[tid] += sdata[tid + 512]; + __syncthreads(); + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + +#if (__CUDART_VERSION >= 9000) + if ( tid < 32 ) + { + mySum = sdata[tid] + sdata[tid + 32]; + for (int offset = warpSize/2; offset > 0; offset /= 2) { + mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); + } + } +#else + if (tid < 32) { + warpReduce(sdata, tid); + mySum = sdata[0]; + } +#endif + if (tid == 0) g_odata[blockIdx.x] = mySum; + } + + +bool isnan_cuda(float* vec, size_t size){ + bool*d_nan; + bool h_nan; + cudaMalloc((void **)&d_nan, sizeof (bool)); + isnan_device<<<60,MAXTHREADS>>>(vec,size,d_nan); + cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost); + return h_nan; + +} + +// main function + void piccs_tv(const float* img,const float* prior, float* dst,float alpha,float ratio, const long* image_size, int maxIter, const GpuIds& gpuids){ + + + + + size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; + size_t mem_size = sizeof(float) * total_pixels; + + float *d_image,*d_prior,*d_dpiccsTV, *d_dimgTV,*d_aux_small,*d_aux_image, *d_norm2; + // memory for image + cudaMalloc(&d_image, mem_size); + cudaMalloc(&d_prior, mem_size); + + cudaCheckErrors("Malloc Image error"); + cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice); + cudaMemcpy(d_prior, prior, mem_size, cudaMemcpyHostToDevice); + cudaCheckErrors("Memory Malloc and Memset: SRC"); + // memory for df + 
cudaMalloc(&d_dimgTV, mem_size); + cudaMalloc(&d_dpiccsTV, mem_size); + cudaCheckErrors("Memory Malloc and Memset: TV"); + cudaMalloc(&d_norm2, mem_size); + cudaCheckErrors("Memory Malloc and Memset: TV"); + cudaMalloc(&d_aux_image, mem_size); + cudaCheckErrors("Memory Malloc and Memset: TV"); + + // memory for L2norm auxiliar + cudaMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); + cudaCheckErrors("Memory Malloc and Memset: NORMAux"); + + + + // For the gradient + dim3 blockGrad(10, 10, 10); + dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (image_size[2]+blockGrad.z-1)/blockGrad.z); + + // For the reduction + float sumnorm2; + size_t dimblockRed = MAXTHREADS; + size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; + + + for(unsigned int i=0;i>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0]); + cudaDeviceSynchronize(); + cudaCheckErrors("Gradient"); +// mexPrintf("Gradient is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); + + + multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,(1-ratio), total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("Multiplication error"); + + substractArrays<<<60,MAXTHREADS>>>(d_aux_image,d_prior, total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("Substraction error"); + + gradientTV<<>>(d_aux_image,d_dpiccsTV,image_size[2], image_size[1],image_size[0]); + cudaDeviceSynchronize(); + cudaCheckErrors("Gradient"); +// mexPrintf("Gradient piccs is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); + + multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dpiccsTV,ratio, total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("Multiplication error"); +// mexPrintf("Multiplication is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? 
"true" : "false"); + + + addArrays<<<60,MAXTHREADS>>>(d_dimgTV,d_dpiccsTV,total_pixels); + cudaDeviceSynchronize(); + //NOMRALIZE via reduction + //mexPrintf("Pre-norm2 is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); + cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice); + cudaCheckErrors("Copy from gradient call error"); + reduceNorm2 << > >(d_norm2, d_aux_small, total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("reduce1"); + if (dimgridRed > 1) { + reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_aux_small, d_norm2, dimgridRed); + cudaDeviceSynchronize(); + cudaCheckErrors("reduce2"); + cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrors("cudaMemcpy"); + + } + else { + cudaMemcpy(&sumnorm2, d_aux_small, sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrors("cudaMemcpy"); + } +// mexPrintf("alpha/sqrt(sumnorm2): %f\n",alpha/sqrt(sumnorm2)); + //MULTIPLY HYPERPARAMETER sqrt(sumnorm2) + multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha/sqrt(sumnorm2), total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("Multiplication error"); + //SUBSTRACT GRADIENT + substractArrays <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels); + cudaDeviceSynchronize(); + cudaCheckErrors("Substraction error"); +// mexPrintf("Final update is nan: %s\n",isnan_cuda(d_image,total_pixels) ? 
"true" : "false"); +// mexPrintf("\n"); + sumnorm2=0; + } + + cudaCheckErrors("TV minimization"); + + cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost); + cudaCheckErrors("Copy result back"); + + cudaFree(d_image); + cudaFree(d_dpiccsTV); + cudaFree(d_aux_image); + cudaFree(d_aux_small); + cudaFree(d_prior); + cudaFree(d_norm2); + + + cudaCheckErrors("Memory free"); + cudaDeviceReset(); + } + diff --git a/Common/CUDA/PICCS.hpp.prehip b/Common/CUDA/PICCS.hpp.prehip new file mode 100644 index 00000000..e3592dbb --- /dev/null +++ b/Common/CUDA/PICCS.hpp.prehip @@ -0,0 +1,61 @@ +/*------------------------------------------------------------------------- + * + * Header for CUDA functions for Steepest descend in POCS-type algorithms. + * + * This file has the required headers for POCS_TV.cu + * + * CODE by Ander Biguri + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + + + + +#ifndef GD_TV_HPP +#define GD_TV_HPP +#include "TIGRE_common.hpp" +#include "GpuIds.hpp" + +void piccs_tv(const float* img,const float* prior, float* dst,float alpha, float ratio, const long* image_size, int maxIter, const GpuIds& gpuids); + + +#endif \ No newline at end of file diff --git a/Common/CUDA/RandomNumberGenerator.cu b/Common/CUDA/RandomNumberGenerator.cu index d7d1224a..5910b407 100644 --- a/Common/CUDA/RandomNumberGenerator.cu +++ b/Common/CUDA/RandomNumberGenerator.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for random number generator @@ -45,40 +46,40 @@ #include #include -#include -#include -#include +#include +#include +#include #include "gpuUtils.hpp" #include "RandomNumberGenerator.hpp" #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if 
(__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("RandomNumberGenerator:",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("RandomNumberGenerator:",hipGetErrorString(__err));\ } \ } while (0) -__global__ void setup_kernel(curandState *state) { +__global__ void setup_kernel(hiprandState *state) { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Each thread gets same seed, a different sequence number, no offset */ - curand_init(1234, idx, 0, &state[idx]); + hiprand_init(1234, idx, 0, &state[idx]); } -__global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t uiLen, float* pfOut) { +__global__ void GeneratePoisson(hiprandState *state, const float* pfIn, size_t uiLen, float* pfOut) { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; + hiprandState localState = state[idx]; int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); for (int iI = 0; iI < iIter; ++iI) { size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; if (uiPos < uiLen) { /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]); pfOut[uiPos] = (float)uiPoisson; } } @@ -86,7 +87,7 @@ __global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t ui state[idx] = localState; } -__global__ void GeneratePoissonAddGaussian(curandState *state, +__global__ void GeneratePoissonAddGaussian(hiprandState *state, const float* pfIn, size_t uiLen, float fGaussMu, @@ -95,15 +96,15 @@ __global__ void GeneratePoissonAddGaussian(curandState *state, { int idx = threadIdx.x + blockIdx.x * blockDim.x; /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; + hiprandState localState = state[idx]; int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); for (int iI = 0; iI < iIter; ++iI) { 
size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; if (uiPos < uiLen) { /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + unsigned int uiPoisson = hiprand_poisson(&localState, pfIn[uiPos]); /* Gaussian */ - float fNormal = curand_normal(&localState) * fGaussSigma + fGaussMu; + float fNormal = hiprand_normal(&localState) * fGaussSigma + fGaussMu; pfOut[uiPos] = fNormal + (float)uiPoisson; } } @@ -127,31 +128,31 @@ void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpu // printf("poisson_1d(pfIn = %p, uiLen = %zd, pfOut = %p)\n", pfIn, uiLen, pfOut); float* d_pfIn = nullptr; float* d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - cudaCheckErrors("poisson_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_1d fail cudaMemcpy 1"); + hipMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail hipMalloc 1"); + hipMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail hipMalloc 2"); + hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice); + cudaCheckErrors("poisson_1d fail hipMemcpy 1"); // float fMin, fMax; // GetMinMax(pfIn, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; + hiprandState *curandStates = nullptr; const int kiBlockDim = 1024; // Threads per Block const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; - cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); - cudaCheckErrors("poisson_1d fail cudaMalloc 3"); + hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState)); + cudaCheckErrors("poisson_1d fail hipMalloc 3"); setup_kernel<<>>(curandStates); GeneratePoisson<<>>(curandStates, d_pfIn, uiLen, d_pfOut); - 
cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_1d fail cudaMemcpy 2"); + hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("poisson_1d fail hipMemcpy 2"); // GetMinMax(pfOut, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; + hipFree(d_pfIn); d_pfIn = nullptr; + hipFree(d_pfOut); d_pfOut = nullptr; + hipFree(curandStates); curandStates = nullptr; } void poisson_gaussian_1d(const float* pfIn, @@ -164,30 +165,30 @@ void poisson_gaussian_1d(const float* pfIn, // printf("poisson_gaussian_1d(pfIn = %p, uiLen = %zd, fGaussMu = %+f, fGaussSigma = %f, pfOut = %p)\n", pfIn, uiLen, fGaussMu, fGaussSigma, pfOut); float* d_pfIn = nullptr; float* d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 1"); + hipMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 1"); + hipMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 2"); + hipMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), hipMemcpyHostToDevice); + cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 1"); // float fMin, fMax; // GetMinMax(pfIn, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; + hiprandState *curandStates = nullptr; const int kiBlockDim = 64; // Threads per Block const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; - cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); 
- cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 3"); + hipMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(hiprandState)); + cudaCheckErrors("poisson_gaussian_1d fail hipMalloc 3"); setup_kernel<<>>(curandStates); GeneratePoissonAddGaussian<<>>(curandStates, d_pfIn, uiLen, fGaussMu, fGaussSigma, d_pfOut); - cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 2"); + hipMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), hipMemcpyDeviceToHost); + cudaCheckErrors("poisson_gaussian_1d fail hipMemcpy 2"); // GetMinMax(pfOut, uiLen, fMin, fMax); // printf("fMin, fMax = %f, %f\n", fMin, fMax); - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; + hipFree(d_pfIn); d_pfIn = nullptr; + hipFree(d_pfOut); d_pfOut = nullptr; + hipFree(curandStates); curandStates = nullptr; } diff --git a/Common/CUDA/RandomNumberGenerator.cu.prehip b/Common/CUDA/RandomNumberGenerator.cu.prehip new file mode 100644 index 00000000..d7d1224a --- /dev/null +++ b/Common/CUDA/RandomNumberGenerator.cu.prehip @@ -0,0 +1,193 @@ +/*------------------------------------------------------------------------- + * + * CUDA functions for random number generator + * + * Adds noise of Poisson and normal distribution to the input. + * + * CODE by Tomoyuki SADAKANE + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include + +#include "gpuUtils.hpp" +#include "RandomNumberGenerator.hpp" + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + cudaDeviceReset();\ + mexErrMsgIdAndTxt("RandomNumberGenerator:",cudaGetErrorString(__err));\ + } \ +} while (0) + + +__global__ void setup_kernel(curandState *state) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + /* Each thread gets same seed, a different sequence number, no offset */ + curand_init(1234, idx, 0, &state[idx]); +} + +__global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t uiLen, float* pfOut) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + /* Copy state to local memory for efficiency */ + curandState localState = state[idx]; + int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); + for (int iI = 0; iI < iIter; ++iI) { + size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; + if (uiPos < uiLen) { + /* Poisson */ + unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + pfOut[uiPos] = (float)uiPoisson; + } + } + /* Copy state back to global memory */ + state[idx] = localState; +} + +__global__ void GeneratePoissonAddGaussian(curandState *state, + const float* pfIn, + size_t uiLen, + float fGaussMu, + float fGaussSigma, + float* pfOut) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + /* Copy state to local memory for efficiency */ + curandState localState = state[idx]; + int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); + for (int iI = 0; iI < iIter; ++iI) { + size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; + if (uiPos < uiLen) { + /* Poisson */ + 
unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); + /* Gaussian */ + float fNormal = curand_normal(&localState) * fGaussSigma + fGaussMu; + pfOut[uiPos] = fNormal + (float)uiPoisson; + } + } + /* Copy state back to global memory */ + state[idx] = localState; +} + + +template +void GetMinMax(const T_value* pfIn, size_t uiLen, T_value& tvMin, T_value& tvMax) { + tvMin = pfIn[0]; + tvMax = pfIn[0]; + T_value tvVal; + for (int iI = 1; iI < uiLen; ++iI) { + tvVal = pfIn[iI]; + if (tvMax < tvVal) { tvMax = tvVal; continue;} + if (tvMin > tvVal) { tvMin = tvVal; continue;} + } +} +void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpuids) { + // printf("poisson_1d(pfIn = %p, uiLen = %zd, pfOut = %p)\n", pfIn, uiLen, pfOut); + float* d_pfIn = nullptr; + float* d_pfOut = nullptr; + cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail cudaMalloc 1"); + cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_1d fail cudaMalloc 2"); + cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); + cudaCheckErrors("poisson_1d fail cudaMemcpy 1"); + + // float fMin, fMax; + // GetMinMax(pfIn, uiLen, fMin, fMax); + // printf("fMin, fMax = %f, %f\n", fMin, fMax); + curandState *curandStates = nullptr; + const int kiBlockDim = 1024; // Threads per Block + const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; + cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); + cudaCheckErrors("poisson_1d fail cudaMalloc 3"); + setup_kernel<<>>(curandStates); + GeneratePoisson<<>>(curandStates, d_pfIn, uiLen, d_pfOut); + cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrors("poisson_1d fail cudaMemcpy 2"); + // GetMinMax(pfOut, uiLen, fMin, fMax); + // printf("fMin, fMax = %f, %f\n", fMin, fMax); + + cudaFree(d_pfIn); d_pfIn = nullptr; + cudaFree(d_pfOut); d_pfOut = nullptr; + cudaFree(curandStates); curandStates = 
nullptr; +} + +void poisson_gaussian_1d(const float* pfIn, + size_t uiLen, + float fGaussMu, + float fGaussSigma, + float* pfOut, + GpuIds& gpuids) +{ + // printf("poisson_gaussian_1d(pfIn = %p, uiLen = %zd, fGaussMu = %+f, fGaussSigma = %f, pfOut = %p)\n", pfIn, uiLen, fGaussMu, fGaussSigma, pfOut); + float* d_pfIn = nullptr; + float* d_pfOut = nullptr; + cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 1"); + cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); + cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 2"); + cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); + cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 1"); + + // float fMin, fMax; + // GetMinMax(pfIn, uiLen, fMin, fMax); + // printf("fMin, fMax = %f, %f\n", fMin, fMax); + curandState *curandStates = nullptr; + const int kiBlockDim = 64; // Threads per Block + const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; + cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); + cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 3"); + setup_kernel<<>>(curandStates); + GeneratePoissonAddGaussian<<>>(curandStates, d_pfIn, uiLen, fGaussMu, fGaussSigma, d_pfOut); + cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 2"); + // GetMinMax(pfOut, uiLen, fMin, fMax); + // printf("fMin, fMax = %f, %f\n", fMin, fMax); + + + cudaFree(d_pfIn); d_pfIn = nullptr; + cudaFree(d_pfOut); d_pfOut = nullptr; + cudaFree(curandStates); curandStates = nullptr; +} diff --git a/Common/CUDA/RandomNumberGenerator.hpp.prehip b/Common/CUDA/RandomNumberGenerator.hpp.prehip new file mode 100644 index 00000000..4ba68d8d --- /dev/null +++ b/Common/CUDA/RandomNumberGenerator.hpp.prehip @@ -0,0 +1,49 @@ +/*------------------------------------------------------------------------- + * + * Header CUDA functions for random number generator + * + * 
Adds noise of Poisson and normal distribution to the input. + * + * CODE by Tomoyuki SADAKANE + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + +#include "TIGRE_common.hpp" +#include "GpuIds.hpp" +void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpuids); +void poisson_gaussian_1d(const float* pfPoissonL, size_t uiLen, float fGaussMu, float fGaussSigma, float* pfOut, GpuIds& gpuids); diff --git a/Common/CUDA/Siddon_projection.cu b/Common/CUDA/Siddon_projection.cu index 2a025f8c..8e551626 100644 --- a/Common/CUDA/Siddon_projection.cu +++ b/Common/CUDA/Siddon_projection.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for ray-voxel intersection based projection @@ -48,18 +49,18 @@ */ #include -#include -#include +#include +#include #include "Siddon_projection.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("Ax:Siddon_projection",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("Ax:Siddon_projection",hipGetErrorString(__err));\ } \ } while (0) @@ -94,7 +95,7 @@ do { \ * **/ - void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc); + void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool alloc); __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device @@ -111,7 +112,7 @@ __global__ void kernelPixelDetector( Geometry geo, float* detector, const int currProjSetNumber, const int totalNoOfProjections, - cudaTextureObject_t tex){ + 
hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; @@ -311,10 +312,10 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * if (!fits_in_memory){ dProjection_accum=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); - cudaMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); + hipMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); + hipMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); cudaCheckErrors("cudaMallocauxiliarty projections fail"); } } @@ -323,12 +324,12 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // This is happening regarthless if the image fits on memory float** dProjection=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); - cudaMemset(dProjection[dev*2+i] ,0,num_bytes_proj); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); + hipMemset(dProjection[dev*2+i] ,0,num_bytes_proj); + cudaCheckErrors("hipMalloc projections fail"); } } @@ -338,13 +339,13 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } #endif cudaCheckErrors("Error pinning memory"); @@ -354,18 +355,18 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // auxiliary variables Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); cudaCheckErrors("Error allocating auxiliary constant memory"); // Create Streams for overlapping memcopy and compute int nStreams=deviceCount*2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i+dev*2]); + hipStreamCreate(&stream[i+dev*2]); } } @@ -376,8 +377,8 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * unsigned int noOfKernelCalls = (nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds 
checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. int projection_this_block; - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + hipTextureObject_t *texImg = new hipTextureObject_t[deviceCount]; + hipArray **d_cuArrTex = new hipArray*[deviceCount]; for (unsigned int sp=0;sp>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); } @@ -450,7 +451,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // 1) grab previous results and put them in the auxiliary variable dProjection_accum for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -463,12 +464,12 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else projection_this_block=PROJ_PER_BLOCK; - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + hipMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyHostToDevice,stream[dev*2+1]); } // 2) take the results from current compute call and add it to the code in execution. 
for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -481,7 +482,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else projection_this_block=PROJ_PER_BLOCK; - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + hipStreamSynchronize(stream[dev*2+1]); // wait until copy is finished vecAddInPlace<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); } } // end accumulation case, where the image needs to be split @@ -490,7 +491,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * if (i>0){ for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on previous set on this GPU proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; if (dev+1==deviceCount) { //is it the last device? @@ -510,13 +511,13 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * else { projection_this_block=PROJ_PER_BLOCK; } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } } // Make sure Computation on kernels has finished before we launch the next batch. 
for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*2]); } } @@ -524,7 +525,7 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * // We still have the last set of projections to get out of GPUs for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -533,106 +534,106 @@ int siddon_ray_projection(float* img, Geometry geo, float** result,float const * projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. + hipDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } // Make sure everyone has done their bussiness before the next image split: - cudaDeviceSynchronize(); + hipDeviceSynchronize(); } // End image split loop. 
cudaCheckErrors("Main loop fail"); /////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texImg[dev]); + hipFreeArray(d_cuArrTex[dev]); } delete[] texImg; texImg = 0; delete[] d_cuArrTex; d_cuArrTex = 0; // Freeing Stage for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection[dev*2]); + hipFree(dProjection[dev*2+1]); } free(dProjection); if(!fits_in_memory){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection_accum[dev*2]); + hipFree(dProjection_accum[dev*2+1]); } free(dProjection_accum); } freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); + hipHostFree(projParamsArrayHost); for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostUnregister(img); + hipHostUnregister(img); } - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); #endif - //cudaDeviceReset(); + //hipDeviceReset(); return 0; } -void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc) +void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool alloc) { //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = 
make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); const unsigned int num_devices = gpuids.GetLength(); if(alloc){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params); } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = 
cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModePoint; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } cudaCheckErrors("Texture object creation fail"); } @@ -842,8 +843,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree +#include +#include +#include "Siddon_projection.hpp" +#include "TIGRE_common.hpp" +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("Ax:Siddon_projection",cudaGetErrorString(__err));\ + } \ +} while (0) + + +#define MAXTREADS 1024 +#define PROJ_PER_BLOCK 9 +#define PIXEL_SIZE_BLOCK 9 + /*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * --->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ + + void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc); + +__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device + + 
+__global__ void vecAddInPlace(float *a, float *b, unsigned long n) +{ + int idx = blockIdx.x*blockDim.x+threadIdx.x; + // Make sure we do not go out of bounds + if (idx < n) + a[idx] = a[idx] + b[idx]; +} + +__global__ void kernelPixelDetector( Geometry geo, + float* detector, + const int currProjSetNumber, + const int totalNoOfProjections, + cudaTextureObject_t tex){ + + + unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; + unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long projNumber=threadIdx.z; + + + if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) + return; + +#if IS_FOR_MATLAB_TIGRE + size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#else + size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#endif + unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array (for a given GPU) + + if(indAlpha>=totalNoOfProjections) + return; + + Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaU = projParamsArrayDev[4*projNumber+1]; + Point3D deltaV = projParamsArrayDev[4*projNumber+2]; + Point3D source = projParamsArrayDev[4*projNumber+3]; + + /////// Get coordinates XYZ of pixel UV + unsigned long pixelV = geo.nDetecV-v-1; + unsigned long pixelU = u; + Point3D pixel1D; + pixel1D.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); + pixel1D.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); + pixel1D.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); + /////// + // Siddon's ray-voxel intersection, optimized as in doi=10.1.1.55.7516 + ////// + // Also called Jacobs algorithms + Point3D ray; + // vector of Xray + ray.x=pixel1D.x-source.x; + ray.y=pixel1D.y-source.y; + 
ray.z=pixel1D.z-source.z; + float eps=0.001; + ray.x=(fabsf(ray.x) Nvoxel+1 + + axm=fminf(__fdividef(-source.x,ray.x),__fdividef(geo.nVoxelX-source.x,ray.x)); + aym=fminf(__fdividef(-source.y,ray.y),__fdividef(geo.nVoxelY-source.y,ray.y)); + azm=fminf(__fdividef(-source.z,ray.z),__fdividef(geo.nVoxelZ-source.z,ray.z)); + axM=fmaxf(__fdividef(-source.x,ray.x),__fdividef(geo.nVoxelX-source.x,ray.x)); + ayM=fmaxf(__fdividef(-source.y,ray.y),__fdividef(geo.nVoxelY-source.y,ray.y)); + azM=fmaxf(__fdividef(-source.z,ray.z),__fdividef(geo.nVoxelZ-source.z,ray.z)); + + float am=fmaxf(fmaxf(axm,aym),azm); + float aM=fminf(fminf(axM,ayM),azM); + + // line intersects voxel space -> am=aM) + detector[idx]=0; + + // Compute max/min image INDEX for intersection eq(11-19) + // Discussion about ternary operator in CUDA: https://stackoverflow.com/questions/7104384/in-cuda-why-is-a-b010-more-efficient-than-an-if-else-version + float imin,imax,jmin,jmax,kmin,kmax; + // for X + if( source.x(tex, i, j, k); + i=i+iu; + ac=ax; + ax+=axu; + }else if(ay==aminc){ + sum+=(ay-ac)*tex3D(tex, i, j, k); + j=j+ju; + ac=ay; + ay+=ayu; + }else if(az==aminc){ + sum+=(az-ac)*tex3D(tex, i, j, k); + k=k+ku; + ac=az; + az+=azu; + } + aminc=fminf(fminf(ax,ay),az); + } + detector[idx]=sum*maxlength; +} + + +int siddon_ray_projection(float* img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("Ax:Siddon_projection:GPUselect","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("Ax:Siddon_projection:GPUselect","Detected one (or more) 
different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + int dev; + + // Check free memory + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + + size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); + size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); + + // Does everything fit in the GPUs? + const bool fits_in_memory = mem_image+2*PROJ_PER_BLOCK*mem_proj= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to + // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
+#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ + cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + } +#endif + cudaCheckErrors("Error pinning memory"); + + + + // auxiliary variables + Point3D source, deltaU, deltaV, uvOrigin; + Point3D* projParamsArrayHost; + cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + cudaCheckErrors("Error allocating auxiliary constant memory"); + + // Create Streams for overlapping memcopy and compute + int nStreams=deviceCount*2; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < 2; ++i){ + cudaStreamCreate(&stream[i+dev*2]); + + } + } + cudaCheckErrors("Stream creation fail"); + + int nangles_device=(nangles+deviceCount-1)/deviceCount; + int nangles_last_device=(nangles-(deviceCount-1)*nangles_device); + unsigned int noOfKernelCalls = (nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK + unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. + int projection_this_block; + cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; + cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + + for (unsigned int sp=0;sp=nangles) + break; + if ((i*PROJ_PER_BLOCK+j)>=nangles_device) + break; + geoArray[sp].alpha=angles[proj_global*3]; + geoArray[sp].theta=angles[proj_global*3+1]; + geoArray[sp].psi =angles[proj_global*3+2]; + + + //precomute distances for faster execution + //Precompute per angle constant stuff for speed + computeDeltas_Siddon(geoArray[sp],proj_global, &uvOrigin, &deltaU, &deltaV, &source); + //Ray tracing! 
+ projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection + projParamsArrayHost[4*j+1]=deltaU; + projParamsArrayHost[4*j+2]=deltaV; + projParamsArrayHost[4*j+3]=source; + + } + cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*2]); + cudaStreamSynchronize(stream[dev*2]); + cudaCheckErrors("kernel fail"); + kernelPixelDetector<<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); + } + + + // Now that the computation is happening, we need to either prepare the memory for + // combining of the projections (splits>1) and start removing previous results. + + + // If our image does not fit in memory then we need to make sure we accumulate previous results too. + // This is done in 2 steps: + // 1)copy previous results back into GPU + // 2)accumulate with current results + // The code to take them out is the same as when there are no splits needed + if( !fits_in_memory&&sp>0) + { + // 1) grab previous results and put them in the auxiliary variable dProjection_accum + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + + // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... + if(i+1==noOfKernelCalls) //is it the last block? 
+ projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + else + projection_this_block=PROJ_PER_BLOCK; + + cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + } + // 2) take the results from current compute call and add it to the code in execution. + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + + // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... + if(i+1==noOfKernelCalls) //is it the last block? + projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + else + projection_this_block=PROJ_PER_BLOCK; + + cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + vecAddInPlace<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); + } + } // end accumulation case, where the image needs to be split + + // Now, lets get out the projections from the previous execution of the kernels. + if (i>0){ + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on previous set on this GPU + proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; + if (dev+1==deviceCount) { //is it the last device? 
+ // projections assigned to this device is >=nangles_device-(deviceCount-1) and < nangles_device + if (i-1 < noOfKernelCallsLastDev) { + // The previous set(block) was not empty. + projection_this_block=min(PROJ_PER_BLOCK, nangles-proj_global); + } + else { + // The previous set was empty. + // This happens if deviceCount > PROJ_PER_BLOCK+1. + // e.g. PROJ_PER_BLOCK = 9, deviceCount = 11, nangles = 199. + // e.g. PROJ_PER_BLOCK = 1, deviceCount = 3, nangles = 7. + break; + } + } + else { + projection_this_block=PROJ_PER_BLOCK; + } + cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + } + } + // Make sure Computation on kernels has finished before we launch the next batch. + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaStreamSynchronize(stream[dev*2]); + } + } + + + // We still have the last set of projections to get out of GPUs + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + // How many projections are left here? + projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + + cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. 
+ cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); + cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + } + // Make sure everyone has done their bussiness before the next image split: + cudaDeviceSynchronize(); + } // End image split loop. + + cudaCheckErrors("Main loop fail"); + /////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////// + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDestroyTextureObject(texImg[dev]); + cudaFreeArray(d_cuArrTex[dev]); + } + delete[] texImg; texImg = 0; + delete[] d_cuArrTex; d_cuArrTex = 0; + // Freeing Stage + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dProjection[dev*2]); + cudaFree(dProjection[dev*2+1]); + + } + free(dProjection); + + if(!fits_in_memory){ + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dProjection_accum[dev*2]); + cudaFree(dProjection_accum[dev*2+1]); + + } + free(dProjection_accum); + } + freeGeoArray(splits,geoArray); + cudaFreeHost(projParamsArrayHost); + + + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]) ; +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ + cudaHostUnregister(img); + } + cudaCheckErrors("cudaFree fail"); +#endif + //cudaDeviceReset(); + return 0; +} + + + + +void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc) +{ + //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; + const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const unsigned int num_devices = gpuids.GetLength(); + if(alloc){ + for (unsigned int dev = 0; dev < num_devices; dev++){ + 
cudaSetDevice(gpuids[dev]); + + //cudaArray Descriptor + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + } + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemcpy3DParms copyParams = {0}; + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[dev]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params); + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[dev]; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + cudaCheckErrors("Texture object creation fail"); +} + +/* This code generates the geometries needed to split the image properly in + * cases where the entire image does not fit in the memory of the GPU + **/ +void splitImage(unsigned int splits,Geometry geo,Geometry* geoArray, unsigned int nangles){ + + unsigned long splitsize=(geo.nVoxelZ+splits-1)/splits;// ceil if not divisible + for(unsigned int sp=0;spx; + auxPoint.y=point->y; + auxPoint.z=point->z; + + point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - 
sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->z=-sin(geo.dPitch[i])*auxPoint.x + +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y + +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; + +} +void eulerZYZ(Geometry geo, Point3D* point){ + Point3D auxPoint; + auxPoint.x=point->x; + auxPoint.y=point->y; + auxPoint.z=point->z; + + point->x=(+cos(geo.alpha)*cos(geo.theta)*cos(geo.psi)-sin(geo.alpha)*sin(geo.psi))*auxPoint.x+ + (-cos(geo.alpha)*cos(geo.theta)*sin(geo.psi)-sin(geo.alpha)*cos(geo.psi))*auxPoint.y+ + cos(geo.alpha)*sin(geo.theta)*auxPoint.z; + + point->y=(+sin(geo.alpha)*cos(geo.theta)*cos(geo.psi)+cos(geo.alpha)*sin(geo.psi))*auxPoint.x+ + (-sin(geo.alpha)*cos(geo.theta)*sin(geo.psi)+cos(geo.alpha)*cos(geo.psi))*auxPoint.y+ + sin(geo.alpha)*sin(geo.theta)*auxPoint.z; + + point->z=-sin(geo.theta)*cos(geo.psi)*auxPoint.x+ + sin(geo.theta)*sin(geo.psi)*auxPoint.y+ + cos(geo.theta)*auxPoint.z; + + +} +//______________________________________________________________________________ +// +// Function: freeGeoArray +// +// Description: Frees the memory from the geometry array for multiGPU. 
+//______________________________________________________________________________ +void freeGeoArray(unsigned int splits,Geometry* geoArray){ + for(unsigned int sp=0;sp -#include -#include +#include +#include #include "Siddon_projection_parallel.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",hipGetErrorString(__err));\ } \ } while (0) // Declare the texture reference. -void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); +void CreateTextureParallel(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream); #define MAXTREADS 1024 @@ -105,7 +106,7 @@ __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is o __global__ void kernelPixelDetector_parallel( Geometry geo, - float* detector, const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex){ + float* detector, const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; @@ -282,23 +283,23 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa size_t num_bytes = (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)PROJ_PER_BLOCK* (size_t)sizeof(float); float** dProjection=(float **)malloc(2*sizeof(float *)); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[i], num_bytes); + cudaCheckErrors("hipMalloc projections fail"); } int nStreams=2; - cudaStream_t* 
stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + hipTextureObject_t *texImg = 0; + hipArray **d_cuArrTex = 0; + texImg =(hipTextureObject_t*)malloc(1*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(1*sizeof(hipArray*)); CreateTextureParallel(img,geo,&d_cuArrTex[0], &texImg [0],stream); cudaCheckErrors("Texture allocation fail"); @@ -310,7 +311,7 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); // 16x16 gave the best performance empirically // Funnily that makes it compatible with most GPUs..... 
@@ -349,36 +350,36 @@ int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,floa } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDev), projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelDetector_parallel<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); // copy result to host if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, hipMemcpyDeviceToHost,stream[1]); } - cudaDeviceSynchronize(); + hipDeviceSynchronize(); int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[1]); - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); + hipDestroyTextureObject(texImg[0]); + hipFreeArray(d_cuArrTex[0]); free(texImg); texImg = 0; free(d_cuArrTex); d_cuArrTex = 0; cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); + hipFree(dProjection[0]); + hipFree(dProjection[1]); free(dProjection); - cudaFreeHost(projParamsArrayHost); - cudaCheckErrors("cudaFree d_imagedata fail"); + hipHostFree(projParamsArrayHost); + cudaCheckErrors("hipFree d_imagedata fail"); for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); } -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } @@ -482,41 +483,41 @@ void 
computeDeltas_Siddon_parallel(Geometry geo, float angles,int i, Point3D* uv *source=S2; } -void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; +void CreateTextureParallel(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[1]); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = 
cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModePoint; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); } diff --git a/Common/CUDA/Siddon_projection_parallel.cu.prehip b/Common/CUDA/Siddon_projection_parallel.cu.prehip new file mode 100644 index 00000000..25a07e9d --- /dev/null +++ b/Common/CUDA/Siddon_projection_parallel.cu.prehip @@ -0,0 +1,540 @@ +/*------------------------------------------------------------------------- + * + * CUDA functions for ray-voxel intersection based projection + * + * This file has the necessary fucntiosn to perform X-ray parallel projection + * operation given a geaometry, angles and image. It usesthe so-called + * Jacobs algorithm to compute efficiently the length of the x-rays over + * voxel space. Its called Siddon because Jacobs algorithm its just a small + * improvement over the traditional Siddons method. + * + * CODE by Ander Biguri + * + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + + +#include +#include +#include +#include "Siddon_projection_parallel.hpp" +#include "TIGRE_common.hpp" +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",cudaGetErrorString(__err));\ + } \ +} while (0) + + +// Declare the texture reference. 
+void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); + + +#define MAXTREADS 1024 +#define PROJ_PER_BLOCK 9 +#define PIXEL_SIZE_BLOCK 9 +/*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * --->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ + + +__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device + + +__global__ void kernelPixelDetector_parallel( Geometry geo, + float* detector, const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex){ + + unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; + unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long projNumber=threadIdx.z; + + if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) + return; + + unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array + + +#if IS_FOR_MATLAB_TIGRE + size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#else + size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#endif + + if(indAlpha>=totalNoOfProjections) + return; + + Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaU = projParamsArrayDev[4*projNumber+1]; + Point3D deltaV = projParamsArrayDev[4*projNumber+2]; + Point3D source = projParamsArrayDev[4*projNumber+3]; + + + /////// Get coordinates XYZ of pixel UV + unsigned long pixelV = geo.nDetecV-v-1; + 
unsigned long pixelU = u; + Point3D pixel1D; + pixel1D.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); + pixel1D.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); + pixel1D.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); + + + source.x=(source.x+pixelU*deltaU.x+pixelV*deltaV.x); + source.y=(source.y+pixelU*deltaU.y+pixelV*deltaV.y); + source.z=(source.z+pixelU*deltaU.z+pixelV*deltaV.z); + /////// + // Siddon's ray-voxel intersection, optimized as in doi=10.1.1.55.7516 + ////// + Point3D ray; + // vector of Xray + ray.x=pixel1D.x-source.x; + ray.y=pixel1D.y-source.y; + ray.z=pixel1D.z-source.z; + // This variables are ommited because + // bx,by,bz ={0,0,0} + // dx,dy,dz ={1,1,1} + // compute parameter values for x-ray parametric equation. eq(3-10) + float axm,aym,azm; + float axM,ayM,azM; + + /************************************** + * + * + * Problem. In paralel beam, often ray.y or ray.x=0; + * This leads to infinities progpagating and breaking everything. + * + * We need to fix it. + * + ***************************************/ + + // In the paper Nx= number of X planes-> Nvoxel+1 + axm=fminf(-source.x/ray.x,(geo.nVoxelX-source.x)/ray.x); + aym=fminf(-source.y/ray.y,(geo.nVoxelY-source.y)/ray.y); +// azm=min(-source.z/ray.z,(geo.nVoxelZ-source.z)/ray.z); + axM=fmaxf(-source.x/ray.x,(geo.nVoxelX-source.x)/ray.x); + ayM=fmaxf(-source.y/ray.y,(geo.nVoxelY-source.y)/ray.y); +// azM=max(-source.z/ray.z,(geo.nVoxelZ-source.z)/ray.z); + float am=(fmaxf(axm,aym)); + float aM=(fminf(axM,ayM)); + + // line intersects voxel space -> am=aM) + detector[idx]=0.0f; + + // Compute max/min image INDEX for intersection eq(11-19) + // Discussion about ternary operator in CUDA: https://stackoverflow.com/questions/7104384/in-cuda-why-is-a-b010-more-efficient-than-an-if-else-version + float imin,imax,jmin,jmax; + // for X + if( source.x(tex, i, j, k);//(ax-ac)* + i=i+iu; + ac=ax; + ax+=axu; + }else if(ay==aminc){ + sum+=(ay-ac)*tex3D(tex, i, j, k);//(ay-ac)* + j=j+ju; + ac=ay; + 
ay+=ayu; +// }else if(az==aminc){ +// sum+=(az-ac)*tex3D(tex, i+0.5, j+0.5, k+0.5); +// k=k+ku; +// ac=az; +// az+=azu; + } + aminc=fminf(ay,ax); + } + detector[idx]=maxlength*sum; +} + + +int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ + + + + + + size_t num_bytes = (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)PROJ_PER_BLOCK* (size_t)sizeof(float); + float** dProjection=(float **)malloc(2*sizeof(float *)); + for (int i = 0; i < 2; ++i){ + cudaMalloc((void**)&dProjection[i], num_bytes); + cudaCheckErrors("cudaMalloc projections fail"); + } + int nStreams=2; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + + for (int i = 0; i < 2; ++i){ + cudaStreamCreate(&stream[i]); + } + + + + // Texture object variables + cudaTextureObject_t *texImg = 0; + cudaArray **d_cuArrTex = 0; + texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); + d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + + CreateTextureParallel(img,geo,&d_cuArrTex[0], &texImg [0],stream); + cudaCheckErrors("Texture allocation fail"); + //Done! Image put into texture memory. + + + + Point3D source, deltaU, deltaV, uvOrigin; + + + Point3D* projParamsArrayHost; + cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + + // 16x16 gave the best performance empirically + // Funnily that makes it compatible with most GPUs..... 
+ int divU,divV,divangle; + divU=PIXEL_SIZE_BLOCK; + divV=PIXEL_SIZE_BLOCK; + + dim3 numBlocks((geo.nDetecU+divU-1)/divU,(geo.nDetecV+divV-1)/divV,1); + + dim3 threadsPerBlock(divU,divV,PROJ_PER_BLOCK); + + unsigned int proj_global; + unsigned int noOfKernelCalls = (nangles+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK + unsigned int i; + for ( i=0; i=nangles) + break; + geo.alpha=angles[proj_global*3]; + geo.theta=angles[proj_global*3+1]; + geo.psi =angles[proj_global*3+2]; + if(geo.alpha==0.0 || abs(geo.alpha-1.5707963267949)<0.0000001){ + geo.alpha=geo.alpha+1.1920929e-07; + } + + //precomute distances for faster execution + //Precompute per angle constant stuff for speed + computeDeltas_Siddon_parallel(geo,geo.alpha,proj_global, &uvOrigin, &deltaU, &deltaV, &source); + //Ray tracing! + projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection + projParamsArrayHost[4*j+1]=deltaU; + projParamsArrayHost[4*j+2]=deltaV; + projParamsArrayHost[4*j+3]=source; + + } + + cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); + cudaStreamSynchronize(stream[0]); + kernelPixelDetector_parallel<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); + // copy result to host + if (i>0) + cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + } + cudaDeviceSynchronize(); + + int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; + cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + + + + cudaDestroyTextureObject(texImg[0]); + cudaFreeArray(d_cuArrTex[0]); + free(texImg); texImg = 0; + free(d_cuArrTex); d_cuArrTex = 0; + cudaCheckErrors("Unbind fail"); + cudaFree(dProjection[0]); + cudaFree(dProjection[1]); + 
free(dProjection); + cudaFreeHost(projParamsArrayHost); + cudaCheckErrors("cudaFree d_imagedata fail"); + + + for (int i = 0; i < 2; ++i){ + cudaStreamDestroy(stream[i]); + } +// cudaDeviceReset(); + return 0; +} + + + +/* This code precomputes The location of the source and the Delta U and delta V (in the warped space) + * to compute the locations of the x-rays. While it seems verbose and overly-optimized, + * it does saves about 30% of each of the kernel calls. Thats something! + **/ +void computeDeltas_Siddon_parallel(Geometry geo, float angles,int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source){ + Point3D S; + + S.x =geo.DSO[i]; S.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); S.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + + //End point + Point3D P,Pu0,Pv0; + + P.x =-(geo.DSD[i]-geo.DSO[i]); P.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); P.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + Pu0.x=-(geo.DSD[i]-geo.DSO[i]); Pu0.y= geo.dDetecU*(1-((float)geo.nDetecU/2)+0.5); Pu0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + Pv0.x=-(geo.DSD[i]-geo.DSO[i]); Pv0.y= geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); Pv0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-1); + // Geometric trasnformations: + P.x=0;Pu0.x=0;Pv0.x=0; + + // Roll pitch yaw + rollPitchYaw(geo,i,&P); + rollPitchYaw(geo,i,&Pu0); + rollPitchYaw(geo,i,&Pv0); + //Now lets translate the points where they should be: + P.x=P.x-(geo.DSD[i]-geo.DSO[i]); + Pu0.x=Pu0.x-(geo.DSD[i]-geo.DSO[i]); + Pv0.x=Pv0.x-(geo.DSD[i]-geo.DSO[i]); + + S.x=0; + // Roll pitch yaw + rollPitchYaw(geo,i,&S); + //Now lets translate the points where they should be: + S.x=S.x+geo.DSO[i]; + + //1: Offset detector + + //P.x + P.y =P.y +geo.offDetecU[i]; P.z =P.z +geo.offDetecV[i]; + Pu0.y=Pu0.y+geo.offDetecU[i]; Pu0.z=Pu0.z+geo.offDetecV[i]; + Pv0.y=Pv0.y+geo.offDetecU[i]; Pv0.z=Pv0.z+geo.offDetecV[i]; + //S doesnt need to chagne + + + //3: Rotate (around z)! 
+ Point3D Pfinal, Pfinalu0, Pfinalv0; + + Pfinal.x =P.x*cos(geo.alpha)-P.y*sin(geo.alpha); Pfinal.y =P.y*cos(geo.alpha)+P.x*sin(geo.alpha); Pfinal.z =P.z; + Pfinalu0.x=Pu0.x*cos(geo.alpha)-Pu0.y*sin(geo.alpha); Pfinalu0.y=Pu0.y*cos(geo.alpha)+Pu0.x*sin(geo.alpha); Pfinalu0.z=Pu0.z; + Pfinalv0.x=Pv0.x*cos(geo.alpha)-Pv0.y*sin(geo.alpha); Pfinalv0.y=Pv0.y*cos(geo.alpha)+Pv0.x*sin(geo.alpha); Pfinalv0.z=Pv0.z; + + Point3D S2; + S2.x=S.x*cos(geo.alpha)-S.y*sin(geo.alpha); + S2.y=S.y*cos(geo.alpha)+S.x*sin(geo.alpha); + S2.z=S.z; + + //2: Offset image (instead of offseting image, -offset everything else) + + Pfinal.x =Pfinal.x-geo.offOrigX[i]; Pfinal.y =Pfinal.y-geo.offOrigY[i]; Pfinal.z =Pfinal.z-geo.offOrigZ[i]; + Pfinalu0.x=Pfinalu0.x-geo.offOrigX[i]; Pfinalu0.y=Pfinalu0.y-geo.offOrigY[i]; Pfinalu0.z=Pfinalu0.z-geo.offOrigZ[i]; + Pfinalv0.x=Pfinalv0.x-geo.offOrigX[i]; Pfinalv0.y=Pfinalv0.y-geo.offOrigY[i]; Pfinalv0.z=Pfinalv0.z-geo.offOrigZ[i]; + S2.x=S2.x-geo.offOrigX[i]; S2.y=S2.y-geo.offOrigY[i]; S2.z=S2.z-geo.offOrigZ[i]; + + // As we want the (0,0,0) to be in a corner of the image, we need to translate everything (after rotation); + Pfinal.x =Pfinal.x+geo.sVoxelX/2; Pfinal.y =Pfinal.y+geo.sVoxelY/2; Pfinal.z =Pfinal.z +geo.sVoxelZ/2; + Pfinalu0.x=Pfinalu0.x+geo.sVoxelX/2; Pfinalu0.y=Pfinalu0.y+geo.sVoxelY/2; Pfinalu0.z=Pfinalu0.z+geo.sVoxelZ/2; + Pfinalv0.x=Pfinalv0.x+geo.sVoxelX/2; Pfinalv0.y=Pfinalv0.y+geo.sVoxelY/2; Pfinalv0.z=Pfinalv0.z+geo.sVoxelZ/2; + S2.x =S2.x+geo.sVoxelX/2; S2.y =S2.y+geo.sVoxelY/2; S2.z =S2.z +geo.sVoxelZ/2; + + //4. 
Scale everything so dVoxel==1 + Pfinal.x =Pfinal.x/geo.dVoxelX; Pfinal.y =Pfinal.y/geo.dVoxelY; Pfinal.z =Pfinal.z/geo.dVoxelZ; + Pfinalu0.x=Pfinalu0.x/geo.dVoxelX; Pfinalu0.y=Pfinalu0.y/geo.dVoxelY; Pfinalu0.z=Pfinalu0.z/geo.dVoxelZ; + Pfinalv0.x=Pfinalv0.x/geo.dVoxelX; Pfinalv0.y=Pfinalv0.y/geo.dVoxelY; Pfinalv0.z=Pfinalv0.z/geo.dVoxelZ; + S2.x =S2.x/geo.dVoxelX; S2.y =S2.y/geo.dVoxelY; S2.z =S2.z/geo.dVoxelZ; + + + + //5. apply COR. Wherever everything was, now its offesetd by a bit + float CORx, CORy; + CORx=-geo.COR[i]*sin(geo.alpha)/geo.dVoxelX; + CORy= geo.COR[i]*cos(geo.alpha)/geo.dVoxelY; + Pfinal.x+=CORx; Pfinal.y+=CORy; + Pfinalu0.x+=CORx; Pfinalu0.y+=CORy; + Pfinalv0.x+=CORx; Pfinalv0.y+=CORy; + S2.x+=CORx; S2.y+=CORy; + + // return + + *uvorigin=Pfinal; + + deltaU->x=Pfinalu0.x-Pfinal.x; + deltaU->y=Pfinalu0.y-Pfinal.y; + deltaU->z=Pfinalu0.z-Pfinal.z; + + deltaV->x=Pfinalv0.x-Pfinal.x; + deltaV->y=Pfinalv0.y-Pfinal.y; + deltaV->z=Pfinalv0.z-Pfinal.z; + + *source=S2; +} +void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; + + + const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + + //cudaArray Descriptor + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + + + cudaMemcpy3DParms copyParams = {0}; + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[0]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params,stream[1]); + + + //Array creation End + + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[0]; + cudaTextureDesc texDescr; + 
memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModePoint; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + +} + +#ifndef PROJECTION_HPP + +float maxDistanceCubeXY(Geometry geo, float alpha,int i){ + /////////// + // Compute initial "t" so we access safely as less as out of bounds as possible. + ////////// + + + float maxCubX,maxCubY; + // Forgetting Z, compute max distance: diagonal+offset + maxCubX=(geo.sVoxelX/2+ abs(geo.offOrigX[i]))/geo.dVoxelX; + maxCubY=(geo.sVoxelY/2+ abs(geo.offOrigY[i]))/geo.dVoxelY; + + return geo.DSO[i]/geo.dVoxelX-sqrt(maxCubX*maxCubX+maxCubY*maxCubY); + +} + +#endif diff --git a/Common/CUDA/Siddon_projection_parallel.hpp.prehip b/Common/CUDA/Siddon_projection_parallel.hpp.prehip new file mode 100644 index 00000000..c9c6fc77 --- /dev/null +++ b/Common/CUDA/Siddon_projection_parallel.hpp.prehip @@ -0,0 +1,65 @@ +/*------------------------------------------------------------------------- + * + * Header CUDA functions for ray-voxel intersection based projection + * + * + * CODE by Ander Biguri + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + + +#include "ray_interpolated_projection.hpp" +#include "types_TIGRE.hpp" +#include "GpuIds.hpp" + +#ifndef PROJECTION_PARALLEL_HPP_SIDDON +#define PROJECTION_PARALLEL_HPP_SIDDON +int siddon_ray_projection_parallel(float * img, Geometry geo, float** result,float const * const alphas,int nalpha, const GpuIds& gpuids); + +//double computeMaxLength(Geometry geo, double alpha); +void computeDeltas_Siddon_parallel(Geometry geo, float alpha,int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source); + +//double maxDistanceCubeXY(Geometry geo, double alpha,int i); + +// below, not used +//Geometry nomralizeGeometryImage(Geometry geo); +#endif \ No newline at end of file diff --git a/Common/CUDA/TIGRE_common.cpp.prehip b/Common/CUDA/TIGRE_common.cpp.prehip new file mode 100644 index 00000000..cf98e4b9 --- /dev/null +++ b/Common/CUDA/TIGRE_common.cpp.prehip @@ -0,0 +1,20 @@ +#if defined(IS_FOR_PYTIGRE) +#include +#include +#include +#include "TIGRE_common.hpp" +void mexPrintf(const char* format, ...) 
{ + PRINT_HERE(""); + va_list argpointer; + va_start(argpointer, format); + vprintf(format, argpointer); + va_end(argpointer); +} +void mexErrMsgIdAndTxt(const char* pcTag, const char* pcMsg) { + PRINT_HERE("%s %s\n", pcTag, pcMsg); + exit(1); +} +void mexWarnMsgIdAndTxt(const char* pcTag, const char* pcMsg) { + PRINT_HERE("%s %s\n", pcTag, pcMsg); +} +#endif // IS_FOR_PYTIGRE diff --git a/Common/CUDA/TIGRE_common.hpp.prehip b/Common/CUDA/TIGRE_common.hpp.prehip new file mode 100644 index 00000000..faf8d7ab --- /dev/null +++ b/Common/CUDA/TIGRE_common.hpp.prehip @@ -0,0 +1,24 @@ +#ifndef _COMMON_HPP_20201017_ +#define _COMMON_HPP_20201017_ + +#define STRINGIFY(n) #n +#define TOSTRING(n) STRINGIFY(n) +#define __HERE__ __FILE__ " (" TOSTRING(__LINE__) "): " +#define PRINT_HERE printf(__HERE__);printf +// #define PRINT_HERE (void*)0 + +#if defined(IS_FOR_PYTIGRE) +#ifndef IS_FOR_MATLAB_TIGRE + #define IS_FOR_MATLAB_TIGRE 0 +#endif // IS_FOR_MATLAB_TIGRE +void mexPrintf(const char*, ...); +void mexErrMsgIdAndTxt(const char* pcTag, const char* pcMsg); +void mexWarnMsgIdAndTxt(const char* pcTag, const char* pcMsg); +#else +#ifndef IS_FOR_MATLAB_TIGRE + #define IS_FOR_MATLAB_TIGRE 1 +#endif // IS_FOR_MATLAB_TIGRE +#include "mex.h" +#include "tmwtypes.h" +#endif // IS_TIGRE_FOR_PYTHON +#endif // _COMMON_HPP_20201017_ diff --git a/Common/CUDA/errors.hpp b/Common/CUDA/errors.hpp index 05518b20..16bece09 100644 --- a/Common/CUDA/errors.hpp +++ b/Common/CUDA/errors.hpp @@ -1,4 +1,4 @@ -#define CUDA_SUCCESS 0 +#define CUDA_SUCCESS 0 #define ERR_CUDA 1 #define ERR_NO_CAPABLE_DEVICES 2 diff --git a/Common/CUDA/errors.hpp.prehip b/Common/CUDA/errors.hpp.prehip new file mode 100644 index 00000000..05518b20 --- /dev/null +++ b/Common/CUDA/errors.hpp.prehip @@ -0,0 +1,10 @@ +#define CUDA_SUCCESS 0 +#define ERR_CUDA 1 + +#define ERR_NO_CAPABLE_DEVICES 2 +#define ERR_NO_FREE_DEVICES 3 +#define ERR_BAD_ASSERT 4 +#define ERR_ASSERT_FAIL 5 + + + diff --git a/Common/CUDA/gpuUtils.cu 
b/Common/CUDA/gpuUtils.cu index 8f2754e4..910b7a58 100644 --- a/Common/CUDA/gpuUtils.cu +++ b/Common/CUDA/gpuUtils.cu @@ -1,7 +1,7 @@ #include "gpuUtils.hpp" -#include -#include +#include +#include #include #include @@ -34,11 +34,11 @@ int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, cha return iCudaDeviceCount; } - cudaError_t err; - cudaDeviceProp propDevice; + hipError_t err; + hipDeviceProp_t propDevice; int nMatch = 0; for (int iId = 0; iId < iCudaDeviceCount; ++iId) { - err = cudaGetDeviceProperties(&propDevice, iId); + err = hipGetDeviceProperties(&propDevice, iId); iMessagePos += sprintf(pcMessage + iMessagePos, "propDevice.name = %s\n", propDevice.name); if (strcmp(propDevice.name, kacGPUName) == 0) { piDeviceIds[nMatch] = iId; @@ -55,16 +55,16 @@ int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, cha void GetGpuName(int iDeviceId, char* pcName) { memset(pcName, 0, 128); - cudaError_t err; - cudaDeviceProp propDevice; + hipError_t err; + hipDeviceProp_t propDevice; int id = iDeviceId; - err = cudaGetDeviceProperties(&propDevice, id); + err = hipGetDeviceProperties(&propDevice, id); memcpy(pcName, propDevice.name, strlen(propDevice.name)*sizeof(char)); } int GetGpuCount() { int iCudaDeviceCount = 0; - cudaGetDeviceCount(&iCudaDeviceCount); + hipGetDeviceCount(&iCudaDeviceCount); return iCudaDeviceCount; } diff --git a/Common/CUDA/gpuUtils.cu.prehip b/Common/CUDA/gpuUtils.cu.prehip new file mode 100644 index 00000000..8f2754e4 --- /dev/null +++ b/Common/CUDA/gpuUtils.cu.prehip @@ -0,0 +1,70 @@ + +#include "gpuUtils.hpp" +#include +#include +#include +#include + +int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, char* pcMessage) { + if (pcMessage) { + for (int iI = 0; iI < 65535; ++iI) { + pcMessage[iI] = '\0'; + } + } + if (piDeviceIds == 0 || iIdCountMax == 0) { + return 0; + } + int iMessagePos = 0; + // Count installed GPUs. 
+ int iCudaDeviceCount = GetGpuCount(); + iMessagePos += sprintf(pcMessage + iMessagePos, "Found GPUs: %d\n", iCudaDeviceCount); + if (iCudaDeviceCount == 0) { + // printf("No GPU found\n"); + return 0; + } + + iCudaDeviceCount = min(iCudaDeviceCount, iIdCountMax); + iMessagePos += sprintf(pcMessage + iMessagePos, "Max GPUs: %d\n", iCudaDeviceCount); + if (strlen(kacGPUName) == 0) { + // Semi-compatible mode: + // Return all GPUs + for (int iI = 0; iI < iCudaDeviceCount; ++iI) { + piDeviceIds[iI] = iI; + } + return iCudaDeviceCount; + } + + cudaError_t err; + cudaDeviceProp propDevice; + int nMatch = 0; + for (int iId = 0; iId < iCudaDeviceCount; ++iId) { + err = cudaGetDeviceProperties(&propDevice, iId); + iMessagePos += sprintf(pcMessage + iMessagePos, "propDevice.name = %s\n", propDevice.name); + if (strcmp(propDevice.name, kacGPUName) == 0) { + piDeviceIds[nMatch] = iId; + ++nMatch; + } + } + + for (int iI = 0; iI < nMatch; ++iI) { + iMessagePos += sprintf(pcMessage + iMessagePos, "%d, ", piDeviceIds[iI]); + } + return nMatch; + +} + +void GetGpuName(int iDeviceId, char* pcName) { + memset(pcName, 0, 128); + cudaError_t err; + cudaDeviceProp propDevice; + int id = iDeviceId; + err = cudaGetDeviceProperties(&propDevice, id); + memcpy(pcName, propDevice.name, strlen(propDevice.name)*sizeof(char)); +} + + +int GetGpuCount() { + int iCudaDeviceCount = 0; + cudaGetDeviceCount(&iCudaDeviceCount); + return iCudaDeviceCount; +} diff --git a/Common/CUDA/gpuUtils.hpp.prehip b/Common/CUDA/gpuUtils.hpp.prehip new file mode 100644 index 00000000..38b518cf --- /dev/null +++ b/Common/CUDA/gpuUtils.hpp.prehip @@ -0,0 +1,18 @@ + +#ifndef GPUUTILS_HPP +#define GPUUTILS_HPP +//! @brief # of installed GPUs +int GetGpuCount(); + +//! @brief IDs of GPUs whose name is kacGPUName. +//! @note Call GetGpuCount and allocate sufficient memory for piDeviceIds. +//! @param [in] kacGPUName +//! @param [in, out] piDeviceIds. +//! @param [in] iIdCountMax. 
Return value of GetGpuCount() +int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, char* pcMessage); + +//! @brief GPU name of index iDeviceId. Allocate 128bytes for pcName before call. +void GetGpuName(int iDeviceId, char* pcName); + +#endif // GPUUTILS_HPP + diff --git a/Common/CUDA/improvedForwardProjections.cu b/Common/CUDA/improvedForwardProjections.cu index 0f32be72..7c5fbddd 100644 --- a/Common/CUDA/improvedForwardProjections.cu +++ b/Common/CUDA/improvedForwardProjections.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * CUDA function for optimized proton CT radiographies * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) @@ -20,19 +21,19 @@ Coded by: Stefanie Kaser, Benjamin Kirchmayer --------------------------------------------------------------------------*/ -#include +#include #include "mex.h" -#include +#include #include "improvedForwardProjections.hpp" #include #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",hipGetErrorString(__err));\ } \ } while (0) @@ -937,43 +938,43 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po } //Allocate Memory on GPU - cudaMalloc( (void**) &dPosIn, sizeInputs ); - cudaMalloc( (void**) &dPosOut, sizeInputs ); - cudaMalloc( (void**) &ddirIn, sizeInputs ); - cudaMalloc( (void**) &ddirOut, sizeInputs ); - cudaMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); - cudaMalloc( (void**) &dhist1, detectorMem ); - cudaMalloc( (void**) &dhist2, detectorMem ); - cudaMalloc( (void**) &dnumEntries, sizeof(int)); - cudaMalloc( (void**) &ddetectorX, 
sizeof(int)); - cudaMalloc( (void**) &ddetectorY, sizeof(int)); - cudaMalloc( (void**) &dpixelSize, 2*sizeof(float)); - cudaMalloc( (void**) &dDetectDistIn, sizeof(float)); - cudaMalloc( (void**) &dDetectDistOut, sizeof(float)); - cudaMalloc( (void**) &dEin, sizeof(float)); - cudaMalloc( (void**) &dReject, sizeof(float)); - cudaMalloc( (void**) &dHull, 5*sizeof(float)); - cudaError_t _err_alloc = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err_alloc)); + hipMalloc( (void**) &dPosIn, sizeInputs ); + hipMalloc( (void**) &dPosOut, sizeInputs ); + hipMalloc( (void**) &ddirIn, sizeInputs ); + hipMalloc( (void**) &ddirOut, sizeInputs ); + hipMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); + hipMalloc( (void**) &dhist1, detectorMem ); + hipMalloc( (void**) &dhist2, detectorMem ); + hipMalloc( (void**) &dnumEntries, sizeof(int)); + hipMalloc( (void**) &ddetectorX, sizeof(int)); + hipMalloc( (void**) &ddetectorY, sizeof(int)); + hipMalloc( (void**) &dpixelSize, 2*sizeof(float)); + hipMalloc( (void**) &dDetectDistIn, sizeof(float)); + hipMalloc( (void**) &dDetectDistOut, sizeof(float)); + hipMalloc( (void**) &dEin, sizeof(float)); + hipMalloc( (void**) &dReject, sizeof(float)); + hipMalloc( (void**) &dHull, 5*sizeof(float)); + hipError_t _err_alloc = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err_alloc)); cudaCheckErrors("GPU Allocation failed!"); //Copy Arrays to GPU - cudaMemcpy(dPosIn, posIn,sizeInputs ,cudaMemcpyHostToDevice); - cudaMemcpy(dPosOut, posOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirIn, dirIn,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirOut, dirOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dnumEntries, &numOfEntries,sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorX, &detectSizeX, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorY, &detectSizeY, sizeof(int), cudaMemcpyHostToDevice); - 
cudaMemcpy(dpixelSize, pixelSize, 2*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dEin, &ein, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dReject, &reject, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dHull, ch_param, 5*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dhist1, hist1, detectorMem, cudaMemcpyHostToDevice); - cudaMemcpy(dhist2, hist2, detectorMem, cudaMemcpyHostToDevice); + hipMemcpy(dPosIn, posIn,sizeInputs ,hipMemcpyHostToDevice); + hipMemcpy(dPosOut, posOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirIn, dirIn,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirOut, dirOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dnumEntries, &numOfEntries,sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorX, &detectSizeX, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorY, &detectSizeY, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(dpixelSize, pixelSize, 2*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dEin, &ein, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dReject, &reject, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dHull, ch_param, 5*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dhist1, hist1, detectorMem, hipMemcpyHostToDevice); + hipMemcpy(dhist2, hist2, detectorMem, hipMemcpyHostToDevice); cudaCheckErrors("Host to device transport failed!"); @@ -984,8 +985,8 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po ParticleKernel<<>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ dpixelSize, 
dDetectDistIn, dDetectDistOut, dEin, dHull, dReject); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); + hipError_t _err = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err)); cudaCheckErrors("Kernel fail!"); //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); @@ -993,12 +994,12 @@ __host__ void ParticleProjections(float * outProjection, float* posIn, float* po //sumHist<<>>(dhist1, dhist2); //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + //hipMemcpy(outProjection, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist1, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist2, dhist2,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(&reject, dReject,sizeof(float) ,hipMemcpyDeviceToHost); + //hipError_t _errcp = hipGetLastError(); + //mexPrintf("%s \n", hipGetErrorString(_errcp)); cudaCheckErrors("Device to host transport failed!"); for(int j = 0; j +#include "mex.h" +#include +#include "improvedForwardProjections.hpp" +#include +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + } \ +} while (0) + + +__device__ int SolvePolynomial(float*x, float a, float b, float c){ + // Calculates real roots of a third-order polynomial function using Vieta's method and Cardano's method + // We obtain a polynomial of the form x³ + ax² + bx + c = 0 and reduce it to z³+pz+q = 0 + // Herefore, we have to make a substitution: x = z - a/3 + float p = b - a*a / 3.0; + float q = 
2*a*a*a/27.0 - a*b / 3.0 + c; + float disc = q*q/4.0 + p*p*p/27.0; + if(disc > 0){ + float u = cbrt(-0.5*q + sqrt(disc)); + float v = cbrt(-0.5*q - sqrt(disc)); + x[0] = u + v - a/3.0; // don't forget to substitute back z --> x + return 1; + } + else if(disc == 0 && p == 0){ + x[0] = -a/3.0; // don't forget to substitute back z --> x + return 1; + } + else if(disc == 0 && p != 0){ + x[0] = 3.0*q/p - a/3.0; // don't forget to substitute back z --> x + x[1] = -3.0*q/(2.0*p) - a/3.0; + return 2; + } + else{ + x[0] = -sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p))) + pi/3.0) - a/3.0; // don't forget to substitute back z --> x + x[1] = sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p)))) - a/3.0; + x[2] = -sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p))) - pi/3.0) - a/3.0; + return 3; + } +} + +__device__ float cspline(float t, float a, float b, float c, float d){ + + return a*(t*t*t) + b*(t*t) + c*t +d; + +} + +__device__ void SimpleSort(float* arr, int size_arr){ + // Insertion sorting method + float curr_elem; + int j; + + for (int i=1; i=0 && curr_elem0){ + + float z_1 = -p/2.0 + sqrt(disc); + float z_2 = -p/2.0 - sqrt(disc); + float z_solve; + + if(in_or_out == 1){ + z_solve = min(z_1, z_2); + } + else { + z_solve = max(z_1, z_2); + } + + float x_solve = kx*z_solve + dx; + + float ky = direction[1]; + float dy = position[1] - ky*detOff; + float y_solve = ky*z_solve + dy; + + if(-h/2 <= y_solve && y_solve <= h/2){ + + HullIntercept[0] = x_solve; + HullIntercept[1] = y_solve; + HullIntercept[2] = z_solve; + + return 0; + } + else{ + float z1_h = (1.0/ky) * (0.5*h-dy); + float z2_h = (1.0/ky) * (-0.5*h-dy); + + if(in_or_out == 1){ + z_solve = min(z1_h, z2_h); + if(dy > 0){y_solve = -h*0.5;} + else{y_solve = h*0.5;} + x_solve = kx*z_solve + dx; + } + else { + z_solve = max(z1_h, z2_h); + if(dy < 0){y_solve = -h*0.5;} + else{y_solve = h*0.5;} + x_solve = kx*z_solve + dx; + } + + if(min(z_1, z_2) <= z_solve && z_solve 
<= max(z_1, z_2)){ + + HullIntercept[0] = x_solve; + HullIntercept[1] = y_solve; + HullIntercept[2] = z_solve; + + return 0; + } + + else{return 1;}} + } +else{return 1;} +} + + +__device__ int MinMax(float* solutions, float a, float b, float c){ + float p = 2*b/(3*a); + float q = c / (3*a); + float disc = 0.25*p*p - q; + if (disc > 0){ + solutions[0] = -0.5*p + sqrt(disc); + solutions[1] = -0.5*p - sqrt(disc); + return 0; + } + solutions[0] = -1; + solutions[1] = -1; + return 1; +} + + +__device__ int calcInterceptsLinear(float* LinInterceptsVec, float* start, float* stop, float* direction, float* pix, int maxIntercep, bool* protFlag){ + float boundary; + int counter = 0; + int nx, ny; + nx = int(abs(stop[0] - start[0])/pix[0]); + ny = int(abs(stop[1] - start[1])/pix[1]); + if(nx+ny>=maxIntercep){ + *protFlag = false; + return 1;} + + if (int(stop[0]/pix[0]) == int(start[0]/pix[0]) && int(stop[1]/pix[1]) == int(start[1]/pix[1])) { + *protFlag = true; + return 0; + } + + if (int(stop[0]/pix[0]) != int(start[0]/pix[0])) { + float k = direction[0]; + float d = start[0] - k*start[2]; + boundary = trunc( ((stop[0] > start[0]) ? stop[0]:start[0])/pix[0])*pix[0]; + + for (int ix=0; ix start[2] && intercept < stop[2]){ + LinInterceptsVec[ix] = intercept; + counter++; + if (counter >= maxIntercep){ + *protFlag = false; + return counter;} + } + } + } + + if (int(stop[1]/pix[1]) != int(start[1]/pix[1])) { + float k = direction[1]; + float d = start[1] - k*start[2]; + boundary = trunc( ((stop[1] > start[1]) ? stop[1]:start[1])/pix[1])*pix[1]; + for (int iy=nx; iy start[2] && intercept < stop[2]){ + LinInterceptsVec[iy] = intercept; + counter++; + if(counter >= maxIntercep){ + *protFlag = false; + return counter;} + } + } + } + int diff = maxIntercep - counter; + for(int j = 0; j 0){ + float cand = a[0] * solutions[0]*solutions[0]*solutions[0] + b[0] * solutions[0]*solutions[0] + c[0] * solutions[0] + d[0]; + if (cand > d[0] && cand > pos1[0]){ + (oneX > zeroX) ? 
oneX:zeroX=cand; + } + else if(cand < d[0] && cand < pos1[0]){ + (oneX < zeroX) ? oneX:zeroX=cand; + } + } + + if (solutions[1] < 1 && solutions[1] > 0){ + float cand = a[0] * solutions[1]*solutions[1]*solutions[1] + b[0] * solutions[1]*solutions[1] + c[0] * solutions[1] + d[0]; + if (cand > oneX && cand > zeroX){ + (oneX > zeroX) ? oneX:zeroX=cand; + } + else if(cand < oneX && cand < zeroX){ + (oneX < zeroX) ? oneX:zeroX=cand; + } + } + } + + + test = MinMax(solutions, a[1], b[1], c[1]); + if (test == 0){ + if (solutions[0] < 1 && solutions[0] > 0){ + float cand = a[1] * solutions[0]*solutions[0]*solutions[0] + b[1] * solutions[0]*solutions[0] + c[1] * solutions[0] + d[1]; + if (cand > d[1] && cand > pos1[1]){ + (oneY > zeroY) ? oneY:zeroY=cand; + } + else if(cand < d[1] && cand < pos1[1]){ + (oneY < zeroY) ? oneY:zeroY=cand; + } + } + + if (solutions[1] < 1 && solutions[1] > 0){ + float cand = a[1] * solutions[1]*solutions[1]*solutions[1] + b[1] * solutions[1]*solutions[1] + c[1] * solutions[1] + d[1]; + if (cand > oneY && cand > zeroY){ + (oneY > zeroY) ? oneY:zeroY=cand; + } + else if(cand < oneY && cand < zeroY){ + (oneY < zeroY) ? oneY:zeroY=cand; + } + } + } + + nx = int(abs(oneX - zeroX) / pixelSize[0]); + ny = int(abs(oneY - zeroY) / pixelSize[1]); + if (nx + ny == 0) { + *protFlag = true; + return 0; + } + + if ((nx + ny) <= maxIntercep){ + + if (int(oneX/pixelSize[0]) != int(zeroX/pixelSize[0])) { + boundary = trunc( ((oneX > zeroX) ? oneX:zeroX)/pixelSize[0])*pixelSize[0]; + for (int ix=0; ix 0. ){ + if (counter >=maxIntercep){break;} + InterceptsVec[counter] = IntercepX[kx]; + counter++; + } + }//kx + if (counter >=maxIntercep){break;} + } + } + + if ( int(oneY/pixelSize[1]) != int(zeroY/pixelSize[1])) { + boundary = trunc( ((oneY > zeroY) ? oneY:zeroY)/pixelSize[1])*pixelSize[1]; + for (int iy=0; iy 0.) 
){ + if (counter >=maxIntercep){break;} + InterceptsVec[counter] = IntercepY[ky]; + counter++; + } + }//ky + if (counter >=maxIntercep){break;} + } + } + + if (counter >= maxIntercep){ // || counter == 0){ + *protFlag = false; + return counter; + }else{ + + + int diff = maxIntercep - counter; + for(int j = 0; j this is too slow! 7 s instead of 1.5 s + tInterceptsVec = new float[customsize]; + delete[] tInterceptsVec;*/ + /*float *ptr; ---> this is too slow! 7.3s instead of 1.5 s + ptr = (float*) malloc(customsize * sizeof(float)); + free(ptr);*/ + + unsigned int protonIndex = blockIdx.x*blockDim.x + threadIdx.x; + float dimX, dimY, lk, lenX, lenY; + float lenZ = abs(*detectDistIn) + abs(*detectDistOut); + dimX = (float) *detectSizeX; + dimY = (float) *detectSizeY; + + //Dereference input parameters + int entries, dSizeX, dSizeY; + // float pix; + + entries = *numOfEntries; + dSizeX = *detectSizeX; + dSizeY = *detectSizeY; + // pix = *pixelSize; + + + if(hull[3] == 0){ + lenX = sqrt((devicePosOut[protonIndex] - devicePosIn[protonIndex]) * (devicePosOut[protonIndex] - devicePosIn[protonIndex]) \ + + lenZ*lenZ); + lenY = sqrt((devicePosOut[protonIndex + entries] - devicePosIn[protonIndex + entries]) * (devicePosOut[protonIndex + entries] - devicePosIn[protonIndex + entries]) \ + + lenZ*lenZ); + + float lambda0, lambda1, ref_wepl; + ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); + lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl) * (p_wepl[protonIndex]/ref_wepl); + lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl) * (p_wepl[protonIndex]/ref_wepl); + + float a[2], b[2], c[2], d[2], pos1[2]; + + //Allocate memory for all pointers + // Calculate optimized xdir_in + devicedirIn[protonIndex] = devicedirIn[protonIndex] \ + / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! 
+ devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; + + // Calculate optimized ydir_in + devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ + / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! + devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; + + // Calculate optimized xdir_out + devicedirOut[protonIndex] = devicedirOut[protonIndex] \ + / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! + devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; + + // Calculate optimized ydir_out + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ + / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; + + // Calculate spline parameters + a[0] = devicePosIn[protonIndex]*2. + devicedirIn[protonIndex] - 2.*devicePosOut[protonIndex] + devicedirOut[protonIndex]; + a[1] = devicePosIn[protonIndex + entries]*2. 
+ devicedirIn[protonIndex + entries] - \ + 2.*devicePosOut[protonIndex + entries] + devicedirOut[protonIndex + entries]; + + b[0] = -3.*devicePosIn[protonIndex] -2.*devicedirIn[protonIndex] + 3.*devicePosOut[protonIndex] - devicedirOut[protonIndex]; + b[1] = -3.*devicePosIn[protonIndex + entries] -2.* devicedirIn[protonIndex + entries] \ + + 3.*devicePosOut[protonIndex + entries] - devicedirOut[protonIndex + entries]; + + c[0] = devicedirIn[protonIndex]; + c[1] = devicedirIn[protonIndex + entries]; + + d[0] = devicePosIn[protonIndex]; + d[1] = devicePosIn[protonIndex + entries]; + + pos1[0] = devicePosOut[protonIndex]; + pos1[1] = devicePosOut[protonIndex + entries]; + + /* --------------------------------------------------------------------------------- */ + /* ------------------------ Start without Hull (CS only) -------------------------- */ + /* --------------------------------------------------------------------------------- */ + int count; + bool status = false; + float InterceptsVec[vecSizeCS] = {0}; + + count = calcIntercepts(InterceptsVec, a, b, c, d, pos1, pix, &status, vecSizeCS); + + if (status) { + int indX, indY, linInd; + float tOld = 0.0; + if (count==0){ + indX = int(pos1[0]/pix[0]+dimX/2.); // REPLACE: pos1 by pos0 + indY = int(pos1[1]/pix[1]+dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], 1.0f); + } + + } + else{ + for(int i= 0; i<=count; i++){ + lk = (InterceptsVec[i]- tOld)*lenZ; + if(tOld == 0){ + indX = int(d[0]/pix[0] +dimX/2); + indY = int(d[1]/pix[1] +dimY/2); + linInd = indY + indX*(dSizeY); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + + }else if(i == count){ + lk = lenZ - 
InterceptsVec[i-1]*lenZ; + indX = int(pos1[0]/pix[0] +dimX/2); + indY = int(pos1[1]/pix[1] +dimY/2); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); + } + + }else{ + indX = int(cspline(InterceptsVec[i] - eps, a[0], b[0], c[0], d[0])/pix[0] +dimX/2); + indY = int(cspline(InterceptsVec[i] - eps, a[1], b[1], c[1], d[1])/pix[1] +dimY/2); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + } + + }//i + }//if - Intercepts + } + else{ + atomicAdd(reject, 1.0); + } +/* ------------------------ End no Hull calculation (CS only) -------------------------- */ + } + +else{ + // WEIGHTING FACTORS FOR CHANNELS I + float weight_air_in = 0.00479; + float weight_air_out = 0.00479; + + float HullIn[3], HullOut[3], initpos[3], exitpos[3]; + float initdir[2], exitdir[2]; + + initpos[0] = devicePosIn[protonIndex]; + initpos[1] = devicePosIn[protonIndex + entries]; + initpos[2] = *detectDistIn; + + exitpos[0] = devicePosOut[protonIndex]; + exitpos[1] = devicePosOut[protonIndex + entries]; + exitpos[2] = *detectDistOut; + + initdir[0] = devicedirIn[protonIndex]; + initdir[1] = devicedirIn[protonIndex + entries]; + + exitdir[0] = devicedirOut[protonIndex]; + exitdir[1] = devicedirOut[protonIndex + entries]; + + int check = hullEntryExit(HullIn, initpos, initdir, 1, hull, *detectDistIn); + + if(check == 0){ + check = hullEntryExit(HullOut, exitpos, exitdir, 0, hull, *detectDistOut); + } + + if(check == 0 && HullOut[2] > HullIn[2]){ + /* --------------------------------------------------------------------------------- */ + /* ------------------------ Start with Hull + SL outside -------------------------- 
*/ + /* --------------------------------------------------------------------------------- */ + const int hullIntercep = int(vecSizeCS); + const int airIntercepIn = int(vecSizeIn); + const int airIntercepOut = int(vecSizeOut); + bool status1 = false; + bool status2 = false; + bool status3 = false; + + int countIn, countHull, countOut; + float InterceptsVecOut[airIntercepOut] = {0}; + float InterceptsVecIn[airIntercepIn] = {0}; + float InterceptsVecHull[hullIntercep] = {0}; + lenX = sqrt((HullOut[0] - HullIn[0])*(HullOut[0] - HullIn[0]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); + lenY = sqrt((HullOut[1] - HullIn[1])*(HullOut[1] - HullIn[1]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); + + countIn = calcInterceptsLinear(InterceptsVecIn, initpos, HullIn, initdir, pix, airIntercepIn, &status1); + countOut = calcInterceptsLinear(InterceptsVecOut, HullOut, exitpos, exitdir, pix, airIntercepOut, &status2); + + /* ------------ CUBIC SPLINE PREPARATIONS ---------------- */ + float lambda0, lambda1, ref_wepl; + ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); + lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); + lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); + + float a[2], b[2], c[2], d[2], pos1[2]; + + //Allocate memory for all pointers + // Calculate optimized xdir_in + devicedirIn[protonIndex] = devicedirIn[protonIndex] \ + / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! + devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; + + // Calculate optimized ydir_in + devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ + / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! 
+ devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; + + // Calculate optimized xdir_out + devicedirOut[protonIndex] = devicedirOut[protonIndex] \ + / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! + devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; + + // Calculate optimized ydir_out + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ + / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; + + // Calculate spline parameters + a[0] = HullIn[0]*2. + devicedirIn[protonIndex] - 2.*HullOut[0] + devicedirOut[protonIndex]; + a[1] = HullIn[1]*2. + devicedirIn[protonIndex + entries] - \ + 2.*HullOut[1] + devicedirOut[protonIndex + entries]; + + b[0] = -3.*HullIn[0] -2.*devicedirIn[protonIndex] + 3.*HullOut[0] - devicedirOut[protonIndex]; + b[1] = -3.*HullIn[1] -2.* devicedirIn[protonIndex + entries] \ + + 3.*HullOut[1] - devicedirOut[protonIndex + entries]; + + c[0] = devicedirIn[protonIndex]; + c[1] = devicedirIn[protonIndex + entries]; + + d[0] = HullIn[0]; + d[1] = HullIn[1]; + + pos1[0] = HullOut[0]; + pos1[1] = HullOut[1]; + + countHull = calcIntercepts(InterceptsVecHull, a, b, c, d, pos1, pix, &status3, hullIntercep); + /* -------------------- End CS Preparations! 
-------------- */ + + if(status1 && status2 && status3){ + float tOld = initpos[2]; + int indX, indY, linInd; + + // WEIGHTING FACTORS FOR CHANNELS II + float weight_water = 1; // p_wepl[protonIndex]/(len_b*weight_air_in); + + // ---------------------------------------- Start with SL from detector to hull + if (countIn == 0){ + indX = int(initpos[0]/pix[0] + dimX/2.); + indY = int(initpos[1]/pix[1] + dimY/2.); + lk = HullIn[2] - initpos[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + for(int i= 0; i<=countIn; i++){ + lk = InterceptsVecIn[i] - tOld; + if(i == 0){ + indX = int(initpos[0]/pix[0] + dimX/2.); + indY = int(initpos[1]/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecIn[i]; + } + } + else if(i == countIn){ + lk = HullIn[2] - InterceptsVecIn[i-1]; + indX = int(HullIn[0]/pix[0] + dimX/2.); + indY = int(HullIn[1]/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + indX = int(((initdir[0]*(InterceptsVecIn[i]-eps) + (initpos[0] - initdir[0] * initpos[2])))/pix[0] + dimX/2.); + indY = int(((initdir[1]*(InterceptsVecIn[i]-eps) + (initpos[1] - initdir[1] * initpos[2])))/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < 
lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecIn[i]; + } + } + } + } // end else + // --------------------------- CS within hull + + tOld = 0.0; + if (countHull==0){ + indX = int(HullIn[0]/pix[0] + dimX/2.); + indY = int(HullIn[1]/pix[1] + dimY/2.); + lk = HullOut[2] - HullIn[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + + } else{ + for(int i= 0; i<=countHull; i++){ + lk = (InterceptsVecHull[i] - tOld)*(HullOut[2] - HullIn[2]); + if(tOld == 0){ + indX = int(d[0]/pix[0] + dimX/2.); + indY = int(d[1]/pix[1] + dimY/2.); + linInd = indY + indX*(dSizeY); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVecHull[i]; + + }else if(i == countHull){ + lk = (HullOut[2] - HullIn[2]) - InterceptsVecHull[i-1]*(HullOut[2] - HullIn[2]); + indX = int(pos1[0]/pix[0] + dimX/2.); + indY = int(pos1[1]/pix[1] + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + + }else{ + indX = int(cspline(InterceptsVecHull[i] -eps, a[0], b[0], c[0], d[0])/pix[0] + dimX/2.); + indY = int(cspline(InterceptsVecHull[i] -eps, a[1], b[1], c[1], d[1])/pix[1] + dimY/2.); 
+ + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVecHull[i]; + } + + }//i + } + + // --------------------------- SL from hull to detector + tOld = HullOut[2]; + if (countOut == 0){ + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + lk = exitpos[2] - HullOut[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + for(int i= 0; i<=countOut; i++){ + lk = abs(InterceptsVecOut[i] - tOld); + if(i == 0){ + indX = int(HullOut[0]/pix[0] + dimX/2.); + indY = int(HullOut[1]/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecOut[i]; + } + } + else if(i == countOut){ + lk = exitpos[2] - InterceptsVecOut[i-1]; + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + indX = int(((exitdir[0]*(InterceptsVecOut[i]-eps) + (HullOut[0] - exitdir[0] * HullOut[2])))/pix[0] + dimX/2.); + indY = 
int(((exitdir[1]*(InterceptsVecOut[i]-eps) + (HullOut[1] - exitdir[1] * HullOut[2])))/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecOut[i]; + } + } + } + } // end else + } + else{ + atomicAdd(reject, 1.0); + } + + /* --------------------------- End Hull + SL outside ------------------------------- */ + + } + + else{ + + /* --------------------------------------------------------------------------------- */ + /* ----------------------------- Start with SL only! ------------------------------ */ + /* --------------------------------------------------------------------------------- */ + int count; + bool status = false; + float InterceptsVec[vecSizeCS] = {0}; + + float initpos[3], exitpos[3]; + float mydir[2]; + initpos[0] = devicePosIn[protonIndex]; + initpos[1] = devicePosIn[protonIndex + entries]; + initpos[2] = *detectDistIn; + exitpos[0] = devicePosOut[protonIndex]; + exitpos[1] = devicePosOut[protonIndex + entries]; + exitpos[2] = *detectDistOut; + + mydir[0] = (exitpos[0] - initpos[0])/lenZ; + mydir[1] = (exitpos[1] - initpos[1])/lenZ; // dz = 1 + count = calcInterceptsLinear(InterceptsVec, initpos, exitpos, mydir, pix, vecSizeCS, &status); + + + if (status) { + int indX, indY, linInd; + float tOld = initpos[2]; + if (count==0){ + indX = int(initpos[0]/pix[0] + dimX/2.); + indY = int(initpos[1]/pix[1] + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*1.0f); + } + + } else{ + for(int i= 0; i<=count; i++){ + lk = InterceptsVec[i] - tOld; + if(tOld == initpos[2]){ + indX = int(initpos[0]/pix[0] 
+ dimX/2.); + indY = int(initpos[1]/pix[1] + dimY/2.); + linInd = indY + indX*(dSizeY); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + + }else if(i == count){ + lk = exitpos[2] - InterceptsVec[i-1]; + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + + }else{ + indX = int(((mydir[0]*(InterceptsVec[i]-eps) + (initpos[0] - mydir[0] * (initpos[2]))))/pix[0] + dimX/2.); + indY = int(((mydir[1]*(InterceptsVec[i]-eps) + (initpos[1] - mydir[1] * (initpos[2]))))/pix[1] + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + } + + } //i + }//if - Intercepts + } + else{ + // *reject += 1; + atomicAdd(reject, 1.0); + } + /* ------------------------------ End SL only! 
------ -------------------------- */ + } + } +} + +__global__ void sumHist(float* hist, float* histNorm){ + + unsigned int index = blockIdx.x*blockDim.x + threadIdx.x; + hist[index] = hist[index]/histNorm[index]; +} + +__host__ void ParticleProjections(float * outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, \ + float* p_wepl, int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, \ + float detectDistIn, float detectDistOut, float ein, float* ch_param){ + + /* + Detect Size = 400x400 + Prepare Input for GPU*/ + + const int sizeInputs = 2*numOfEntries*sizeof(float); + const int detectorMem = detectSizeX*detectSizeY*sizeof(float); + float reject = 0.0; + + float *dPosIn, *dPosOut, *ddirIn, *ddirOut, *dhist1, *dhist2, *d_wepl, *dHull; + int *dnumEntries, *ddetectorX, *ddetectorY; + float *dpixelSize, *dDetectDistIn, *dDetectDistOut, *dEin, *dReject; + + float *hist1, *hist2; + hist1 = new float[detectSizeX*detectSizeY]; + hist2 = new float[detectSizeX*detectSizeY]; + for(int i = 0; i>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ + dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject); + cudaError_t _err = cudaGetLastError(); + mexPrintf("%s \n", cudaGetErrorString(_err)); + cudaCheckErrors("Kernel fail!"); + + //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); + //dim3 block_sum(64,1,1); + //sumHist<<>>(dhist1, dhist2); + + //Copy result from device to host + //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); + //cudaError_t _errcp = cudaGetLastError(); + //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + cudaCheckErrors("Device to host transport failed!"); + + for(int j = 0; j -#include +#include +#include #include #ifndef 
improvedForwardProjections_H #define improvedForwardProjections_H diff --git a/Common/CUDA/improvedForwardProjections.hpp.prehip b/Common/CUDA/improvedForwardProjections.hpp.prehip new file mode 100644 index 00000000..6da25b63 --- /dev/null +++ b/Common/CUDA/improvedForwardProjections.hpp.prehip @@ -0,0 +1,263 @@ +/*------------------------------------------------------------------------- + * CUDA function for optimized proton CT radiographies + * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) + * and based on the method of Collins-Fekete (https://doi.org/10.1088/0031-9155/61/23/8232) + */ + +/*-------------------------------------------------------------------------- + This file is part of the TIGRE Toolbox + + Copyright (c) 2015, University of Bath and + CERN-European Organization for Nuclear Research + All rights reserved. + + License: Open Source under BSD. + See the full license at + https://github.com/CERN/TIGRE/blob/master/LICENSE + + Contact: tigre.toolbox@gmail.com + Codes: https://github.com/CERN/TIGRE/ + Coded by: Stefanie Kaser, Benjamin Kirchmayer +--------------------------------------------------------------------------*/ + +#include +#include +#include +#ifndef improvedForwardProjections_H +#define improvedForwardProjections_H +#define pi 3.14159265359 +#define eps 1e-8 +#define vecSizeCS 220 +#define vecSizeOut 100 +#define vecSizeIn 10 +#define maxthreads 256 +//#include +//#include + +void ParticleProjections(float* outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, float* p_wepl, \ + int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, float detectDistIn, float detectDistOut, float ein, float* ch_param); + +__device__ int calcIntercepts(float* InterceptsVec ,float* a, float* b, \ + float* c, float* d, float* pos1, float pixelSize, bool* protFlag, int maxIntercep); + +__device__ int SolvePolynomial(float*x, float a, float b, float c); + 
+__device__ int MinMax(float* solutions, float a, float b, float c); + +__device__ void SimpleSort(float* arr, int size_arr); + +__global__ void ParticleKernel(float* dhist1, float* dhist2, float* devicePosIn, float* devicePosOut, float* devicedirIn, \ + float* devicedirOut ,float* p_wepl,int* numOfEntries, int* detectSizeX, int *detectSizeY, \ + float* pixelSize, float *detectDistIn, float *detectDistOut, float *ein, float *hull, float *reject); + +__device__ int hullEntryExit(float* HullIntercept, float* position, float* direction, int in_or_out, float *hullparams, float detOff); + +__device__ int calcInterceptsLinear(float* LinInterceptsVec, float* start, float* stop, float* direction, float pix, int maxIntercep, \ + bool* protFlag); + +void ParticleProjectionsCone(float* outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, float* p_wepl, \ + int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, float detectDistIn, float detectDistOut, float sourcePos, \ + float ein, float* ch_param); + +__device__ int calcInterceptsCone(float* InterceptsVec ,float* a, float* b, \ + float* c, float* d, float* pos1, float pixelSize, bool* protFlag, int maxIntercep, \ + float sourcePos, float din, float dout); + +__device__ int SolvePolynomialCone(float*x, float a, float b, float c); + +__device__ void SimpleSortCone(float* arr, int size_arr); + +__device__ int MinMaxCone(float* solutions, float a, float b, float c); + +__global__ void ParticleKernelCone(float* dhist1, float* dhist2, float* devicePosIn, float* devicePosOut, float* devicedirIn, \ + float* devicedirOut ,float* p_wepl,int* numOfEntries, int* detectSizeX, int *detectSizeY, \ + float* pixelSize, float *detectDistIn, float *detectDistOut, float *ein, float *hull, float *reject, \ + float* sourceDist); + +__device__ int hullEntryExitCone(float* HullIntercept, float* position, float* direction, int in_or_out, float *hullparams, float detOff); + +__device__ int 
calcInterceptsLinearCone(float* LinInterceptsVec, float* start, float* stop, float* direction, float pix, int maxIntercep, \ + bool* protFlag, float sourcePos); + +#endif + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Common/CUDA/improvedForwardProjections_cone.cu b/Common/CUDA/improvedForwardProjections_cone.cu index 7a4f6b46..d11657a9 100644 --- a/Common/CUDA/improvedForwardProjections_cone.cu +++ b/Common/CUDA/improvedForwardProjections_cone.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * CUDA function for optimized proton CT radiographies * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) @@ -21,19 +22,19 @@ --------------------------------------------------------------------------*/ -#include +#include #include "mex.h" -#include +#include #include "improvedForwardProjections.hpp" // #include // #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",hipGetErrorString(__err));\ } \ } while (0) @@ -1133,45 +1134,45 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float } //Allocate Memory on GPU - cudaMalloc( (void**) &dPosIn, sizeInputs ); - cudaMalloc( (void**) &dPosOut, sizeInputs ); - cudaMalloc( (void**) &ddirIn, sizeInputs ); - cudaMalloc( (void**) &ddirOut, sizeInputs ); - cudaMalloc( 
(void**) &d_wepl, numOfEntries*sizeof(float)); - cudaMalloc( (void**) &dhist1, detectorMem ); - cudaMalloc( (void**) &dhist2, detectorMem ); - cudaMalloc( (void**) &dnumEntries, sizeof(int)); - cudaMalloc( (void**) &ddetectorX, sizeof(int)); - cudaMalloc( (void**) &ddetectorY, sizeof(int)); - cudaMalloc( (void**) &dpixelSize, 2*sizeof(float)); - cudaMalloc( (void**) &dDetectDistIn, sizeof(float)); - cudaMalloc( (void**) &dDetectDistOut, sizeof(float)); - cudaMalloc( (void**) &dSourceDist, sizeof(float)); - cudaMalloc( (void**) &dEin, sizeof(float)); - cudaMalloc( (void**) &dReject, sizeof(float)); - cudaMalloc( (void**) &dHull, 5*sizeof(float)); - cudaError_t _err_alloc = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err_alloc)); + hipMalloc( (void**) &dPosIn, sizeInputs ); + hipMalloc( (void**) &dPosOut, sizeInputs ); + hipMalloc( (void**) &ddirIn, sizeInputs ); + hipMalloc( (void**) &ddirOut, sizeInputs ); + hipMalloc( (void**) &d_wepl, numOfEntries*sizeof(float)); + hipMalloc( (void**) &dhist1, detectorMem ); + hipMalloc( (void**) &dhist2, detectorMem ); + hipMalloc( (void**) &dnumEntries, sizeof(int)); + hipMalloc( (void**) &ddetectorX, sizeof(int)); + hipMalloc( (void**) &ddetectorY, sizeof(int)); + hipMalloc( (void**) &dpixelSize, 2*sizeof(float)); + hipMalloc( (void**) &dDetectDistIn, sizeof(float)); + hipMalloc( (void**) &dDetectDistOut, sizeof(float)); + hipMalloc( (void**) &dSourceDist, sizeof(float)); + hipMalloc( (void**) &dEin, sizeof(float)); + hipMalloc( (void**) &dReject, sizeof(float)); + hipMalloc( (void**) &dHull, 5*sizeof(float)); + hipError_t _err_alloc = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err_alloc)); cudaCheckErrors("GPU Allocation failed!"); //Copy Arrays to GPU - cudaMemcpy(dPosIn, posIn,sizeInputs ,cudaMemcpyHostToDevice); - cudaMemcpy(dPosOut, posOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirIn, dirIn,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(ddirOut, 
dirOut,sizeInputs,cudaMemcpyHostToDevice); - cudaMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dnumEntries, &numOfEntries,sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorX, &detectSizeX, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(ddetectorY, &detectSizeY, sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(dpixelSize, pixelSize, 2*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dSourceDist, &sourcePos, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dEin, &ein, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dReject, &reject, sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dHull, ch_param, 5*sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(dhist1, hist1, detectorMem, cudaMemcpyHostToDevice); - cudaMemcpy(dhist2, hist2, detectorMem, cudaMemcpyHostToDevice); + hipMemcpy(dPosIn, posIn,sizeInputs ,hipMemcpyHostToDevice); + hipMemcpy(dPosOut, posOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirIn, dirIn,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(ddirOut, dirOut,sizeInputs,hipMemcpyHostToDevice); + hipMemcpy(d_wepl, p_wepl, numOfEntries*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dnumEntries, &numOfEntries,sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorX, &detectSizeX, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(ddetectorY, &detectSizeY, sizeof(int), hipMemcpyHostToDevice); + hipMemcpy(dpixelSize, pixelSize, 2*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistIn, &detectDistIn, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dDetectDistOut, &detectDistOut, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dSourceDist, &sourcePos, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dEin, &ein, sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dReject, &reject, sizeof(float), 
hipMemcpyHostToDevice); + hipMemcpy(dHull, ch_param, 5*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(dhist1, hist1, detectorMem, hipMemcpyHostToDevice); + hipMemcpy(dhist2, hist2, detectorMem, hipMemcpyHostToDevice); cudaCheckErrors("Host to device transport failed!"); @@ -1182,8 +1183,8 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float ParticleKernelCone<<>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject, dSourceDist); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); + hipError_t _err = hipGetLastError(); + mexPrintf("%s \n", hipGetErrorString(_err)); cudaCheckErrors("Kernel fail!"); //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); @@ -1191,12 +1192,12 @@ __host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float //sumHist<<>>(dhist1, dhist2); //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + //hipMemcpy(outProjection, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist1, dhist1,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(hist2, dhist2,detectorMem ,hipMemcpyDeviceToHost); + hipMemcpy(&reject, dReject,sizeof(float) ,hipMemcpyDeviceToHost); + //hipError_t _errcp = hipGetLastError(); + //mexPrintf("%s \n", hipGetErrorString(_errcp)); cudaCheckErrors("Device to host transport failed!"); for(int j = 0; j +#include "mex.h" +#include +#include "improvedForwardProjections.hpp" +// #include +// #include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); 
\ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ + } \ +} while (0) + + +__device__ int SolvePolynomialCone(float*x, float a, float b, float c){ + // Calculates real roots of a third-order polynomial function using Vieta's method and Cardano's method + // We obtain a polynomial of the form x³ + ax² + bx + c = 0 and reduce it to z³+pz+q = 0 + // Herefore, we have to make a substitution: x = z - a/3 + float p = b - a*a / 3.0; + float q = 2*a*a*a/27.0 - a*b / 3.0 + c; + float disc = q*q/4.0 + p*p*p/27.0; + if(disc > 0){ + float u = cbrt(-0.5*q + sqrt(disc)); + float v = cbrt(-0.5*q - sqrt(disc)); + x[0] = u + v - a/3.0; // don't forget to substitute back z --> x + return 1; + } + else if(disc == 0 && p == 0){ + x[0] = -a/3.0; // don't forget to substitute back z --> x + return 1; + } + else if(disc == 0 && p != 0){ + x[0] = 3.0*q/p - a/3.0; // don't forget to substitute back z --> x + x[1] = -3.0*q/(2.0*p) - a/3.0; + return 2; + } + else{ + x[0] = -sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p))) + pi/3.0) - a/3.0; // don't forget to substitute back z --> x + x[1] = sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p)))) - a/3.0; + x[2] = -sqrt(-4.0 * p / 3.0) * cos(1./3. 
* acos(-0.5*q*sqrt(-27./(p*p*p))) - pi/3.0) - a/3.0; + return 3; + } +} + +__device__ float csplineCone(float t, float a, float b, float c, float d){ + + return a*(t*t*t) + b*(t*t) + c*t +d; + +} + +__device__ void SimpleSortCone(float* arr, int size_arr){ + // Insertion sorting method + float curr_elem; + int j; + + for (int i=1; i=0 && curr_elem0){ + + float z_1 = -p/2.0 + sqrt(disc); + float z_2 = -p/2.0 - sqrt(disc); + float z_solve; + + if(in_or_out == 1){ + z_solve = min(z_1, z_2); + } + else { + z_solve = max(z_1, z_2); + } + + float x_solve = kx*z_solve + dx; + + float ky = direction[1]; + float dy = position[1] - ky*detOff; + float y_solve = ky*z_solve + dy; + + if(-h/2 <= y_solve && y_solve <= h/2){ + + HullIntercept[0] = x_solve; + HullIntercept[1] = y_solve; + HullIntercept[2] = z_solve; + + return 0; + } + else{ + float z1_h = (1.0/ky) * (0.5*h-dy); + float z2_h = (1.0/ky) * (-0.5*h-dy); + + if(in_or_out == 1){ + z_solve = min(z1_h, z2_h); + if(dy > 0){y_solve = -h*0.5;} + else{y_solve = h*0.5;} + x_solve = kx*z_solve + dx; + } + else { + z_solve = max(z1_h, z2_h); + if(dy < 0){y_solve = -h*0.5;} + else{y_solve = h*0.5;} + x_solve = kx*z_solve + dx; + } + + if(min(z_1, z_2) <= z_solve && z_solve <= max(z_1, z_2)){ + + HullIntercept[0] = x_solve; + HullIntercept[1] = y_solve; + HullIntercept[2] = z_solve; + + return 0; + } + + else{return 1;}} + } +else{return 1;} +} + + + +__device__ int calcInterceptsLinearCone(float* LinInterceptsVec, float* start, float* stop, float* direction, float* pix, int maxIntercep, bool* protFlag, + float sourcePos){ + float tan_alpha, d_channel; + int counter = 0; + int nx, ny; + float sdd = abs(stop[2] - sourcePos); // distance source detector + float sidd = abs(start[2] - sourcePos); // distance sourcce inital detector + int select; + + float pix_start_x = sidd * (pix[0]/sdd); + float pix_start_y = sidd * (pix[1]/sdd); + + nx = int(abs(stop[0]/pix[0] - start[0]/pix_start_x)); + ny = int(abs(stop[1]/pix[1] - 
start[1]/pix_start_y)); + if(nx+ny>=maxIntercep){ + *protFlag = false; + return 1;} + + if (int(stop[0]/pix[0]) == int(start[0]/pix_start_x) && int(stop[1]/pix[1]) == int(start[1]/pix_start_y)) { + *protFlag = true; + return 0; + } + + if (int(stop[0]/pix[0]) != int(start[0]/pix_start_x)) { + float k = direction[0]; + float d = start[0] - k*start[2]; + if(stop[0]/pix[0] > start[0]/pix_start_x){ + tan_alpha = (trunc(stop[0]/pix[0])*pix[0])/sdd; + d_channel = trunc(stop[0]/pix[0])*pix[0] - tan_alpha * stop[2]; + select = 0; + } + else{ + tan_alpha = (trunc(start[0]/pix_start_x)*pix_start_x)/sidd; + d_channel = trunc(start[0]/pix_start_x)*pix_start_x - tan_alpha * start[2]; + select = 1; + } + + for (int ix=0; ix start[2] && intercept < stop[2]){ + LinInterceptsVec[ix] = intercept; + counter++; + if (counter >= maxIntercep){ + *protFlag = false; + return counter;} + } + } + } + + if (int(stop[1]/pix[1]) != int(start[1]/pix_start_y)) { + float k = direction[1]; + float d = start[1] - k*start[2]; + if(stop[1]/pix[1] > start[1]/pix_start_y){ + tan_alpha = (trunc(stop[1]/pix[1])*pix[1])/sdd; + d_channel = trunc(stop[1]/pix[1])*pix[1] - tan_alpha * stop[2]; + select = 0; + } + else{ + tan_alpha = (trunc(start[1]/pix_start_y)*pix_start_y)/sidd; + d_channel = trunc(start[1]/pix_start_y)*pix_start_y - tan_alpha * start[2]; + select = 1; + } + + for (int iy=nx; iy start[2] && intercept < stop[2]){ + LinInterceptsVec[iy] = intercept; + counter++; + if (counter >= maxIntercep){ + *protFlag = false; + return counter;} + } + } + } + + int diff = maxIntercep - counter; + for(int j = 0; j 0){ + solutions[0] = -0.5*p + sqrt(disc); + solutions[1] = -0.5*p - sqrt(disc); + return 0; + } + solutions[0] = -1; + solutions[1] = -1; + return 1; +} + + + +__device__ int calcInterceptsCone(float* InterceptsVec ,float* a, float* b, \ + float* c, float* d, float* pos1, float* pixelSize, bool* protFlag, int maxIntercep, \ + float sourcePos, float din, float dout){ + + /*Calculates channel 
Intercepts and the lengths the proton (ion) has spent in the + corresponding channel. + Returns 1 if proton is accepted and 0 if it is rejected due to too many Intercepts + */ + float oneX, oneY, zeroX, zeroY, pix_oneX, pix_oneY, pix_zeroX, pix_zeroY; + float tan_alpha, d_channel; + float sdd_init = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! + float sidd_init = abs(din - sourcePos)/abs(dout-din); + float sdd_x = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! + float sidd_x = abs(din - sourcePos)/abs(dout-din); + float sdd_y = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! + float sidd_y = abs(din - sourcePos)/abs(dout-din); + int select; + float pix_start_x = sidd_init * (pixelSize[0]/sdd_init); + float pix_start_y = sidd_init * (pixelSize[1]/sdd_init); + zeroX = d[0]; + oneX = pos1[0]; + zeroY = d[1]; + oneY = pos1[1]; + pix_zeroX = pix_start_x; + pix_zeroY = pix_start_y; + pix_oneX = pixelSize[0]; + pix_oneY = pixelSize[1]; + + + int status, nx, ny; + float IntercepX[3]; + float IntercepY[3]; + float solutions[2]; + // counter has to be implemented despite the initial discrimination because one can not state beforehand if + // the cubic spline has more than one Intercept with the channel boundary + int counter=0; + + int test = MinMaxCone(solutions, a[0], b[0], c[0]); + if (test == 0){ + if (solutions[0] < 1 && solutions[0] > 0){ + float cand = a[0] * solutions[0]*solutions[0]*solutions[0] + b[0] * solutions[0]*solutions[0] + c[0] * solutions[0] + d[0]; + float pix_cand = (sidd_init + solutions[0]) * (pixelSize[0]/sdd_init); + if (cand/pix_cand > d[0]/pix_start_x && cand/pix_cand > pos1[0]/pixelSize[0]){ + (oneX/pix_oneX > zeroX/pix_zeroX) ? oneX:zeroX=cand; + (oneX/pix_oneX > zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; + (oneX/pix_oneX > zeroX/pix_zeroX) ? 
sdd_x:sidd_x = solutions[0] - sourcePos/(dout-din); + } + else if(cand/pix_cand < d[0]/pix_start_x && cand/pix_cand < pos1[0]/pixelSize[0]){ + (oneX/pix_oneX < zeroX/pix_zeroX) ? oneX:zeroX=cand; + (oneX/pix_oneX < zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; + (oneX/pix_oneX < zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[0] - sourcePos/(dout-din); + } + } + + if (solutions[1] < 1 && solutions[1] > 0){ + float cand = a[0] * solutions[1]*solutions[1]*solutions[1] + b[0] * solutions[1]*solutions[1] + c[0] * solutions[1] + d[0]; + float pix_cand = (sidd_init + solutions[1]) * (pixelSize[0]/sdd_init); + if (cand/pix_cand > oneX/pix_oneX && cand/pix_cand > zeroX/pix_zeroX){ + (oneX/pix_oneX > zeroX/pix_zeroX) ? oneX:zeroX=cand; + (oneX/pix_oneX > zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; + (oneX/pix_oneX > zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[1] - sourcePos/(dout-din); + } + else if(cand/pix_cand < oneX/pix_oneX && cand/pix_cand < zeroX/pix_zeroX){ + (oneX/pix_oneX < zeroX/pix_zeroX) ? oneX:zeroX=cand; + (oneX/pix_oneX < zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; + (oneX/pix_oneX < zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[1] - sourcePos/(dout-din); + } + } + } + + test = MinMaxCone(solutions, a[1], b[1], c[1]); + if (test == 0){ + if (solutions[0] < 1 && solutions[0] > 0){ + float cand = a[1] * solutions[0]*solutions[0]*solutions[0] + b[1] * solutions[0]*solutions[0] + c[1] * solutions[0] + d[1]; + float pix_cand = (sidd_init + solutions[0]) * (pixelSize[1]/sdd_init); + if (cand/pix_cand > d[1]/pix_start_y && cand/pix_cand > pos1[1]/pixelSize[1]){ + (oneY/pix_oneY > zeroY/pix_zeroY) ? oneY:zeroY=cand; + (oneY/pix_oneY > zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; + (oneY/pix_oneY > zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[0] - sourcePos/(dout-din); + } + else if(cand/pix_cand < d[1]/pix_start_y && cand/pix_cand < pos1[1]/pixelSize[1]){ + (oneY/pix_oneY < zeroY/pix_zeroY) ? oneY:zeroY=cand; + (oneY/pix_oneY < zeroY/pix_zeroY) ? 
pix_oneY:pix_zeroY = pix_cand; + (oneY/pix_oneY < zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[0] - sourcePos/(dout-din); + } + } + + if (solutions[1] < 1 && solutions[1] > 0){ + float cand = a[1] * solutions[1]*solutions[1]*solutions[1] + b[1] * solutions[1]*solutions[1] + c[1] * solutions[1] + d[1]; + float pix_cand = (sidd_init + solutions[1]) * (pixelSize[1]/sdd_init); + if (cand/pix_cand > oneY/pix_oneY && cand/pix_cand > zeroY/pix_zeroY){ + (oneY/pix_oneY > zeroY/pix_zeroY) ? oneY:zeroY=cand; + (oneY/pix_oneY > zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; + (oneY/pix_oneY > zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[1] - sourcePos/(dout-din); + } + else if(cand/pix_cand < oneY/pix_oneY && cand/pix_cand < zeroY/pix_zeroY){ + (oneY/pix_oneY < zeroY/pix_zeroY) ? oneY:zeroY=cand; + (oneY/pix_oneY < zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; + (oneY/pix_oneY < zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[1] - sourcePos/(dout-din); + } + } + } + //Check how many Intercepts will occur approximately + nx = int(abs(oneX/pix_oneX - zeroX/pix_zeroX)); + ny = int(abs(oneY/pix_oneY - zeroY/pix_zeroY)); + + if (nx + ny == 0) { + *protFlag = true; + return 0; + } + if ((nx + ny) <= maxIntercep){ + + if (int(oneX/pix_oneX) != int(zeroX/pix_zeroX)) { + if(oneX/pix_oneX > zeroX/pix_zeroX){ + tan_alpha = (trunc(oneX/pix_oneX)*pix_oneX)/sdd_x; + d_channel = trunc(oneX/pix_oneX)*pix_oneX * (sidd_init/sdd_x); + select = 0; + } + else{ + tan_alpha = (trunc(zeroX/pix_zeroX)*pix_zeroX)/sidd_x; + d_channel = trunc(zeroX/pix_zeroX)*pix_zeroX * (sidd_init/sidd_x); + select = 1; + } + for (int ix=0; ix 0. 
){ + if (counter >=maxIntercep){break;} + InterceptsVec[counter] = IntercepX[kx]; + counter++; + } + }//kx + if (counter >=maxIntercep){break;} + } + } + + if ( int(oneY/pix_oneY) != int(zeroY/pix_zeroY)) { + if(oneY/pix_oneY > zeroY/pix_zeroY){ + tan_alpha = (trunc(oneY/pix_oneY)*pix_oneY)/sdd_y; + d_channel = trunc(oneY/pix_oneY)*pix_oneY * (sidd_init/sdd_y); + select = 0; + } + else{ + tan_alpha = (trunc(zeroY/pix_zeroY)*pix_zeroY)/sidd_y; + d_channel = trunc(zeroY/pix_zeroY)*pix_zeroY * (sidd_init/sidd_y); + select = 1; + } + for (int iy=0; iy 0. ){ + if (counter >=maxIntercep){break;} + InterceptsVec[counter] = IntercepY[ky]; + counter++; + } + }//kx + if (counter >=maxIntercep){break;} + } + } + + if (counter >= maxIntercep){ // || counter == 0){ + *protFlag = false; + return counter; + } + + else{ + int diff = maxIntercep - counter; + for(int j = 0; j HullIn[2]){ + /* --------------------------------------------------------------------------------- */ + /* ------------------------ Start with Hull + SL outside -------------------------- */ + /* --------------------------------------------------------------------------------- */ + const int hullIntercep = int(vecSizeCS); + const int airIntercepIn = int(vecSizeIn); + const int airIntercepOut = int(vecSizeOut); + bool status1 = false; + bool status2 = false; + bool status3 = false; + + int countIn, countHull, countOut; + float InterceptsVecOut[airIntercepOut] = {0}; + float InterceptsVecIn[airIntercepIn] = {0}; + float InterceptsVecHull[hullIntercep] = {0}; + lenX = sqrt((HullOut[0] - HullIn[0])*(HullOut[0] - HullIn[0]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); + lenY = sqrt((HullOut[1] - HullIn[1])*(HullOut[1] - HullIn[1]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); + + float newpix[2]; + newpix[0] = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + newpix[1] = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + countIn = 
calcInterceptsLinearCone(InterceptsVecIn, initpos, HullIn, initdir, newpix, airIntercepIn, &status1, *sourceDist); + countOut = calcInterceptsLinearCone(InterceptsVecOut, HullOut, exitpos, exitdir, pix, airIntercepOut, &status2, *sourceDist); + + /* ------------ CUBIC SPLINE PREPARATIONS ---------------- */ + float lambda0, lambda1, ref_wepl; + ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); + lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); + lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); + + float a[2], b[2], c[2], d[2], pos1[2]; + + //Allocate memory for all pointers + // Calculate optimized xdir_in + devicedirIn[protonIndex] = devicedirIn[protonIndex] \ + / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! + devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; + + // Calculate optimized ydir_in + devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ + / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! + devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; + + // Calculate optimized xdir_out + devicedirOut[protonIndex] = devicedirOut[protonIndex] \ + / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! + devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; + + // Calculate optimized ydir_out + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ + / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! + devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; + + // Calculate spline parameters + a[0] = HullIn[0]*2. + devicedirIn[protonIndex] - 2.*HullOut[0] + devicedirOut[protonIndex]; + a[1] = HullIn[1]*2. 
+ devicedirIn[protonIndex + entries] - \ + 2.*HullOut[1] + devicedirOut[protonIndex + entries]; + + b[0] = -3.*HullIn[0] -2.*devicedirIn[protonIndex] + 3.*HullOut[0] - devicedirOut[protonIndex]; + b[1] = -3.*HullIn[1] -2.* devicedirIn[protonIndex + entries] \ + + 3.*HullOut[1] - devicedirOut[protonIndex + entries]; + + c[0] = devicedirIn[protonIndex]; + c[1] = devicedirIn[protonIndex + entries]; + + d[0] = HullIn[0]; + d[1] = HullIn[1]; + + pos1[0] = HullOut[0]; + pos1[1] = HullOut[1]; + + // float newpix[2]; + newpix[0] = abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + newpix[1] = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + countHull = calcInterceptsCone(InterceptsVecHull, a, b, c, d, pos1, newpix, &status3, hullIntercep, *sourceDist, HullIn[2], HullOut[2]); + /* -------------------- End CS Preparations! -------------- */ + + if(status1 && status2 && status3){ + float tOld = initpos[2]; + int indX, indY, linInd; + // WEIGHTING FACTORS FOR CHANNELS II + float weight_water = 1; + + // ---------------------------------------- Start with SL from detector to hull + float pix_start_x = abs(initpos[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + float pix_start_y = abs(initpos[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + float pix_end_x = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + float pix_end_y = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + if (countIn == 0){ + indX = int(initpos[0]/pix_start_x + dimX/2.); + indY = int(initpos[1]/pix_start_y + dimY/2.); + lk = HullIn[2] - initpos[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + for(int i= 0; i<=countIn; i++){ + lk = InterceptsVecIn[i] - tOld; + if(i 
== 0){ + indX = int(initpos[0]/pix_start_x + dimX/2.); + indY = int(initpos[1]/pix_start_y + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecIn[i]; + } + } + else if(i == countIn){ + lk = HullIn[2] - InterceptsVecIn[i-1]; + indX = int(HullIn[0]/pix_end_x + dimX/2.); + indY = int(HullIn[1]/pix_end_y + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + float curr_pix_x = abs((InterceptsVecIn[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + float curr_pix_y = abs((InterceptsVecIn[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + indX = int(((initdir[0]*(InterceptsVecIn[i]-eps) + (initpos[0] - initdir[0] * initpos[2] )))/curr_pix_x + dimX/2.); + indY = int(((initdir[1]*(InterceptsVecIn[i]-eps) + (initpos[1] - initdir[1] * initpos[2] )))/curr_pix_y + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecIn[i]; + } + } + } + } // end else + + // ---cone beam------------------------ CS within hull + + tOld = 0.0; + pix_start_x = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + pix_start_y = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + pix_end_x = 
abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + pix_end_y = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + if (countHull==0){ + indX = int(HullIn[0]/pix_start_x + dimX/2.); + indY = int(HullIn[1]/pix_start_y + dimY/2.); + lk = HullOut[2] - HullIn[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + + } else{ + for(int i= 0; i<=countHull; i++){ + lk = (InterceptsVecHull[i] - tOld)*(HullOut[2] - HullIn[2]); + if(tOld == 0){ + indX = int(d[0]/pix_start_x + dimX/2.); + indY = int(d[1]/pix_start_y + dimY/2.); + linInd = indY + indX*(dSizeY); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVecHull[i]; + + }else if(i == countHull){ + lk = (HullOut[2] - HullIn[2]) - InterceptsVecHull[i-1]*(HullOut[2] - HullIn[2]); + indX = int(pos1[0]/pix_end_x + dimX/2.); + indY = int(pos1[1]/pix_end_y + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + + }else{ + float curr_len = (InterceptsVecHull[i]-eps)*(HullOut[2]-HullIn[2]) + (HullIn[2] - *sourceDist); // abs(((InterceptsVecHull[i]-eps)*lenZ + *detectDistIn) - *sourceDist) + float curr_pix_x = curr_len * (pix[0]/abs(exitpos[2] - *sourceDist)); + float curr_pix_y = curr_len * (pix[1]/abs(exitpos[2] - *sourceDist)); + indX = 
int(csplineCone(InterceptsVecHull[i] - eps, a[0], b[0], c[0], d[0])/curr_pix_x + dimX/2.); + indY = int(csplineCone(InterceptsVecHull[i] - eps, a[1], b[1], c[1], d[1])/curr_pix_y + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVecHull[i]; + } + + }//i + } + + // --------------------------- SL from hull to detector + tOld = HullOut[2]; + pix_start_x = abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + pix_start_y = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + if (countOut == 0){ + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + lk = exitpos[2] - HullOut[2]; + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + for(int i= 0; i<=countOut; i++){ + lk = abs(InterceptsVecOut[i] - tOld); + if(i == 0){ + indX = int(HullOut[0]/pix_start_x + dimX/2.); + indY = int(HullOut[1]/pix_start_y + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecOut[i]; + } + } + else if(i == countOut){ + lk = exitpos[2] - InterceptsVecOut[i-1]; + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < 
(exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + } + + else{ + float curr_pix_x = abs((InterceptsVecOut[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + float curr_pix_y = abs((InterceptsVecOut[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + indX = int(((exitdir[0]*(InterceptsVecOut[i]-eps) + (HullOut[0] - exitdir[0] * HullOut[2])))/curr_pix_x + dimX/2.); + indY = int(((exitdir[1]*(InterceptsVecOut[i]-eps) + (HullOut[1] - exitdir[1] * HullOut[2])))/curr_pix_y + dimY/2.); + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + tOld = InterceptsVecOut[i]; + } + } + } + } // end else + + } + else{ + atomicAdd(reject, 1.0); + } + + /* --------------------------- End Hull + SL outside ------------------------------- */ + + } + + else{ + + /* --------------------------------------------------------------------------------- */ + /* ----------------------------- Start with SL only! 
------------------------------ */ + /* --------------------------------------------------------------------------------- */ + int count; + bool status = false; + float InterceptsVec[vecSizeCS] = {0}; + //float InterceptsLengths[vecSizeCS+1] = {0}; + + float initpos[3], exitpos[3]; + float mydir[2]; + initpos[0] = devicePosIn[protonIndex]; + initpos[1] = devicePosIn[protonIndex + entries]; + initpos[2] = *detectDistIn; + exitpos[0] = devicePosOut[protonIndex]; + exitpos[1] = devicePosOut[protonIndex + entries]; + exitpos[2] = *detectDistOut; + + mydir[0] = (exitpos[0] - initpos[0])/lenZ; + mydir[1] = (exitpos[1] - initpos[1])/lenZ; // dz = 1 + count = calcInterceptsLinearCone(InterceptsVec, initpos, exitpos, mydir, pix, vecSizeCS, &status, *sourceDist); + + // for cone beam we need this + /*float lenZ_custom = 0.0; + float head[3], tail[3]; + for (int i=0; i<=count; i++){ + if (i == 0){ + head[0] = mydir[0]*InterceptsVec[i] + 0.5*(initpos[0] + exitpos[0]); + head[1] = mydir[1]*InterceptsVec[i] + 0.5*(initpos[1] + exitpos[1]); + head[2] = InterceptsVec[i]; + InterceptsLengths[i] = sqrt(powf(head[0] - initpos[0], 2.0) + powf(head[1] - initpos[1], 2.0) + powf(head[2] - initpos[2], 2.0)); + tail[0] = head[0]; + tail[1] = head[1]; + tail[2] = head[2]; + lenZ_custom += InterceptsLengths[i]; + } + else if (i == count){ + InterceptsLengths[i] = sqrt(powf(exitpos[0] - tail[0], 2.0) + powf(exitpos[1] - tail[1], 2.0) + powf(exitpos[2] - tail[2], 2.0)); + lenZ_custom += InterceptsLengths[i]; + } + else{ + head[0] = mydir[0]*InterceptsVec[i] + 0.5*(initpos[0] + exitpos[0]); + head[1] = mydir[1]*InterceptsVec[i] + 0.5*(initpos[1] + exitpos[1]); + head[2] = InterceptsVec[i]; + InterceptsLengths[i] = sqrt(powf(head[0] - tail[0], 2.0) + powf(head[1] - tail[1], 2.0) + powf(head[2] - tail[2], 2.0)); + tail[0] = head[0]; + tail[1] = head[1]; + tail[2] = head[2]; + lenZ_custom += InterceptsLengths[i]; + } + }*/ + + float pix_start_x = abs(initpos[2] - *sourceDist) * 
(pix[0]/abs(exitpos[2] - *sourceDist)); + float pix_start_y = abs(initpos[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + + if (status) { + int indX, indY, linInd; + // exitpos[0] / (exitpos[2] - *sourceDir); + float tOld = initpos[2]; + if (count==0){ + indX = int(initpos[0]/pix_start_x + dimX/2.); + indY = int(initpos[1]/pix_start_y + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*1.0f); + } + + } else{ + for(int i= 0; i<=count; i++){ + lk = InterceptsVec[i] - tOld; + // lk = InterceptsLengths[i]; + if(i == 0){ + indX = int(initpos[0]/pix_start_x + dimX/2.); + indY = int(initpos[1]/pix_start_y + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + + }else if(i == count){ + lk = exitpos[2] - InterceptsVec[i-1]; + indX = int(exitpos[0]/pix[0] + dimX/2.); + indY = int(exitpos[1]/pix[1] + dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + + }else{ + float curr_pix_x = abs((InterceptsVec[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); + float curr_pix_y = abs((InterceptsVec[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); + indX = int(((mydir[0]*(InterceptsVec[i]-eps) + (initpos[0] - mydir[0] * (initpos[2]))))/curr_pix_x+dimX/2.); + indY = int(((mydir[1]*(InterceptsVec[i]-eps) + (initpos[1] - mydir[1] * 
(initpos[2]))))/curr_pix_y+dimY/2.); + + if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ + linInd = indY + indX*(dSizeY); + atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); + atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); + } + tOld = InterceptsVec[i]; + } + + } //i + }//if - Intercepts + } + else{ + // *reject += 1; + atomicAdd(reject, 1.0); + } + /* ------------------------------ End SL only! ------ -------------------------- */ + } + } +} + +__global__ void sumHistCone(float* hist, float* histNorm){ + + unsigned int index = blockIdx.x*blockDim.x + threadIdx.x; + hist[index] = hist[index]/histNorm[index]; +} + +__host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, \ + float* p_wepl, int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, \ + float detectDistIn, float detectDistOut, float sourcePos, \ + float ein, float* ch_param){ + + /* + Detect Size = 400x400 + Prepare Input for GPU*/ + + const int sizeInputs = 2*numOfEntries*sizeof(float); + const int detectorMem = detectSizeX*detectSizeY*sizeof(float); + float reject = 0.0; + + float *dPosIn, *dPosOut, *ddirIn, *ddirOut, *dhist1, *dhist2, *d_wepl, *dHull; + int *dnumEntries, *ddetectorX, *ddetectorY; + float *dpixelSize, *dDetectDistIn, *dDetectDistOut, *dSourceDist, *dEin, *dReject; + + float *hist1, *hist2; + hist1 = new float[detectSizeX*detectSizeY]; + hist2 = new float[detectSizeX*detectSizeY]; + for(int i = 0; i>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ + dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject, dSourceDist); + cudaError_t _err = cudaGetLastError(); + mexPrintf("%s \n", cudaGetErrorString(_err)); + cudaCheckErrors("Kernel fail!"); + + //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); + //dim3 block_sum(64,1,1); + //sumHist<<>>(dhist1, 
dhist2); + + //Copy result from device to host + //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); + cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); + //cudaError_t _errcp = cudaGetLastError(); + //mexPrintf("%s \n", cudaGetErrorString(_errcp)); + cudaCheckErrors("Device to host transport failed!"); + + for(int j = 0; j +#include + +float maxDistanceCubeXY(Geometry geo, float alpha,int i){ + /////////// + // Compute initial "t" so we access safely as less as out of bounds as possible. + ////////// + + float maxCubX,maxCubY; + // Forgetting Z, compute max distance: diagonal+offset + maxCubX=(geo.sVoxelX/2+ abs(geo.offOrigX[i]))/geo.dVoxelX; + maxCubY=(geo.sVoxelY/2+ abs(geo.offOrigY[i]))/geo.dVoxelY; + + return geo.DSO[i]/geo.dVoxelX-sqrt(maxCubX*maxCubX+maxCubY*maxCubY); +} + +void rollPitchYaw(Geometry geo,int i, Point3D* point){ + Point3D auxPoint; + auxPoint.x=point->x; + auxPoint.y=point->y; + auxPoint.z=point->z; + + point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->z=-sin(geo.dPitch[i])*auxPoint.x + +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y + +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; +} \ No newline at end of file diff --git a/Common/CUDA/projection.hpp.prehip b/Common/CUDA/projection.hpp.prehip new file mode 100644 index 00000000..54597d92 --- /dev/null +++ 
b/Common/CUDA/projection.hpp.prehip @@ -0,0 +1,9 @@ +#ifndef PROJECTION_HPP +#define PROJECTION_HPP + +#include "types_TIGRE.hpp" + +float maxDistanceCubeXY(Geometry geo, float alpha,int i); +void rollPitchYaw(Geometry geo,int i, Point3D* point); + +#endif diff --git a/Common/CUDA/ray_interpolated_projection.cu b/Common/CUDA/ray_interpolated_projection.cu index e71c5b59..8ab4a7e7 100644 --- a/Common/CUDA/ray_interpolated_projection.cu +++ b/Common/CUDA/ray_interpolated_projection.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA functions for texture-memory interpolation based projection @@ -53,19 +54,19 @@ #include -#include -#include +#include +#include #include "ray_interpolated_projection.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",cudaGetErrorString(__err));\ + hipDeviceReset();\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",hipGetErrorString(__err));\ } \ } while (0) @@ -100,7 +101,7 @@ do { \ * * **/ - void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate); + void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool allocate); __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device @@ -119,7 +120,7 @@ template float* detector, const int currProjSetNumber, const int totalNoOfProjections, - cudaTextureObject_t tex){ + hipTextureObject_t tex){ unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; unsigned long 
long v = blockIdx.y * blockDim.y + threadIdx.y; @@ -255,10 +256,10 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c if (!fits_in_memory){ dProjection_accum=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); - cudaMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); + hipMalloc((void**)&dProjection_accum[dev*2+i], num_bytes_proj); + hipMemset(dProjection_accum[dev*2+i],0,num_bytes_proj); cudaCheckErrors("cudaMallocauxiliarty projections fail"); } } @@ -267,12 +268,12 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c // This is happening regarthless if the image fits on memory float** dProjection=(float**)malloc(2*deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); - cudaMemset(dProjection[dev*2+i] ,0,num_bytes_proj); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[dev*2+i], num_bytes_proj); + hipMemset(dProjection[dev*2+i] ,0,num_bytes_proj); + cudaCheckErrors("hipMalloc projections fail"); } } @@ -284,34 +285,34 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & splits>1){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); #endif Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost = 0; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); float* projFloatsArrayHost = 0; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + hipHostMalloc((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); cudaCheckErrors("Error allocating auxiliary constant memory"); // Create Streams for overlapping memcopy and compute int nStream_device=2; int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); + hipStreamCreate(&stream[i+dev*nStream_device]); } } @@ -324,8 +325,8 @@ int interpolation_projection(float * img, Geometry geo, float** 
result,float c - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + hipTextureObject_t *texImg = new hipTextureObject_t[deviceCount]; + hipArray **d_cuArrTex = new hipArray*[deviceCount]; for (unsigned int sp=0;sp=nangles) @@ -419,12 +420,12 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) else projection_this_block=PROJ_PER_BLOCK; - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + hipMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyHostToDevice,stream[dev*2+1]); } // 2) take the results from current compute call and add it to the code in execution. for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -436,7 +437,7 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) else projection_this_block=PROJ_PER_BLOCK; - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + hipStreamSynchronize(stream[dev*2+1]); // wait until copy is finished vecAddInPlaceInterp<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); } } // end accumulation case, where the image needs to be split @@ -446,7 +447,7 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c { for (dev = 0; 
dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on previous set on this GPU proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; if (dev+1==deviceCount) { //is it the last device? @@ -466,21 +467,21 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c else { projection_this_block=PROJ_PER_BLOCK; } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } } // Make sure Computation on kernels has finished before we launch the next batch. for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*2]); } } // End noOfKernelCalls (i) loop. // We still have the last set of projections to get out of GPUs for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); //Global index of FIRST projection on this set on this GPU proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; if(proj_global>=nangles) @@ -489,15 +490,15 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. + hipDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. 
cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + hipMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[dev*2+1]); } // Make sure everyone has done their bussiness before the next image split: for (dev = 0; dev < deviceCount; dev++) { - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // End image split loop. @@ -505,99 +506,99 @@ int interpolation_projection(float * img, Geometry geo, float** result,float c /////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texImg[dev]); + hipFreeArray(d_cuArrTex[dev]); } delete[] texImg; texImg = 0; delete[] d_cuArrTex; d_cuArrTex = 0; // Freeing Stage for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection[dev*2]); + hipFree(dProjection[dev*2+1]); } free(dProjection); if(!fits_in_memory){ for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); + hipSetDevice(gpuids[dev]); + hipFree(dProjection_accum[dev*2]); + hipFree(dProjection_accum[dev*2+1]); } free(dProjection_accum); } freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); - cudaFreeHost(projFloatsArrayHost); + 
hipHostFree(projParamsArrayHost); + hipHostFree(projFloatsArrayHost); for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; + hipStreamDestroy(stream[i]) ; #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & splits>1){ - cudaHostUnregister(img); + hipHostUnregister(img); } #endif - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } -void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate) +void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,bool allocate) { const unsigned int num_devices = gpuids.GetLength(); //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); if(allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor + //hipArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); cudaCheckErrors("Texture memory allocation fail"); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaMemcpy3DParms copyParams = {0}; - cudaSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = 
extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params); //cudaCheckErrors("Texture memory data copy fail"); //Array creation End } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; if (geo.accuracy>1){ - texDescr.filterMode = cudaFilterModePoint; + texDescr.filterMode = hipFilterModePoint; geo.accuracy=1; } else{ - texDescr.filterMode = cudaFilterModeLinear; + texDescr.filterMode = hipFilterModeLinear; } - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); cudaCheckErrors("Texture object creation fail"); } } @@ -828,8 +829,8 @@ void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ size_t memtotal; int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree +#include 
+#include +#include "ray_interpolated_projection.hpp" +#include "TIGRE_common.hpp" +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + cudaDeviceReset();\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",cudaGetErrorString(__err));\ + } \ +} while (0) + + + +#define MAXTREADS 1024 +#define PROJ_PER_BLOCK 9 +#define PIXEL_SIZE_BLOCK 9 + /*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * --->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ + void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate); +__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device +__constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device + + +__global__ void vecAddInPlaceInterp(float *a, float *b, unsigned long n) +{ + int idx = blockIdx.x*blockDim.x+threadIdx.x; + // Make sure we do not go out of bounds + if (idx < n) + a[idx] = a[idx] + b[idx]; +} + + +template + __global__ void kernelPixelDetector( Geometry geo, + float* detector, + const int currProjSetNumber, + const int totalNoOfProjections, + cudaTextureObject_t tex){ + + unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; + unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long projNumber=threadIdx.z; + + if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) + return; + +#if IS_FOR_MATLAB_TIGRE + size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#else + size_t idx = (size_t)(v * (unsigned 
long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#endif + + unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array + + if(indAlpha>=totalNoOfProjections) + return; + + Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaU = projParamsArrayDev[4*projNumber+1]; + Point3D deltaV = projParamsArrayDev[4*projNumber+2]; + Point3D source = projParamsArrayDev[4*projNumber+3]; + + float DSO = projFloatsArrayDev[2*projNumber+0]; + float cropdist_init = projFloatsArrayDev[2*projNumber+1]; + + + + /////// Get coordinates XYZ of pixel UV + unsigned long pixelV = geo.nDetecV-v-1; + unsigned long pixelU = u; + + + float vectX,vectY,vectZ; + Point3D P; + P.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); + P.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); + P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); + + // Length is the ray length in normalized space + float length=__fsqrt_rd((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); + //now legth is an integer of Nsamples that are required on this line + length=ceilf(__fdividef(length,geo.accuracy));//Divide the directional vector by an integer + vectX=__fdividef(P.x -source.x,length); + vectY=__fdividef(P.y -source.y,length); + vectZ=__fdividef(P.z -source.z,length); + + +// //Integrate over the line + float tx,ty,tz; + float sum=0; + float i; + + + +// Because I have no idea how to efficiently cutoff the legth path in 3D, a very upper limit is computed (see maxdistanceCuboid) +// for the 3D case. However it would be bad to lose performance in the 3D case +// TODO: can ge really improve this? 
+ if (sphericalrotation){ + if ((2*DSO/fminf(fminf(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)+cropdist_init)/geo.accuracy < length) + length=ceilf((2*DSO/fminf(fminf(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)+cropdist_init)/geo.accuracy); + } + else{ + if ((2*DSO/fminf(geo.dVoxelX,geo.dVoxelY)+cropdist_init)/geo.accuracy < length) + length=ceilf((2*DSO/fminf(geo.dVoxelX,geo.dVoxelY)+cropdist_init)/geo.accuracy); + } + + + //Length is not actually a length, but the amount of memreads with given accuracy ("samples per voxel") + for (i=floorf(cropdist_init/geo.accuracy); i<=length; i=i+1){ + tx=vectX*i+source.x; + ty=vectY*i+source.y; + tz=vectZ*i+source.z; + + sum += tex3D(tex, tx+0.5f, ty+0.5f, tz+0.5f); // this line is 94% of time. + } + + float deltalength=sqrtf((vectX*geo.dVoxelX)*(vectX*geo.dVoxelX)+ + (vectY*geo.dVoxelY)*(vectY*geo.dVoxelY)+ + (vectZ*geo.dVoxelZ)*(vectZ*geo.dVoxelZ) ); + + detector[idx]=sum*deltalength; +} + + + +// legnth(angles)=3 x nagnles, as we have roll, pitch, yaw. +int interpolation_projection(float * img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ + + + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("Ax:Interpolated_projection:GPUselect","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("Ax:Interpolated_projection:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. 
If the code errors you might need to change the way GPU selection is performed."); + } + int dev; + + // Check free memory + size_t mem_GPU_global; + checkFreeMemory(gpuids,&mem_GPU_global); + + // printf("geo.nDetec (U, V) = %d, %d\n", geo.nDetecU, geo.nDetecV); + + size_t mem_image=(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); + size_t mem_proj =(unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV * sizeof(float); + + // Does everything fit in the GPUs? + const bool fits_in_memory = mem_image+2*PROJ_PER_BLOCK*mem_proj= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to + // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. + +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & splits>1){ + cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); +#endif + Point3D source, deltaU, deltaV, uvOrigin; + + Point3D* projParamsArrayHost = 0; + cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + float* projFloatsArrayHost = 0; + cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + cudaCheckErrors("Error allocating auxiliary constant memory"); + + // Create Streams for overlapping memcopy and compute + int nStream_device=2; + int nStreams=deviceCount*nStream_device; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < nStream_device; ++i){ + cudaStreamCreate(&stream[i+dev*nStream_device]); + + } + } + cudaCheckErrors("Stream creation 
fail"); + int nangles_device=(nangles+deviceCount-1)/deviceCount; + int nangles_last_device=(nangles-(deviceCount-1)*nangles_device); + unsigned int noOfKernelCalls = (nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK + unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. + int projection_this_block; + + + + cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; + cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; + for (unsigned int sp=0;sp=nangles) + break; + if ((i*PROJ_PER_BLOCK+j)>=nangles_device) + break; + geoArray[sp].alpha=angles[proj_global*3]; + geoArray[sp].theta=angles[proj_global*3+1]; + geoArray[sp].psi =angles[proj_global*3+2]; + + is_spherical+=abs(geoArray[sp].theta)+abs(geoArray[sp].psi); + + //precomute distances for faster execution + maxdist=maxdistanceCuboid(geoArray[sp],proj_global); + //Precompute per angle constant stuff for speed + computeDeltas(geoArray[sp], proj_global, &uvOrigin, &deltaU, &deltaV, &source); + //Ray tracing! 
+ projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection + projParamsArrayHost[4*j+1]=deltaU; + projParamsArrayHost[4*j+2]=deltaV; + projParamsArrayHost[4*j+3]=source; + + projFloatsArrayHost[2*j]=geo.DSO[proj_global]; + projFloatsArrayHost[2*j+1]=floor(maxdist); + } + + cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*nStream_device]); + cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*nStream_device]); + cudaStreamSynchronize(stream[dev*nStream_device]); + + + //TODO: we could do this around X and Y axis too, but we would need to compute the new axis of rotation (not possible to know from jsut the angles) + if (!is_spherical){ + kernelPixelDetector<<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); + } + else{ + kernelPixelDetector <<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); + } + } + + + // Now that the computation is happening, we need to either prepare the memory for + // combining of the projections (splits>1) and start removing previous results. + + + // If our image does not fit in memory then we need to make sure we accumulate previous results too. + // This is done in 2 steps: + // 1)copy previous results back into GPU + // 2)accumulate with current results + // The code to take them out is the same as when there are no splits needed + if( !fits_in_memory&&sp>0) + { + // 1) grab previous results and put them in the auxiliary variable dProjection_accum + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + + // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... 
+ if(i+1==noOfKernelCalls) //is it the last block? + projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + else + projection_this_block=PROJ_PER_BLOCK; + cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); + } + // 2) take the results from current compute call and add it to the code in execution. + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + + // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... + if(i+1==noOfKernelCalls) //is it the last block? + projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + else + projection_this_block=PROJ_PER_BLOCK; + cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished + vecAddInPlaceInterp<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); + } + } // end accumulation case, where the image needs to be split + + // Now, lets get out the projections from the previous execution of the kernels. 
+ if (i>0) + { + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on previous set on this GPU + proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; + if (dev+1==deviceCount) { //is it the last device? + // projections assigned to this device is >=nangles_device-(deviceCount-1) and < nangles_device + if (i-1 < noOfKernelCallsLastDev) { + // The previous set(block) was not empty. + projection_this_block=min(PROJ_PER_BLOCK, nangles-proj_global); + } + else { + // The previous set was empty. + // This happens if deviceCount > PROJ_PER_BLOCK+1. + // e.g. PROJ_PER_BLOCK = 9, deviceCount = 11, nangles = 199. + // e.g. PROJ_PER_BLOCK = 1, deviceCount = 3, nangles = 7. + break; + } + } + else { + projection_this_block=PROJ_PER_BLOCK; + } + cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + } + } + // Make sure Computation on kernels has finished before we launch the next batch. + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + cudaStreamSynchronize(stream[dev*2]); + } + } // End noOfKernelCalls (i) loop. + + // We still have the last set of projections to get out of GPUs + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + //Global index of FIRST projection on this set on this GPU + proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; + if(proj_global>=nangles) + break; + // How many projections are left here? + projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) + nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) + + cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. 
+ cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); + cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); + } + // Make sure everyone has done their bussiness before the next image split: + for (dev = 0; dev < deviceCount; dev++) + { + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + } // End image split loop. + + cudaCheckErrors("Main loop fail"); + /////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////// + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDestroyTextureObject(texImg[dev]); + cudaFreeArray(d_cuArrTex[dev]); + } + delete[] texImg; texImg = 0; + delete[] d_cuArrTex; d_cuArrTex = 0; + // Freeing Stage + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dProjection[dev*2]); + cudaFree(dProjection[dev*2+1]); + + } + free(dProjection); + + if(!fits_in_memory){ + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dProjection_accum[dev*2]); + cudaFree(dProjection_accum[dev*2+1]); + + } + free(dProjection_accum); + } + freeGeoArray(splits,geoArray); + cudaFreeHost(projParamsArrayHost); + cudaFreeHost(projFloatsArrayHost); + + + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]) ; +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & splits>1){ + cudaHostUnregister(img); + } +#endif + cudaCheckErrors("cudaFree fail"); + +// cudaDeviceReset(); + return 0; +} +void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate) +{ + const unsigned int num_devices = gpuids.GetLength(); + //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; + const cudaExtent extent = 
make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + if(allocate){ + + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + + //cudaArray Descriptor + + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + cudaCheckErrors("Texture memory allocation fail"); + } + + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaMemcpy3DParms copyParams = {0}; + cudaSetDevice(gpuids[dev]); + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[dev]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params); + //cudaCheckErrors("Texture memory data copy fail"); + //Array creation End + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[dev]; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + if (geo.accuracy>1){ + texDescr.filterMode = cudaFilterModePoint; + geo.accuracy=1; + } + else{ + texDescr.filterMode = cudaFilterModeLinear; + } + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + cudaCheckErrors("Texture object creation fail"); + } +} + +/* This code generates the geometries needed to split the image properly in + * cases where the entire image does not fit in the memory of the GPU + **/ +void splitImageInterp(unsigned int splits,Geometry geo,Geometry* geoArray, unsigned int nangles){ + + unsigned long 
splitsize=(geo.nVoxelZ+splits-1)/splits;// ceil if not divisible + for(unsigned int sp=0;spx=Pfinalu0.x-Pfinal.x; + deltaU->y=Pfinalu0.y-Pfinal.y; + deltaU->z=Pfinalu0.z-Pfinal.z; + + deltaV->x=Pfinalv0.x-Pfinal.x; + deltaV->y=Pfinalv0.y-Pfinal.y; + deltaV->z=Pfinalv0.z-Pfinal.z; + + *source=S; +} + +float maxdistanceCuboid(Geometry geo,unsigned int i){ + /////////// + // Compute initial "t" so we access safely as less as out of bounds as possible. + ////////// + + + float maxCubX,maxCubY,maxCubZ; + // Forgetting Z, compute mas distance: diagonal+offset + maxCubX=(geo.nVoxelX/2+ abs(geo.offOrigX[i])/geo.dVoxelX); + maxCubY=(geo.nVoxelY/2+ abs(geo.offOrigY[i])/geo.dVoxelY); + maxCubZ=(geo.nVoxelZ/2+ abs(geo.offOrigZ[i])/geo.dVoxelZ); + + float a,b; + a=geo.DSO[i]/geo.dVoxelX; + b=geo.DSO[i]/geo.dVoxelY; + +// As the return of this value is in "voxel space", the source may have an elliptical curve. +// The distance returned is the safe distance that can be skipped for a given angle alpha, before we need to start sampling. + + if (geo.theta==0.0f & geo.psi==0.0f) // Special case, it will make the code faster + return max(a*b/sqrt(a*a*sin(geo.alpha)*sin(geo.alpha)+b*b*cos(geo.alpha)*cos(geo.alpha))- + sqrt(maxCubX*maxCubX+maxCubY*maxCubY),0.0f); + //TODO: think of more special cases? 
+ return max(geo.DSO[i]/max(max(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)-sqrt(maxCubX*maxCubX+maxCubY*maxCubY+maxCubZ*maxCubZ),0.0f); + +} +void rollPitchYaw(Geometry geo,unsigned int i, Point3D* point){ + Point3D auxPoint; + auxPoint.x=point->x; + auxPoint.y=point->y; + auxPoint.z=point->z; + + point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y + +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; + + point->z=-sin(geo.dPitch[i])*auxPoint.x + +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y + +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; + +} +void eulerZYZ(Geometry geo, Point3D* point){ + Point3D auxPoint; + auxPoint.x=point->x; + auxPoint.y=point->y; + auxPoint.z=point->z; + + point->x=(+cos(geo.alpha)*cos(geo.theta)*cos(geo.psi)-sin(geo.alpha)*sin(geo.psi))*auxPoint.x+ + (-cos(geo.alpha)*cos(geo.theta)*sin(geo.psi)-sin(geo.alpha)*cos(geo.psi))*auxPoint.y+ + cos(geo.alpha)*sin(geo.theta)*auxPoint.z; + + point->y=(+sin(geo.alpha)*cos(geo.theta)*cos(geo.psi)+cos(geo.alpha)*sin(geo.psi))*auxPoint.x+ + (-sin(geo.alpha)*cos(geo.theta)*sin(geo.psi)+cos(geo.alpha)*cos(geo.psi))*auxPoint.y+ + sin(geo.alpha)*sin(geo.theta)*auxPoint.z; + + point->z=-sin(geo.theta)*cos(geo.psi)*auxPoint.x+ + sin(geo.theta)*sin(geo.psi)*auxPoint.y+ + cos(geo.theta)*auxPoint.z; + + +} +//______________________________________________________________________________ +// +// Function: freeGeoArray +// +// Description: Frees the memory from the geometry array for multiGPU. 
+//______________________________________________________________________________ +void freeGeoArray(unsigned int splits,Geometry* geoArray){ + for(unsigned int sp=0;sp -#include -#include +#include +#include #include "ray_interpolated_projection_parallel.hpp" #include "TIGRE_common.hpp" #include #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",hipGetErrorString(__err));\ } \ } while (0) @@ -96,7 +97,7 @@ do { \ * * **/ -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); +void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream); __constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device @@ -104,7 +105,7 @@ __constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on __global__ void kernelPixelDetector_parallel_interpolated( Geometry geo, float* detector, - const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) + const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { // Point3D source , // Point3D deltaU, @@ -199,23 +200,23 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul size_t num_bytes = geo.nDetecU*geo.nDetecV *PROJ_PER_BLOCK* sizeof(float); float** dProjection=(float **)malloc(2*sizeof(float *)); for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); + hipMalloc((void**)&dProjection[i], num_bytes); + 
cudaCheckErrors("hipMalloc projections fail"); } // allocate streams for memory and compute int nStreams=2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + hipTextureObject_t *texImg = 0; + hipArray **d_cuArrTex = 0; + texImg =(hipTextureObject_t*)malloc(1*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(1*sizeof(hipArray*)); CreateTextureParallelInterp(img,geo,&d_cuArrTex[0], &texImg[0],stream); cudaCheckErrors("Texture allocation fail"); @@ -226,9 +227,9 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul Point3D source, deltaU, deltaV, uvOrigin; Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); float* projFloatsArrayHost; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + hipHostMalloc((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); // 16x16 gave the best performance empirically // Funnily that makes it compatible with most GPUs..... 
@@ -266,39 +267,39 @@ int interpolation_projection_parallel(float * img, Geometry geo, float** resul projFloatsArrayHost[2*j+1]=floor(maxdist); } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDev), projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projFloatsArrayDev), projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelDetector_parallel_interpolated<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); // copy result to host if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, hipMemcpyDeviceToHost,stream[1]); } - cudaDeviceSynchronize(); + hipDeviceSynchronize(); int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + hipMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), hipMemcpyDeviceToHost,stream[1]); - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); + hipDestroyTextureObject(texImg[0]); + hipFreeArray(d_cuArrTex[0]); free(texImg); texImg = 0; free(d_cuArrTex); d_cuArrTex = 0; cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); + hipFree(dProjection[0]); + hipFree(dProjection[1]); free(dProjection); - cudaFreeHost(projParamsArrayHost); - 
cudaFreeHost(projFloatsArrayHost); + hipHostFree(projParamsArrayHost); + hipHostFree(projFloatsArrayHost); - cudaCheckErrors("cudaFree d_imagedata fail"); + cudaCheckErrors("hipFree d_imagedata fail"); for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); } -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } @@ -410,40 +411,40 @@ void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* u *source=S; } -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; +void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex, hipTextureObject_t *texImage,hipStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + const hipExtent extent = make_hipExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[1]); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = 
cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); } \ No newline at end of file diff --git a/Common/CUDA/ray_interpolated_projection_parallel.cu.prehip b/Common/CUDA/ray_interpolated_projection_parallel.cu.prehip new file mode 100644 index 00000000..4aad5d6f --- /dev/null +++ b/Common/CUDA/ray_interpolated_projection_parallel.cu.prehip @@ -0,0 +1,449 @@ +/*------------------------------------------------------------------------- + * + * CUDA functions for texture-memory interpolation based projection + * + * This file has the necessary functions to perform X-ray parallel projection + * operation given a geaometry, angles and image. It uses the 3D texture + * memory linear interpolation to uniformily sample a path to integrate the + * X-rays. 
+ * + * CODE by Ander Biguri + * Sepideh Hatamikia (arbitrary rotation) + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + + + +#include +#include +#include +#include "ray_interpolated_projection_parallel.hpp" +#include "TIGRE_common.hpp" +#include + +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",cudaGetErrorString(__err));\ + } \ +} while (0) + + + +#define MAXTREADS 1024 +#define PROJ_PER_BLOCK 8 +#define PIXEL_SIZE_BLOCK 8 +/*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * --->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ +void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); +__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device +__constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device + + + +__global__ void kernelPixelDetector_parallel_interpolated( Geometry geo, + float* detector, + const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +{ +// Point3D source , +// Point3D deltaU, +// Point3D deltaV, +// Point3D uvOrigin, +// float DSO, +// float maxdist){ + + unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; + unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long projNumber=threadIdx.z; + + if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) + return; + + int indAlpha = 
currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array + + +#if IS_FOR_MATLAB_TIGRE + size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#else + size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; +#endif + + if(indAlpha>=totalNoOfProjections) + return; + + Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaU = projParamsArrayDev[4*projNumber+1]; + Point3D deltaV = projParamsArrayDev[4*projNumber+2]; + Point3D source = projParamsArrayDev[4*projNumber+3]; + + float DSO = projFloatsArrayDev[2*projNumber+0]; + float maxdist = projFloatsArrayDev[2*projNumber+1]; + + + /////// Get coordinates XYZ of pixel UV + unsigned long pixelV = geo.nDetecV-v-1; + unsigned long pixelU = u; + + + float vectX,vectY,vectZ; + Point3D P; + P.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); + P.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); + P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); + Point3D S; + S.x=(source.x+pixelU*deltaU.x+pixelV*deltaV.x); + S.y=(source.y+pixelU*deltaU.y+pixelV*deltaV.y); + S.z=(source.z+pixelU*deltaU.z+pixelV*deltaV.z); + + // Length is the ray length in normalized space + double length=sqrtf((S.x-P.x)*(S.x-P.x)+(S.y-P.y)*(S.y-P.y)+(S.z-P.z)*(S.z-P.z)); + //now legth is an integer of Nsamples that are required on this line + length=ceilf(length/geo.accuracy);//Divide the directional vector by an integer + vectX=(P.x -S.x)/(length); + vectY=(P.y -S.y)/(length); + vectZ=(P.z -S.z)/(length); + + +// //Integrate over the line + float tx,ty,tz; + float sum=0; + float i; + + + // limit the amount of mem access after the cube, but before the detector. 
+ if ((2*DSO/geo.dVoxelX+maxdist)/geo.accuracy < length) + length=ceilf((2*DSO/geo.dVoxelX+maxdist)/geo.accuracy); + //Length is not actually a length, but the amount of memreads with given accuracy ("samples per voxel") + + for (i=floorf(maxdist/geo.accuracy); i<=length; i=i+1){ + tx=vectX*i+S.x; + ty=vectY*i+S.y; + tz=vectZ*i+S.z; + + sum += tex3D(tex, tx+0.5f, ty+0.5f, tz+0.5f); // this line is 94% of time. + + } + float deltalength=sqrtf((vectX*geo.dVoxelX)*(vectX*geo.dVoxelX)+ + (vectY*geo.dVoxelY)*(vectY*geo.dVoxelY)+ + (vectZ*geo.dVoxelZ)*(vectZ*geo.dVoxelZ) ); + detector[idx]=sum*deltalength; +} + + + +int interpolation_projection_parallel(float * img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ + + + + size_t num_bytes = geo.nDetecU*geo.nDetecV *PROJ_PER_BLOCK* sizeof(float); + float** dProjection=(float **)malloc(2*sizeof(float *)); + for (int i = 0; i < 2; ++i){ + cudaMalloc((void**)&dProjection[i], num_bytes); + cudaCheckErrors("cudaMalloc projections fail"); + } + // allocate streams for memory and compute + int nStreams=2; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + + for (int i = 0; i < 2; ++i){ + cudaStreamCreate(&stream[i]); + } + + + // Texture object variables + cudaTextureObject_t *texImg = 0; + cudaArray **d_cuArrTex = 0; + texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); + d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); + + CreateTextureParallelInterp(img,geo,&d_cuArrTex[0], &texImg[0],stream); + cudaCheckErrors("Texture allocation fail"); + //Done! Image put into texture memory. 
+ + + + Point3D source, deltaU, deltaV, uvOrigin; + + Point3D* projParamsArrayHost; + cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); + float* projFloatsArrayHost; + cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); + + // 16x16 gave the best performance empirically + // Funnily that makes it compatible with most GPUs..... + int divU,divV,divangle; + divU=PIXEL_SIZE_BLOCK; + divV=PIXEL_SIZE_BLOCK; + + dim3 numBlocks((geo.nDetecU+divU-1)/divU,(geo.nDetecV+divV-1)/divV,1); + dim3 threadsPerBlock(divU,divV,PROJ_PER_BLOCK); + unsigned int proj_global; + unsigned int noOfKernelCalls = (nangles+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK + unsigned int i; + + float maxdist; + for ( i=0; i=nangles) + break; + + geo.alpha=angles[proj_global*3]; + geo.theta=angles[proj_global*3+1]; + geo.psi =angles[proj_global*3+2]; + //precomute distances for faster execution + maxdist=maxdistanceCuboid(geo,proj_global); + //Precompute per angle constant stuff for speed + computeDeltas_parallel(geo,geo.alpha,proj_global, &uvOrigin, &deltaU, &deltaV, &source); + //Ray tracing! 
+ projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection + projParamsArrayHost[4*j+1]=deltaU; + projParamsArrayHost[4*j+2]=deltaV; + projParamsArrayHost[4*j+3]=source; + + projFloatsArrayHost[2*j]=geo.DSO[proj_global]; + projFloatsArrayHost[2*j+1]=floor(maxdist); + + } + cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); + cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); + cudaStreamSynchronize(stream[0]); + + kernelPixelDetector_parallel_interpolated<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); + // copy result to host + if (i>0) + cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); + } + cudaDeviceSynchronize(); + + int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; + cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); + + + cudaDestroyTextureObject(texImg[0]); + cudaFreeArray(d_cuArrTex[0]); + free(texImg); texImg = 0; + free(d_cuArrTex); d_cuArrTex = 0; + cudaCheckErrors("Unbind fail"); + cudaFree(dProjection[0]); + cudaFree(dProjection[1]); + free(dProjection); + cudaFreeHost(projParamsArrayHost); + cudaFreeHost(projFloatsArrayHost); + + cudaCheckErrors("cudaFree d_imagedata fail"); + + + for (int i = 0; i < 2; ++i){ + cudaStreamDestroy(stream[i]); + } +// cudaDeviceReset(); + + return 0; +} + + + + +/* This code precomputes The location of the source and the Delta U and delta V (in the warped space) + * to compute the locations of the x-rays. While it seems verbose and overly-optimized, + * it does saves about 30% of each of the kernel calls. Thats something! 
+ **/ +void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source){ + Point3D S; + S.x=geo.DSO[i]; + S.y=geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); + S.z=geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + + //End point + Point3D P,Pu0,Pv0; + + P.x =-(geo.DSD[i]-geo.DSO[i]); P.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); P.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + Pu0.x=-(geo.DSD[i]-geo.DSO[i]); Pu0.y= geo.dDetecU*(1-((float)geo.nDetecU/2)+0.5); Pu0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); + Pv0.x=-(geo.DSD[i]-geo.DSO[i]); Pv0.y= geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); Pv0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-1); + // Geometric trasnformations: + P.x=0;Pu0.x=0;Pv0.x=0; + + // Roll pitch yaw + rollPitchYaw(geo,i,&P); + rollPitchYaw(geo,i,&Pu0); + rollPitchYaw(geo,i,&Pv0); + //Now lets translate the points where they should be: + P.x=P.x-(geo.DSD[i]-geo.DSO[i]); + Pu0.x=Pu0.x-(geo.DSD[i]-geo.DSO[i]); + Pv0.x=Pv0.x-(geo.DSD[i]-geo.DSO[i]); + + S.x=0; + // Roll pitch yaw + rollPitchYaw(geo,i,&S); + //Now lets translate the points where they should be: + S.x=S.x+geo.DSO[i]; + + + //1: Offset detector + + //P.x + P.y =P.y +geo.offDetecU[i]; P.z =P.z +geo.offDetecV[i]; + Pu0.y=Pu0.y+geo.offDetecU[i]; Pu0.z=Pu0.z+geo.offDetecV[i]; + Pv0.y=Pv0.y+geo.offDetecU[i]; Pv0.z=Pv0.z+geo.offDetecV[i]; + //S doesnt need to chagne + + + //3: Rotate (around z)! 
+ Point3D Pfinal, Pfinalu0, Pfinalv0; + Pfinal.x =P.x; + Pfinal.y =P.y +geo.offDetecU[i]; Pfinal.z =P.z +geo.offDetecV[i]; + Pfinalu0.x=Pu0.x; + Pfinalu0.y=Pu0.y +geo.offDetecU[i]; Pfinalu0.z =Pu0.z +geo.offDetecV[i]; + Pfinalv0.x=Pv0.x; + Pfinalv0.y=Pv0.y +geo.offDetecU[i]; Pfinalv0.z =Pv0.z +geo.offDetecV[i]; + + eulerZYZ(geo,&Pfinal); + eulerZYZ(geo,&Pfinalu0); + eulerZYZ(geo,&Pfinalv0); + eulerZYZ(geo,&S); + + + + //2: Offset image (instead of offseting image, -offset everything else) + + Pfinal.x =Pfinal.x-geo.offOrigX[i]; Pfinal.y =Pfinal.y-geo.offOrigY[i]; Pfinal.z =Pfinal.z-geo.offOrigZ[i]; + Pfinalu0.x=Pfinalu0.x-geo.offOrigX[i]; Pfinalu0.y=Pfinalu0.y-geo.offOrigY[i]; Pfinalu0.z=Pfinalu0.z-geo.offOrigZ[i]; + Pfinalv0.x=Pfinalv0.x-geo.offOrigX[i]; Pfinalv0.y=Pfinalv0.y-geo.offOrigY[i]; Pfinalv0.z=Pfinalv0.z-geo.offOrigZ[i]; + S.x=S.x-geo.offOrigX[i]; S.y=S.y-geo.offOrigY[i]; S.z=S.z-geo.offOrigZ[i]; + + // As we want the (0,0,0) to be in a corner of the image, we need to translate everything (after rotation); + Pfinal.x =Pfinal.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinal.y =Pfinal.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinal.z =Pfinal.z +geo.sVoxelZ/2-geo.dVoxelZ/2; + Pfinalu0.x=Pfinalu0.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinalu0.y=Pfinalu0.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinalu0.z=Pfinalu0.z+geo.sVoxelZ/2-geo.dVoxelZ/2; + Pfinalv0.x=Pfinalv0.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinalv0.y=Pfinalv0.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinalv0.z=Pfinalv0.z+geo.sVoxelZ/2-geo.dVoxelZ/2; + S.x =S.x+geo.sVoxelX/2-geo.dVoxelX/2; S.y =S.y+geo.sVoxelY/2-geo.dVoxelY/2; S.z =S.z +geo.sVoxelZ/2-geo.dVoxelZ/2; + + //4. 
Scale everything so dVoxel==1 + Pfinal.x =Pfinal.x/geo.dVoxelX; Pfinal.y =Pfinal.y/geo.dVoxelY; Pfinal.z =Pfinal.z/geo.dVoxelZ; + Pfinalu0.x=Pfinalu0.x/geo.dVoxelX; Pfinalu0.y=Pfinalu0.y/geo.dVoxelY; Pfinalu0.z=Pfinalu0.z/geo.dVoxelZ; + Pfinalv0.x=Pfinalv0.x/geo.dVoxelX; Pfinalv0.y=Pfinalv0.y/geo.dVoxelY; Pfinalv0.z=Pfinalv0.z/geo.dVoxelZ; + S.x =S.x/geo.dVoxelX; S.y =S.y/geo.dVoxelY; S.z =S.z/geo.dVoxelZ; + + + + //5. apply COR. Wherever everything was, now its offesetd by a bit + float CORx, CORy; + CORx=-geo.COR[i]*sin(geo.alpha)/geo.dVoxelX; + CORy= geo.COR[i]*cos(geo.alpha)/geo.dVoxelY; + Pfinal.x+=CORx; Pfinal.y+=CORy; + Pfinalu0.x+=CORx; Pfinalu0.y+=CORy; + Pfinalv0.x+=CORx; Pfinalv0.y+=CORy; + S.x+=CORx; S.y+=CORy; + + // return + + *uvorigin=Pfinal; + + deltaU->x=Pfinalu0.x-Pfinal.x; + deltaU->y=Pfinalu0.y-Pfinal.y; + deltaU->z=Pfinalu0.z-Pfinal.z; + + deltaV->x=Pfinalv0.x-Pfinal.x; + deltaV->y=Pfinalv0.y-Pfinal.y; + deltaV->z=Pfinalv0.z-Pfinal.z; + + *source=S; +} +void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; + + + const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + + //cudaArray Descriptor + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + + + cudaMemcpy3DParms copyParams = {0}; + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[0]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params,stream[1]); + + + //Array creation End + + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[0]; + cudaTextureDesc texDescr; + memset(&texDescr, 
0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + +} \ No newline at end of file diff --git a/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip b/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip new file mode 100644 index 00000000..1280b6ed --- /dev/null +++ b/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip @@ -0,0 +1,65 @@ +/*------------------------------------------------------------------------- + * + * Header CUDA functions for texture-memory interpolation based projection + * + * + * CODE by Ander Biguri + * Sepideh Hatamikia (arbitrary rotation) +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + +#include "ray_interpolated_projection.hpp" + +#include "types_TIGRE.hpp" +#include "GpuIds.hpp" + +#ifndef PROJECTION_PARALLEL_HPP +#define PROJECTION_PARALLEL_HPP + +int interpolation_projection_parallel(float* img, Geometry geo, float** result,float const * const alphas,int nalpha, const GpuIds& gpuids); +// float computeMaxLength(Geometry geo, float alpha); +void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source); + +// float maxDistanceCubeXY(Geometry geo, float alpha,int i); + +// below, not used +Geometry nomralizeGeometryImage(Geometry geo); +#endif \ No newline at end of file diff --git a/Common/CUDA/tv_proximal.cu b/Common/CUDA/tv_proximal.cu index 32ae99c2..87d5407f 100644 --- a/Common/CUDA/tv_proximal.cu +++ b/Common/CUDA/tv_proximal.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * MATLAB MEX 
functions for TV image denoising. Check inputs and parses @@ -57,17 +58,17 @@ #include "tv_proximal.hpp" #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - cudaDeviceReset();\ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ + hipDeviceReset();\ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising",hipGetErrorString(__err));\ } \ } while (0) void cpy_from_host(float* device_array,float* host_array, unsigned long long bytes_device,unsigned long long offset_device,unsigned long long offset_host, unsigned long long pixels_per_slice, unsigned int buffer_length, - cudaStream_t stream, bool is_first_chunk, bool is_last_chunk,const long* image_size); + hipStream_t stream, bool is_first_chunk, bool is_last_chunk,const long* image_size); __global__ void multiplyArrayScalar(float* vec,float scalar,const size_t n) @@ -263,11 +264,11 @@ void cpy_from_host(float* device_array,float* host_array, // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif if (isHostRegisterSupported & splits>1){ - cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + hipHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); + hipHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -282,21 +283,21 @@ void cpy_from_host(float* device_array,float* host_array, if (buffer_length0){ // U - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer_u, d_u[dev-1] +slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); + hipSetDevice(gpuids[dev-1]); + hipMemcpyAsync(buffer_u, d_u[dev-1] +slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); + hipMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); + 
hipMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); + hipMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), hipMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+1]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+2]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+3]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+4]); - cudaMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+4]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+1]); + hipMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+1]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+2]); + hipMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+2]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+3]); + hipMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+3]); + hipStreamSynchronize(stream[(dev-1)*nStream_device+4]); + hipMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), hipMemcpyHostToDevice,stream[(dev)*nStream_device+4]); } @@ -567,22 +568,22 @@ void cpy_from_host(float* device_array,float* host_array, }else{ // Vopy all the U variable into the host. 
for(dev=0; dev1 && buffer_length1){ - cudaHostUnregister(src); - cudaHostUnregister(dst); + hipHostUnregister(src); + hipHostUnregister(dst); } for(dev=0; dev= 0 ) { + _div += (pz[idx] - pz[(z-1)*size2d + y*cols + x]) / dz; + } else { + _div += pz[idx]; + } + + if ( y - 1 >= 0 ) { + _div += (py[idx] - py[z*size2d + (y-1)*cols + x]) / dy; + } else { + _div += py[idx]; + } + + if ( x - 1 >= 0 ) { + _div += (px[idx] - px[z*size2d + y*cols + (x-1)]) / dx; + } else { + _div += px[idx]; + } + + return _div; + } + + __device__ __inline__ + void gradient(const float* u, float* grad, + long z, long y, long x, + long depth, long rows, long cols, + float dz, float dy, float dx) + { + long size2d = rows*cols; + long idx = z * size2d + y * cols + x; + + float uidx = u[idx]; + + if ( z + 1 < depth ) { + grad[0] = (u[(z+1)*size2d + y*cols + x] - uidx) / dz; + } + + if ( y + 1 < rows ) { + grad[1] = (u[z*size2d + (y+1)*cols + x] - uidx) / dy; + } + + if ( x + 1 < cols ) { + grad[2] = (u[z*size2d + y*cols + (x+1)] - uidx) / dx; + } + } + + + __global__ + void update_u(const float* f, const float* pz, const float* py, const float* px, float* u, + float tau, float lambda, + long depth, long rows, long cols, + float dz, float dy, float dx) + { + long x = threadIdx.x + blockIdx.x * blockDim.x; + long y = threadIdx.y + blockIdx.y * blockDim.y; + long z = threadIdx.z + blockIdx.z * blockDim.z; + long idx = z * rows * cols + y * cols + x; + + if ( x >= cols || y >= rows || z >= depth ) + return; + + float _div = divergence(pz, py, px, z, y, x, depth, rows, cols, dz, dy, dx); + + u[idx] = u[idx] * (1.0f - tau) + tau * (f[idx] + (1.0f/lambda) * _div); + } + + + __global__ + void update_p(const float* u, float* pz, float* py, float* px, + float tau, long depth, long rows, long cols, + float dz, float dy, float dx) + { + long x = threadIdx.x + blockIdx.x * blockDim.x; + long y = threadIdx.y + blockIdx.y * blockDim.y; + long z = threadIdx.z + blockIdx.z * blockDim.z; + long idx = z * rows * 
cols + y * cols + x; + + if ( x >= cols || y >= rows || z >= depth ) + return; + + float grad[3] = {0,0,0}, q[3]; + gradient(u, grad, z, y, x, depth, rows, cols, dz, dy, dx); + + q[0] = pz[idx] + tau * grad[0]; + q[1] = py[idx] + tau * grad[1]; + q[2] = px[idx] + tau * grad[2]; + + float norm = fmaxf(1.0f, sqrtf(q[0] * q[0] + q[1] * q[1] + q[2] * q[2])); + + pz[idx] = q[0] / norm; + py[idx] = q[1] / norm; + px[idx] = q[2] / norm; + } + + +// Main function + void tvdenoising(float* src, float* dst, float lambda, + const float* spacing, const long* image_size, int maxIter, const GpuIds& gpuids) { + + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("tvDenoise:tvdenoising:GPUselect","There are no available device(s) that support CUDA\n"); + } + // + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("tvDenoise:tvdenoising:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + int dev; + + // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. 
+ + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + + + // %5 of free memory should be enough, we have almost no variables in these kernels + size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; + const size_t pixels_per_slice = image_size[0] * image_size[1] ; + const size_t mem_slice_image = sizeof(float)* pixels_per_slice ; + const size_t mem_size_image = sizeof(float)* total_pixels; + + // Decide how are we handling the distribution of computation + size_t mem_img_each_GPU; + + unsigned int buffer_length=1; + //Does everything fit in the GPU? + unsigned int slices_per_split; + unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. + if(mem_GPU_global> 5*mem_size_image+5*mem_slice_image*buffer_length*2){ + // We only need to split if we have extra GPUs + slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; + mem_img_each_GPU=mem_slice_image*( (image_size[2]+deviceCount-1)/deviceCount + buffer_length*2); + }else{ + // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all + size_t mem_free=mem_GPU_global; + + splits=(unsigned int)(ceil(((float)(5*mem_size_image)/(float)(deviceCount))/mem_free)); + // Now, there is an overhead here, as each splits should have 2 slices more, to accoutn for overlap of images. + // lets make sure these 2 slices fit, if they do not, add 1 to splits. + slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + + // if the new stuff does not fit in the GPU, it measn we are in the edge case where adding that extra slice will overflow memory + if (mem_GPU_global< 5*mem_img_each_GPU){ + // one more split should do the job, as its an edge case. 
+ splits++; + //recompute for later + slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + } + + // How many EXTRA buffer slices should be able to fit in here??!?! + mem_free=mem_GPU_global-(5*mem_img_each_GPU); + unsigned int extra_buff=(mem_free/mem_slice_image); + buffer_length=(extra_buff/2)/5; // we need double whatever this results in, rounded down. + + buffer_length=min(MAX_BUFFER,buffer_length); + + mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); + + // Assert + if (mem_GPU_global< 5*mem_img_each_GPU){ + mexErrMsgIdAndTxt("tvDenoise:tvdenoising:GPU","Bad assert. Logic behind splitting flawed! Please tell: ander.biguri@gmail.com\n"); + } + } + + + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + if (isHostRegisterSupported & splits>1){ + cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); + + + + // Lets allocate auxiliary variables. + float* buffer_u, *buffer_px, *buffer_py, *buffer_pz; + float* h_px, *h_py, *h_pz, *h_u; + if(splits>1){ + + //These take A LOT of memory and A LOT of time to use. If we can avoid using them, better. 
+ if (buffer_length1 & i>0){ + + for (dev = 0; dev < deviceCount; dev++){ + is_last_chunk=!((sp*deviceCount+dev)>>(d_pz[dev], -1, pixels_per_slice*buffer_length); + } + if (is_last_chunk){ + multiplyArrayScalar<<<60,MAXTREADS,0,stream[dev*nStream_device+4]>>>(d_pz[dev]+bytes_device[dev],-1, pixels_per_slice*buffer_length); + } + } + for (dev = 0; dev < deviceCount; dev++){ + is_last_chunk=!((sp*deviceCount+dev)>>(d_src[dev], d_pz[dev], d_py[dev], d_px[dev], d_u[dev], tau1, lambda, + (long)(curr_slices+buffer_length*2), image_size[1],image_size[0], + spacing[2], spacing[1], spacing[0]); + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_u[dev], d_pz[dev], d_py[dev], d_px[dev], tau2, + (long)(curr_slices+buffer_length*2), image_size[1], image_size[0], + spacing[2], spacing[1], spacing[0]); + } + }// END internal iter + + // Synchronize mathematics, make sure bounding pixels are correct + for(dev=0; dev0){ + // U + cudaSetDevice(gpuids[dev-1]); + cudaMemcpyAsync(buffer_u, d_u[dev-1] +slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); + cudaMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); + cudaMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); + cudaMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); + + + cudaSetDevice(gpuids[dev]); + cudaStreamSynchronize(stream[(dev-1)*nStream_device+1]); + cudaMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+1]); + 
cudaStreamSynchronize(stream[(dev-1)*nStream_device+2]); + cudaMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+2]); + cudaStreamSynchronize(stream[(dev-1)*nStream_device+3]); + cudaMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+3]); + cudaStreamSynchronize(stream[(dev-1)*nStream_device+4]); + cudaMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+4]); + + + } + } + // This is the case when we can't solely use GPU memory, as the total size of the images+variables exceeds total amounf of memory among GPUs. + // This situation requires partial results and full memory allocation in the host. + }else{ + // Vopy all the U variable into the host. + for(dev=0; dev1 && buffer_length1){ + cudaHostUnregister(src); + cudaHostUnregister(dst); + } + for(dev=0; dev Origin is at (0,0,0). Image center is there +offOrig + // -> at angle 0, source + image centre (without the offset) + detector centre (without offset) + // are aligned in the Y_Z plane. + // -> detector is orthonormal to projection plane. + + //Parameters part of the image geometry + int nVoxelX, nVoxelY, nVoxelZ; + float sVoxelX, sVoxelY, sVoxelZ; + float dVoxelX, dVoxelY, dVoxelZ; + float *offOrigX,*offOrigY,*offOrigZ; + float* DSO; + // Parameters of the Detector. + int nDetecU, nDetecV; + float sDetecU, sDetecV; + float dDetecU, dDetecV; + float *offDetecU, *offDetecV; + float* DSD; + float* dRoll; + float* dPitch; + float* dYaw; + // The base unit we are working with in mm. + float unitX; + float unitY; + float unitZ; + + //rotation angle for e uler (ZYZ) + float alpha; + float theta; + float psi; + // Centre of Rotation correction. 
+ float* COR; + //Maximum length of cube + float maxLength; + //User option + float accuracy; +}; + + struct Point3D{ + float x; + float y; + float z; +}; + +struct Point3Ddouble{ + double x; + double y; + double z; + + // cast to float member function for "copying" Point3Ddouble to Point3D + Point3D to_float() + { + Point3D castToFloat; + castToFloat.x = (float)x; + castToFloat.y = (float)y; + castToFloat.z = (float)z; + return(castToFloat); + } +}; + +#endif \ No newline at end of file diff --git a/Common/CUDA/voxel_backprojection.cu b/Common/CUDA/voxel_backprojection.cu index bec4d909..8fb9df3c 100644 --- a/Common/CUDA/voxel_backprojection.cu +++ b/Common/CUDA/voxel_backprojection.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*------------------------------------------------------------------------- * * CUDA function for backrpojection using FDK weigts for CBCT @@ -45,8 +46,8 @@ #define PI_2 1.57079632679489661923 #include -#include -#include +#include +#include #include "voxel_backprojection.hpp" #include "TIGRE_common.hpp" #include @@ -55,10 +56,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -91,7 +92,7 @@ do { \ * **/ - void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, int nStreamDevice,bool allocate); + void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, int nStreamDevice,bool allocate); 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -134,7 +135,7 @@ __constant__ float projSinCosArrayDev[5*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel //______________________________________________________________________________ -__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { // Old kernel call signature: @@ -323,16 +324,16 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
#ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } #endif cudaCheckErrors("Error pinning memory"); @@ -348,20 +349,20 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); float** dimage=(float**)malloc(deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); + hipSetDevice(gpuids[dev]); + hipMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("hipMalloc fail"); } //If it is the first time, lets make sure our image is zeroed. 
int nStreamDevice=2; int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); + hipStreamCreate(&stream[i+dev*nStreamDevice]); } } @@ -371,16 +372,16 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // Kernel auxiliary variables Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArrayHost; - cudaMallocHost((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + texProj =(hipTextureObject_t*)malloc(deviceCount*2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(deviceCount*2*sizeof(hipArray*)); // Auxiliary Host page-locked memory for fast and asycnornous memcpy. 
@@ -401,8 +402,8 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa for(unsigned int img_slice=0;img_slice>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); } // END for @@ -551,8 +552,8 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa } // END sub-split of current projection chunk for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } } // END projection splits @@ -560,15 +561,15 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa // Now we need to take the image out of the GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); // We do not need to sycnronize because the array dealocators already do. num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + hipMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, hipMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); } for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); cudaCheckErrors("Main loop fail"); } @@ -582,38 +583,38 @@ int voxel_backprojection(float * projections, Geometry geo, float* result,floa if (!two_buffers_used && i==1) break; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - 
cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texProj[i*deviceCount+dev]); + hipFreeArray(d_cuArrTex[i*deviceCount+dev]); } } cudaCheckErrors("cudadestroy textures result fail"); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); + hipSetDevice(gpuids[dev]); + hipFree(dimage[dev]); } - cudaFreeHost(projSinCosArrayHost); - cudaFreeHost(projParamsArrayHost); + hipHostFree(projSinCosArrayHost); + hipHostFree(projParamsArrayHost); free(partial_projection); free(proj_split_size); freeGeoArray(split_image*deviceCount,geoArray); #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostUnregister(result); + hipHostUnregister(result); } if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } #endif for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); - //cudaDeviceReset(); // For the Nvidia Visual Profiler + //hipDeviceReset(); // For the Nvidia Visual Profiler return 0; } // END voxel_backprojection @@ -664,52 +665,52 @@ void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsign } -void CreateTexture(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ +void CreateTexture(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent 
=make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif const unsigned int num_devices = gpuids.GetLength(); if (allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); } //Array creation End for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - 
texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } } @@ -903,8 +904,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ const int deviceCount = gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree +#include +#include +#include "voxel_backprojection.hpp" +#include "TIGRE_common.hpp" +#include +#include "GpuIds.hpp" + +// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + } \ +} while (0) + + +#define MAXTREADS 1024 + /*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * *--->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ + + void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, int nStreamDevice,bool allocate); + + 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// The optimal values of two constants obtained by RB on NVIDIA Quadro K2200 (4 GB RAM, 640 CUDA cores) for 512^3 volume and 512^3 projections (512 proj, each 512 x 512) were: +// PROJ_PER_KERNEL = 32 or 16 (very similar times) +// VOXELS_PER_THREAD = 8 +// Speedup of the entire FDK backprojection (not only kernel run, also memcpy etc.) was nearly 4x relative to the original (single projection, single voxel per thread) code. +// (e.g. 16.2 s vs. ~62 s). + +const int PROJ_PER_KERNEL = 32; // Number of 2D projections to be analyzed by a single thread. This can be tweaked to see what works best. 32 was the optimal value in the paper by Zinsser and Keck. +const int VOXELS_PER_THREAD = 8; // Number of voxels to be computed by s single thread. Can be tweaked to see what works best. 4 was the optimal value in the paper by Zinsser and Keck. + +// We have PROJ_PER_KERNEL projections and we need 6 parameters for each projection: +// deltaX, deltaY, deltaZ, xyzOrigin, offOrig, offDetec +// So we need to keep PROJ_PER_KERNEL*6 values in our deltas array FOR EACH CALL to our main kernel +// (they will be updated in the main loop before each kernel call). 
+ +__constant__ Point3D projParamsArrayDev[6*PROJ_PER_KERNEL]; // Dev means it is on device + +// We also need a corresponding array on the host side to be filled before each kernel call, then copied to the device (array in constant memory above) +// Point3D projParamsArrayHost[6*PROJ_PER_KERNEL]; // Host means it is host memory + +// Now we also need to store sinAlpha and cosAlpha for each projection (two floats per projection) +__constant__ float projSinCosArrayDev[5*PROJ_PER_KERNEL]; + + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// END RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + +//______________________________________________________________________________ +// +// Function: kernelPixelBackprojectionFDK +// +// Description: Main FDK backprojection kernel +//______________________________________________________________________________ + +__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +{ + + // Old kernel call signature: + // kernelPixelBackprojectionFDK<<>>(geo,dimage,i,deltaX,deltaY,deltaZ,xyzOrigin,offOrig,offDetec,sinalpha,cosalpha); + // We just read in most of the params from the constant memory instead of getting them from the param list. + // This is because we now have MANY params, since single kernel processes more than one projection! 
+ /* __global__ void kernelPixelBackprojectionFDK(const Geometry geo, + * float* image, + * const int indAlpha, + * const Point3D deltaX , + * const Point3D deltaY, + * const Point3D deltaZ, + * const Point3D xyzOrigin, + * const Point3D xyzOffset, + * const Point3D uv0Offset, + * const float sinalpha, + * const float cosalpha){ + */ + unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long indX = blockIdx.x * blockDim.x + threadIdx.x; + // unsigned long startIndZ = blockIdx.z * blockDim.z + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle + unsigned long long startIndZ = blockIdx.z * VOXELS_PER_THREAD + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle + //Make sure we don't go out of bounds + if (indX>=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) + return; + + // We'll keep a local auxiliary array of values of a column of voxels that this thread will update + float voxelColumn[VOXELS_PER_THREAD]; + + // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then + // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes + + unsigned long colIdx; +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. 
+ } // END copy 3D volume voxels to local array + + // Now iterate through projections +#pragma unroll + for(unsigned long projNumber=0; projNumber=totalNoOfProjections) + break; + + Point3D deltaX = projParamsArrayDev[6*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaY = projParamsArrayDev[6*projNumber+1]; + Point3D deltaZ = projParamsArrayDev[6*projNumber+2]; + Point3D xyzOrigin = projParamsArrayDev[6*projNumber+3]; + Point3D xyzOffset = projParamsArrayDev[6*projNumber+4]; + Point3D S = projParamsArrayDev[6*projNumber+5]; + + float sinalpha = projSinCosArrayDev[5*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection + float cosalpha = projSinCosArrayDev[5*projNumber+1]; + float COR = projSinCosArrayDev[5*projNumber+2]; + float DSD = projSinCosArrayDev[5*projNumber+3]; + float DSO = projSinCosArrayDev[5*projNumber+4]; + + float auxCOR=COR/geo.dDetecU; + // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. + Point3D P; + P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); + P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-auxCOR; + P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); + + // This is the vector defining the line from the source to the Voxel + float vectX,vectY,vectZ; + vectX=(P.x -S.x); + vectY=(P.y -S.y); + vectZ=(P.z -S.z); + + // Get the coordinates in the detector UV where the mid point of the voxel is projected. 
+ float t=__fdividef(DSO-DSD-S.x,vectX); + float y,z; + y=vectY*t+S.y; + z=vectZ*t+S.z; + float u,v; + u=y+(float)geo.nDetecU*0.5f; + v=z+(float)geo.nDetecV*0.5f; + + float weight; + float realx,realy; + realx=-(geo.sVoxelX-geo.dVoxelX)*0.5f +indX*geo.dVoxelX +xyzOffset.x; + realy=-(geo.sVoxelY-geo.dVoxelY)*0.5f +indY*geo.dVoxelY +xyzOffset.y+COR; + + weight=__fdividef(DSO+realy*sinalpha-realx*cosalpha,DSO); + + weight=__frcp_rd(weight*weight); + + // Get Value in the computed (U,V) and multiply by the corresponding weight. + // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) + +#if IS_FOR_MATLAB_TIGRE + voxelColumn[colIdx]+=tex3D(tex, v, u ,indAlpha+0.5f)*weight; +#else + voxelColumn[colIdx]+=tex3D(tex, u, v ,indAlpha+0.5f)*weight; +#endif + } // END iterating through column of voxels + + } // END iterating through multiple projections + + // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. + // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. + // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is + // better for avoiding memory congestion. 
+ } // END copy updated voxels from local array to our 3D volume + +} // END kernelPixelBackprojectionFDK + + + + +//______________________________________________________________________________ +// +// Function: voxel_backprojection +// +// Description: Main host function for FDK backprojection (invokes the kernel) +//______________________________________________________________________________ + +int voxel_backprojection(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) +{ + // printf("voxel_backprojection(geo.nDetector = %d, %d)\n", geo.nDetecU, geo.nDetecV); + // printf("geo.nVoxel = %d, %d, %d\n", geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); + + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","There are no available device(s) that support CUDA\n"); + } + + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + + int dev; + // Split the CT problem + unsigned int split_image; + unsigned int split_projections; + splitCTbackprojection(gpuids,geo,nalpha,&split_image,&split_projections); + + + cudaCheckErrors("Error"); + //Pagelock memory for synchronous copy. + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to + // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ + cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + } + if (isHostRegisterSupported ){ + cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + } +#endif + cudaCheckErrors("Error pinning memory"); + + + // Create the arrays for the geometry. The main difference is that geo.offZ has been tuned for the + // image slices. The rest of the Geometry is the same + Geometry* geoArray=(Geometry*)malloc(split_image*deviceCount*sizeof(Geometry)); + createGeoArray(split_image*deviceCount,geo,geoArray,nalpha); + + // Now lest allocate all the image memory on the GPU, so we can use it later. If we have made our numbers correctly + // in the previous section this should leave enough space for the textures. + size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); + float** dimage=(float**)malloc(deviceCount*sizeof(float*)); + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("cudaMalloc fail"); + } + + //If it is the first time, lets make sure our image is zeroed. 
+ int nStreamDevice=2; + int nStreams=deviceCount*nStreamDevice; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < nStreamDevice; ++i){ + cudaStreamCreate(&stream[i+dev*nStreamDevice]); + + } + } + + + + + // Kernel auxiliary variables + Point3D* projParamsArrayHost; + cudaMallocHost((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); + float* projSinCosArrayHost; + cudaMallocHost((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); + + + // Texture object variables + cudaTextureObject_t *texProj; + cudaArray **d_cuArrTex; + texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); + d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + + // Auxiliary Host page-locked memory for fast and asycnornous memcpy. + + // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop + // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution + // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, + // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. + unsigned long long proj_linear_idx_start; + unsigned int proj_split_overlap_number; + unsigned int current_proj_split_size,current_proj_overlap_split_size; + size_t num_bytes_img_curr; + size_t img_linear_idx_start; + float** partial_projection; + size_t* proj_split_size; + + + + for(unsigned int img_slice=0;img_slice=proj_split_size[proj_block_split]) + break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. + if(currProjNumber_global>=nalpha) + break; // Exit the loop. 
Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. + + Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, /*offDetec,*/source; + float sinalpha,cosalpha; + + geoArray[img_slice*deviceCount+dev].alpha=-alphas[currProjNumber_global*3];//we got 3 angles now. + geoArray[img_slice*deviceCount+dev].theta=-alphas[currProjNumber_global*3+1]; + geoArray[img_slice*deviceCount+dev].psi =-alphas[currProjNumber_global*3+2]; + +// mexPrintf("%u %f \n",i,geoArray[img_slice*deviceCount+dev].alpha); +// mexPrintf("%u \n",currProjNumber_global); + + sinalpha=sin(geoArray[img_slice*deviceCount+dev].alpha); + cosalpha=cos(geoArray[img_slice*deviceCount+dev].alpha); + + projSinCosArrayHost[5*j]=sinalpha; // 2*j because we have 2 float (sin or cos angle) values per projection + projSinCosArrayHost[5*j+1]=cosalpha; + projSinCosArrayHost[5*j+2]=geo.COR[currProjNumber_global]; + projSinCosArrayHost[5*j+3]=geo.DSD[currProjNumber_global]; + projSinCosArrayHost[5*j+4]=geo.DSO[currProjNumber_global]; + + computeDeltasCube(geoArray[img_slice*deviceCount+dev],currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); + + offOrig.x=geo.offOrigX[currProjNumber_global]; + offOrig.y=geo.offOrigY[currProjNumber_global]; + offOrig.z=geoArray[img_slice*deviceCount+dev].offOrigZ[currProjNumber_global]; + + projParamsArrayHost[6*j]=deltaX; // 6*j because we have 6 Point3D values per projection + projParamsArrayHost[6*j+1]=deltaY; + projParamsArrayHost[6*j+2]=deltaZ; + projParamsArrayHost[6*j+3]=xyzOrigin; + projParamsArrayHost[6*j+4]=offOrig; + projParamsArrayHost[6*j+5]=source; + } // END for (preparing params for kernel call) + + // Copy the prepared parameter arrays to constant memory to make it available for the kernel + cudaMemcpyToSymbolAsync(projSinCosArrayDev, projSinCosArrayHost, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); + cudaMemcpyToSymbolAsync(projParamsArrayDev, 
projParamsArrayHost, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); + cudaStreamSynchronize(stream[dev*nStreamDevice]); + + kernelPixelBackprojectionFDK<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); + } // END for + ////////////////////////////////////////////////////////////////////////////////////// + // END RB code, Main reconstruction loop: go through projections (rotation angles) and backproject + ////////////////////////////////////////////////////////////////////////////////////// + }// END for deviceCount + } // END sub-split of current projection chunk + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + + } // END projection splits + + + // Now we need to take the image out of the GPU + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + // We do not need to sycnronize because the array dealocators already do. 
+ num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); + img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); + cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + } + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + cudaCheckErrors("Main loop fail"); + } + + } // end image splits + + ///////// Cleaning: + + + bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; + for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) + if (!two_buffers_used && i==1) + break; + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDestroyTextureObject(texProj[i*deviceCount+dev]); + cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + } + } + cudaCheckErrors("cudadestroy textures result fail"); + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dimage[dev]); + } + cudaFreeHost(projSinCosArrayHost); + cudaFreeHost(projParamsArrayHost); + free(partial_projection); + free(proj_split_size); + + freeGeoArray(split_image*deviceCount,geoArray); +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ + cudaHostUnregister(result); + } + if (isHostRegisterSupported){ + cudaHostUnregister(projections); + } +#endif + + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]); + + cudaCheckErrors("cudaFree fail"); + + //cudaDeviceReset(); // For the Nvidia Visual Profiler + return 0; + +} // END voxel_backprojection +// + +void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsigned int* split_image, unsigned int * split_projections){ + + + // We don't know if the 
devices are being used. lets check that. and only use the amount of memory we need. + + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + + const int deviceCount = gpuids.GetLength(); + + // Compute how much memory each of the relevant memory pieces need + size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); + size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); + + + + + // Does everything fit in the GPU? + + if(mem_image/deviceCount+mem_proj*PROJ_PER_KERNEL*2(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + + } + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemcpy3DParms copyParams = {0}; + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[dev]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); + } + + //Array creation End + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[dev]; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + } +} + +//______________________________________________________________________________ +// +// Function: createGeoArray +// +// Description: This code 
generates the geometries needed to split the image properly in +// cases where the entire image does not fit in the memory of the GPU +//______________________________________________________________________________ + +void createGeoArray(unsigned int image_splits, Geometry geo,Geometry* geoArray, unsigned int nangles){ + + + unsigned int splitsize=(geo.nVoxelZ+image_splits-1)/image_splits; + + for(unsigned int sp=0;spx; + auxPoint.y=point->y; + auxPoint.z=point->z; + + // calculate sin and cos of 3 angles (used multiple times) + double sin_alpha, cos_alpha, sin_theta, cos_theta, sin_psi, cos_psi; + sin_alpha = sin((double)geo.alpha); + cos_alpha = cos((double)geo.alpha); + sin_theta = sin((double)geo.theta); + cos_theta = cos((double)geo.theta); + sin_psi = sin((double)geo.psi); + cos_psi = cos((double)geo.psi); + + point->x = auxPoint.x*(cos_psi*cos_theta*cos_alpha-sin_psi*sin_alpha) + +auxPoint.y*(-cos_psi*cos_theta*sin_alpha-sin_psi*cos_alpha) + +auxPoint.z*cos_psi*sin_theta; + point->y = auxPoint.x*(sin_psi*cos_theta*cos_alpha+cos_psi*sin_alpha) + +auxPoint.y*(-sin_psi*cos_theta*sin_alpha+cos_psi*cos_alpha) + +auxPoint.z*sin_psi*sin_theta; + point->z =-auxPoint.x*sin_theta*cos_alpha + +auxPoint.y*sin_theta*sin_alpha + +auxPoint.z*cos_theta; +} + +void rollPitchYawT(Geometry geo,int i, Point3Ddouble* point){ + + Point3Ddouble auxPoint; + auxPoint.x=point->x; + auxPoint.y=point->y; + auxPoint.z=point->z; + + // calculate sin and cos of 3 angles (used multiple times) + double sin_dRoll, cos_dRoll, sin_dPitch, cos_dPitch, sin_dYaw, cos_dYaw; + sin_dRoll = sin((double)geo.dRoll[i]); + cos_dRoll = cos((double)geo.dRoll[i]); + sin_dPitch = sin((double)geo.dPitch[i]); + cos_dPitch = cos((double)geo.dPitch[i]); + sin_dYaw = sin((double)geo.dYaw[i]); + cos_dYaw = cos((double)geo.dYaw[i]); + + point->x=cos_dRoll*cos_dPitch*auxPoint.x + +sin_dRoll*cos_dPitch*auxPoint.y + -sin_dPitch*auxPoint.z; + + point->y=(cos_dRoll*sin_dPitch*sin_dYaw - sin_dRoll*cos_dYaw)*auxPoint.x + 
+(sin_dRoll*sin_dPitch*sin_dYaw + cos_dRoll*cos_dYaw)*auxPoint.y + +cos_dPitch*sin_dYaw*auxPoint.z; + + point->z=(cos_dRoll*sin_dPitch*cos_dYaw + sin_dRoll*sin_dYaw)*auxPoint.x + +(sin_dRoll*sin_dPitch*cos_dYaw - cos_dRoll*sin_dYaw)*auxPoint.y + +cos_dPitch*cos_dYaw*auxPoint.z; +} + +//______________________________________________________________________________ +// +// Function: computeDeltasCube +// +// Description: Computes relative increments for each projection (volume rotation). +// Increments get passed to the backprojection kernel. +//______________________________________________________________________________ + +void computeDeltasCube(Geometry geo,int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D* S) +{ + + // initialize points with double precision + Point3Ddouble P, Px,Py,Pz; + + // Get coords of Img(0,0,0) + P.x=-(geo.sVoxelX/2-geo.dVoxelX/2)+geo.offOrigX[i]; + P.y=-(geo.sVoxelY/2-geo.dVoxelY/2)+geo.offOrigY[i]; + P.z=-(geo.sVoxelZ/2-geo.dVoxelZ/2)+geo.offOrigZ[i]; + + // Get coords from next voxel in each direction + Px.x=P.x+geo.dVoxelX; Py.x=P.x; Pz.x=P.x; + Px.y=P.y; Py.y=P.y+geo.dVoxelY; Pz.y=P.y; + Px.z=P.z; Py.z=P.z; Pz.z=P.z+geo.dVoxelZ; + + // Rotate image around X axis (this is equivalent of rotating the source and detector) RZ RY RZ + eulerZYZT(geo,&P); + eulerZYZT(geo,&Px); + eulerZYZT(geo,&Py); + eulerZYZT(geo,&Pz); + + //detector offset + P.z =P.z-geo.offDetecV[i]; P.y =P.y-geo.offDetecU[i]; + Px.z =Px.z-geo.offDetecV[i]; Px.y =Px.y-geo.offDetecU[i]; + Py.z =Py.z-geo.offDetecV[i]; Py.y =Py.y-geo.offDetecU[i]; + Pz.z =Pz.z-geo.offDetecV[i]; Pz.y =Pz.y-geo.offDetecU[i]; + + //Detector Roll pitch Yaw + // + // first, we need to offset everything so (0,0,0) is the center of the detector + // Only X is required for that + P.x=P.x+(geo.DSD[i]-geo.DSO[i]); + Px.x=Px.x+(geo.DSD[i]-geo.DSO[i]); + Py.x=Py.x+(geo.DSD[i]-geo.DSO[i]); + Pz.x=Pz.x+(geo.DSD[i]-geo.DSO[i]); + rollPitchYawT(geo,i,&P); + 
rollPitchYawT(geo,i,&Px); + rollPitchYawT(geo,i,&Py); + rollPitchYawT(geo,i,&Pz); + + P.x=P.x-(geo.DSD[i]-geo.DSO[i]); + Px.x=Px.x-(geo.DSD[i]-geo.DSO[i]); + Py.x=Py.x-(geo.DSD[i]-geo.DSO[i]); + Pz.x=Pz.x-(geo.DSD[i]-geo.DSO[i]); + //Done for P, now source + Point3Ddouble source; + source.x=geo.DSD[i]; //already offseted for rotation + source.y=-geo.offDetecU[i]; + source.z=-geo.offDetecV[i]; + rollPitchYawT(geo,i,&source); + + source.x=source.x-(geo.DSD[i]-geo.DSO[i]);// source.y=source.y-auxOff.y; source.z=source.z-auxOff.z; + +// mexPrintf("%f,%f,%f\n",source.x,source.y,source.z); + // Scale coords so detector pixels are 1x1 + + P.z =P.z /geo.dDetecV; P.y =P.y/geo.dDetecU; + Px.z=Px.z/geo.dDetecV; Px.y=Px.y/geo.dDetecU; + Py.z=Py.z/geo.dDetecV; Py.y=Py.y/geo.dDetecU; + Pz.z=Pz.z/geo.dDetecV; Pz.y=Pz.y/geo.dDetecU; + + source.z=source.z/geo.dDetecV; source.y=source.y/geo.dDetecU; + + // get deltas of the changes in voxels + deltaX->x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; + deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; + deltaZ->x=Pz.x-P.x; deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; + + // cast the results from the double precision calculations back to float + *xyzorigin=P.to_float(); + *S=source.to_float(); +} + +void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ + size_t memfree; + size_t memtotal; + const int deviceCount = gpuids.GetLength(); + + for (int dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemGetInfo(&memfree,&memtotal); + if(dev==0) *mem_GPU_global=memfree; + if(memfree -#include -#include +#include +#include #include "voxel_backprojection2.hpp" #include "TIGRE_common.hpp" #include @@ -55,10 +56,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s 
\n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -92,7 +93,7 @@ do { \ **/ // this definitionmust go here. -void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate); +void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate); __global__ void matrixConstantMultiply(const Geometry geo,float* image,float constant){ size_t idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -139,7 +140,7 @@ __constant__ float projSinCosArray2Dev[5*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel //______________________________________________________________________________ -__global__ void kernelPixelBackprojection(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) +__global__ void kernelPixelBackprojection(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, hipTextureObject_t tex) { unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; @@ -355,9 +356,9 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); float** dimage=(float**)malloc(deviceCount*sizeof(float*)); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); + hipSetDevice(gpuids[dev]); + hipMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("hipMalloc fail"); } @@ -366,15 +367,15 @@ int voxel_backprojection2(float * projections, Geometry geo, 
float* result,float // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. if (isHostRegisterSupported & split_image>1){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),hipHostRegisterPortable); } if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -385,27 +386,27 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float //If it is the first time, lets make sure our image is zeroed. 
int nStreamDevice=2; int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); + hipStreamCreate(&stream[i+dev*nStreamDevice]); } } // Kernel auxiliary variables Point3D* projParamsArray2Host; - cudaMallocHost((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArray2Host; - cudaMallocHost((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + texProj =(hipTextureObject_t*)malloc(deviceCount*2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(deviceCount*2*sizeof(hipArray*)); @@ -425,8 +426,8 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float // // Initialize the memory if its the first time. 
for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemset(dimage[dev],0,num_bytes_img); + hipSetDevice(gpuids[dev]); + hipMemset(dimage[dev],0,num_bytes_img); cudaCheckErrors("memset fail"); } @@ -478,8 +479,8 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float (proj_block_split<2)&!proj&!img_slice);// Only allocate if its the first 2 calls for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*nStreamDevice+1]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*nStreamDevice+1]); } for (dev = 0; dev < deviceCount; dev++){ @@ -489,7 +490,7 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float if(geoArray[img_slice*deviceCount+dev].nVoxelZ==0) break; - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); @@ -566,9 +567,9 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float } // END for (preparing params for kernel call) // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArray2Dev, projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaMemcpyToSymbolAsync(projParamsArray2Dev, projParamsArray2Host, sizeof(Point3D)*7*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projSinCosArray2Dev), projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[dev*nStreamDevice]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArray2Dev), projParamsArray2Host, sizeof(Point3D)*7*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[dev*nStreamDevice]); + hipStreamSynchronize(stream[dev*nStreamDevice]); 
kernelPixelBackprojection<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); } // END for @@ -581,24 +582,24 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float } // END projection splits for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); matrixConstantMultiply<<<60,MAXTREADS,0,stream[dev*nStreamDevice]>>>( geoArray[img_slice*deviceCount+dev],dimage[dev],geo.dVoxelX*geo.dVoxelY*geo.dVoxelZ/(geo.dDetecU*geo.dDetecV)); } // Now we need to take the image out of the GPU for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); + hipSetDevice(gpuids[dev]); + hipStreamSynchronize(stream[dev*nStreamDevice]); num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + hipMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, hipMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); } } // end image splits for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); + hipSetDevice(gpuids[dev]); + hipDeviceSynchronize(); } @@ -607,40 +608,40 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) if (!two_buffers_used && i==1) break; for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + 
hipSetDevice(gpuids[dev]); + hipDestroyTextureObject(texProj[i*deviceCount+dev]); + hipFreeArray(d_cuArrTex[i*deviceCount+dev]); } } free(d_cuArrTex); free(texProj); for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); + hipSetDevice(gpuids[dev]); + hipFree(dimage[dev]); } free(dimage); - cudaFreeHost(projSinCosArray2Host); - cudaFreeHost(projParamsArray2Host); + hipHostFree(projSinCosArray2Host); + hipHostFree(projParamsArray2Host); free(partial_projection); free(proj_split_size); freeGeoArray(split_image*deviceCount,geoArray); #ifndef NO_PINNED_MEMORY if (isHostRegisterSupported & split_image>1){ - cudaHostUnregister(result); + hipHostUnregister(result); } if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } #endif for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); - cudaCheckErrors("cudaFree fail"); + cudaCheckErrors("hipFree fail"); -// cudaDeviceReset(); // For the Nvidia Visual Profiler +// hipDeviceReset(); // For the Nvidia Visual Profiler return 0; } // END voxel_backprojection @@ -649,52 +650,52 @@ int voxel_backprojection2(float * projections, Geometry geo, float* result,float -void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ +void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream,int nStreamDevice,bool allocate){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; int num_devices = gpuids.GetLength(); #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, 
geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif if (allocate){ for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); + hipSetDevice(gpuids[dev]); - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //hipArray Descriptor + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); } } for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; + hipSetDevice(gpuids[dev]); + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[dev]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); } //Array creation End for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipSetDevice(gpuids[dev]); + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = 
cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); } } #ifndef BACKPROJECTION_HPP @@ -826,8 +827,8 @@ void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ const int gpuids.GetLength(); for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); + hipSetDevice(gpuids[dev]); + hipMemGetInfo(&memfree,&memtotal); if(dev==0) *mem_GPU_global=memfree; if(memfree +#include +#include +#include "voxel_backprojection2.hpp" +#include "TIGRE_common.hpp" +#include +#include "GpuIds.hpp" + +// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + } \ +} while (0) + + +#define MAXTREADS 1024 + /*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * *--->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ + +// this definitionmust go here. 
+void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate); + +__global__ void matrixConstantMultiply(const Geometry geo,float* image,float constant){ + size_t idx = threadIdx.x + blockIdx.x * blockDim.x; + for(; idx=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) + return; + + // We'll keep a local auxiliary array of values of a column of voxels that this thread will update + float voxelColumn[VOXELS_PER_THREAD]; + + // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then + // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes + + unsigned long colIdx; +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. 
+ } // END copy 3D volume voxels to local array + + // Now iterate through projections +#pragma unroll + for(unsigned long projNumber=0; projNumber=totalNoOfProjections) + break; + + Point3D deltaX = projParamsArray2Dev[7*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaY = projParamsArray2Dev[7*projNumber+1]; + Point3D deltaZ = projParamsArray2Dev[7*projNumber+2]; + Point3D xyzOrigin = projParamsArray2Dev[7*projNumber+3]; + Point3D xyzOffset = projParamsArray2Dev[7*projNumber+4]; + Point3D uv0Offset = projParamsArray2Dev[7*projNumber+5]; + Point3D S = projParamsArray2Dev[7*projNumber+6]; + + float sinalpha = projSinCosArray2Dev[5*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection + float cosalpha = projSinCosArray2Dev[5*projNumber+1]; + float COR = projSinCosArray2Dev[5*projNumber+2]; + float DSD = projSinCosArray2Dev[5*projNumber+3]; + float DSO = projSinCosArray2Dev[5*projNumber+4]; + // Precomputations for the weights: + //Real coords of Source + // We already have S.x (geo.DSO), and S.y and S.z are always zero. we just need to rotate + Point3D realS; + realS.x= DSO*cosalpha; + realS.y=-DSO*sinalpha; + realS.z=0; + + + Point3D realvoxel_init; + realvoxel_init.x=-geo.sVoxelX/2+geo.dVoxelX/2+xyzOffset.x; + realvoxel_init.y=-geo.sVoxelY/2+geo.dVoxelY/2+xyzOffset.y; + realvoxel_init.z=-geo.sVoxelZ/2+geo.dVoxelZ/2+xyzOffset.z; + // Real XYZ coordinates of Detector. + Point3D realD, realDaux; + // We know the index of the detector (u,v). Start from there. + realDaux.x=-(DSD-DSO); + + // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. 
+ Point3D P; + P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); + P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-COR/geo.dDetecU; + P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); + + // This is the vector defining the line from the source to the Voxel + float vectX,vectY,vectZ; + vectX=(P.x -S.x); + vectY=(P.y -S.y); + vectZ=(P.z -S.z); + + // Get the coordinates in the detector UV where the mid point of the voxel is projected. + float t=__fdividef(DSO-DSD-S.x,vectX); + float y,z; + y=vectY*t+S.y; + z=vectZ*t+S.z; + float u,v; + u=y+(float)geo.nDetecU*0.5f; + v=z+(float)geo.nDetecV*0.5f; +#if IS_FOR_MATLAB_TIGRE + float sample=tex3D(tex, v, u ,indAlpha+0.5f); +#else + float sample=tex3D(tex, u, v ,indAlpha+0.5f); +#endif + float weight=0; + // + // + // + // IMPORTANT: The weights are almost 50% of the computational time. Is there a way of speeding this up?? + // + //Real coordinates of Voxel. Instead of reverting the transformation, its less math (faster) to compute it from the indexes. + Point3D realvoxel; + + realvoxel.x=realvoxel_init.x+indX*geo.dVoxelX; + realvoxel.y=realvoxel_init.y+indY*geo.dVoxelY; + realvoxel.z=realvoxel_init.z+indZ*geo.dVoxelZ; + + + + realDaux.y=(-geo.sDetecU+geo.dDetecU)*0.5f + u*geo.dDetecU +uv0Offset.x; + realD.z =(-geo.sDetecV+geo.dDetecV)*0.5f + v*geo.dDetecV +uv0Offset.y; + //rotate the detector + realD.x= realDaux.x*cosalpha + realDaux.y*sinalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) + realD.y=-realDaux.x*sinalpha + realDaux.y*cosalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) + float L,lsq; + + L = __fsqrt_rd( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. + lsq = (realS.x-realvoxel.x)*(realS.x-realvoxel.x) + + (realS.y-realvoxel.y)*(realS.y-realvoxel.y) + + (realS.z-realvoxel.z)*(realS.z-realvoxel.z); + + weight=__fdividef(L*L*L,(DSD*lsq)); +// weight=1; + // Get Value in the computed (U,V) and multiply by the corresponding weight. 
+ // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) + voxelColumn[colIdx]+=sample* weight; + } // END iterating through column of voxels + + } // END iterating through multiple projections + + // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) +#pragma unroll + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. + // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. + // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is + // better for avoiding memory congestion. 
+ } // END copy updated voxels from local array to our 3D volume + +} // END kernelPixelBackprojectionFDK + + + + +//______________________________________________________________________________ +// +// Function: voxel_backprojection +// +// Description: Main host function for FDK backprojection (invokes the kernel) +//______________________________________________________________________________ + +int voxel_backprojection2(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids){ + + + + + // Prepare for MultiGPU + int deviceCount = gpuids.GetLength(); + cudaCheckErrors("Device query fail"); + if (deviceCount == 0) { + mexErrMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","There are no available device(s) that support CUDA\n"); + } + + + // CODE assumes + // 1.-All available devices are usable by this code + // 2.-All available devices are equal, they are the same machine (warning thrown) + // Check the available devices, and if they are the same + if (!gpuids.AreEqualDevices()) { + mexWarnMsgIdAndTxt("Atb:Voxel_backprojection2:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); + } + + int dev; + + + // Split the CT problem + unsigned int split_image; + unsigned int split_projections; + splitCTbackprojection(gpuids,geo,nalpha,&split_image,&split_projections); + + + // Create the arrays for the geometry. The main difference is that geo.offZ has been tuned for the + // image slices. The rest of the Geometry is the same + Geometry* geoArray=(Geometry*)malloc(split_image*deviceCount*sizeof(Geometry)); + createGeoArray(split_image*deviceCount,geo,geoArray,nalpha); + + // Now lest allocate all the image memory on the GPU, so we can use it later. 
If we have made our numbers correctly + // in the previous section this should leave enough space for the textures. + size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); + float** dimage=(float**)malloc(deviceCount*sizeof(float*)); + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMalloc((void**)&dimage[dev], num_bytes_img); + cudaCheckErrors("cudaMalloc fail"); + } + + + //Pagelock memory for synchronous copy. + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. + int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to + // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. + if (isHostRegisterSupported & split_image>1){ + cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); + } + if (isHostRegisterSupported ){ + cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); + + + + + + //If it is the first time, lets make sure our image is zeroed. 
+ int nStreamDevice=2; + int nStreams=deviceCount*nStreamDevice; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + for (int i = 0; i < nStreamDevice; ++i){ + cudaStreamCreate(&stream[i+dev*nStreamDevice]); + + } + } + + // Kernel auxiliary variables + Point3D* projParamsArray2Host; + cudaMallocHost((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); + float* projSinCosArray2Host; + cudaMallocHost((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); + + // Texture object variables + cudaTextureObject_t *texProj; + cudaArray **d_cuArrTex; + texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); + d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); + + + + unsigned int proj_split_overlap_number; + // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop + // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution + // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, + // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. + unsigned long long proj_linear_idx_start; + unsigned int current_proj_split_size,current_proj_overlap_split_size; + size_t num_bytes_img_curr; + size_t img_linear_idx_start; + float** partial_projection; + size_t* proj_split_size; + + for(unsigned int img_slice=0;img_slice=proj_split_size[proj_block_split]) + break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. + if(currProjNumber_global>=nalpha) + break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. 
+ + Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, offDetec,source; + float sinalpha,cosalpha; + + geoArray[img_slice*deviceCount+dev].alpha=-alphas[currProjNumber_global*3];//we got 3 angles now. + geoArray[img_slice*deviceCount+dev].theta=-alphas[currProjNumber_global*3+1]; + geoArray[img_slice*deviceCount+dev].psi =-alphas[currProjNumber_global*3+2]; + + sinalpha=sin(geoArray[img_slice*deviceCount+dev].alpha); + cosalpha=cos(geoArray[img_slice*deviceCount+dev].alpha); + + projSinCosArray2Host[5*j]=sinalpha; // 2*j because we have 2 float (sin or cos angle) values per projection + projSinCosArray2Host[5*j+1]=cosalpha; + projSinCosArray2Host[5*j+2]=geo.COR[currProjNumber_global]; + projSinCosArray2Host[5*j+3]=geo.DSD[currProjNumber_global]; + projSinCosArray2Host[5*j+4]=geo.DSO[currProjNumber_global]; + + computeDeltasCube(geoArray[img_slice*deviceCount+dev],currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); + + offOrig.x=geo.offOrigX[currProjNumber_global]; + offOrig.y=geo.offOrigY[currProjNumber_global]; + offOrig.z=geoArray[img_slice*deviceCount+dev].offOrigZ[currProjNumber_global]; + + offDetec.x=geo.offDetecU[currProjNumber_global]; + offDetec.y=geo.offDetecV[currProjNumber_global]; + offDetec.z=0;//unused + + projParamsArray2Host[7*j] =deltaX; // 7*j because we have 7 Point3D values per projection + projParamsArray2Host[7*j+1]=deltaY; + projParamsArray2Host[7*j+2]=deltaZ; + projParamsArray2Host[7*j+3]=xyzOrigin; + projParamsArray2Host[7*j+4]=offOrig; + projParamsArray2Host[7*j+5]=offDetec; + projParamsArray2Host[7*j+6]=source; + + } // END for (preparing params for kernel call) + + // Copy the prepared parameter arrays to constant memory to make it available for the kernel + cudaMemcpyToSymbolAsync(projSinCosArray2Dev, projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); + cudaMemcpyToSymbolAsync(projParamsArray2Dev, projParamsArray2Host, 
sizeof(Point3D)*7*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); + cudaStreamSynchronize(stream[dev*nStreamDevice]); + kernelPixelBackprojection<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); + + } // END for + ////////////////////////////////////////////////////////////////////////////////////// + // END RB code, Main reconstruction loop: go through projections (rotation angles) and backproject + ////////////////////////////////////////////////////////////////////////////////////// + } + } // END sub-split of current projection chunk + + } // END projection splits + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + matrixConstantMultiply<<<60,MAXTREADS,0,stream[dev*nStreamDevice]>>>( geoArray[img_slice*deviceCount+dev],dimage[dev],geo.dVoxelX*geo.dVoxelY*geo.dVoxelZ/(geo.dDetecU*geo.dDetecV)); + } + + // Now we need to take the image out of the GPU + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaStreamSynchronize(stream[dev*nStreamDevice]); + + num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); + img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); + cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); + } + } // end image splits + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaDeviceSynchronize(); + } + + + // Clean the GPU + bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; + for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) + if (!two_buffers_used && i==1) + break; for (dev = 0; dev < deviceCount; 
dev++){ + cudaSetDevice(gpuids[dev]); + cudaDestroyTextureObject(texProj[i*deviceCount+dev]); + cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); + } + } + free(d_cuArrTex); + free(texProj); + + for (dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaFree(dimage[dev]); + } + free(dimage); + + cudaFreeHost(projSinCosArray2Host); + cudaFreeHost(projParamsArray2Host); + free(partial_projection); + free(proj_split_size); + + freeGeoArray(split_image*deviceCount,geoArray); +#ifndef NO_PINNED_MEMORY + if (isHostRegisterSupported & split_image>1){ + cudaHostUnregister(result); + } + if (isHostRegisterSupported){ + cudaHostUnregister(projections); + } +#endif + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]); + + cudaCheckErrors("cudaFree fail"); + +// cudaDeviceReset(); // For the Nvidia Visual Profiler + return 0; + +} // END voxel_backprojection + + + + + +void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ + //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; + int num_devices = gpuids.GetLength(); +#if IS_FOR_MATLAB_TIGRE + const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); +#else + const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); +#endif + if (allocate){ + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + + //cudaArray Descriptor + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + + } + } + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemcpy3DParms copyParams = {0}; + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[dev]; + 
copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); + } + + //Array creation End + for (unsigned int dev = 0; dev < num_devices; dev++){ + cudaSetDevice(gpuids[dev]); + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[dev]; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); + } +} +#ifndef BACKPROJECTION_HPP +void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsigned int* split_image, unsigned int * split_projections){ + + + // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. + + size_t mem_GPU_global; + checkFreeMemory(gpuids, &mem_GPU_global); + const int deviceCount = gpuids.GetLength(); + + // Compute how much memory each of the relevant memory pieces need + size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); + size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); + + + + + // Does everything fit in the GPU? 
+ + if(mem_image/deviceCount+mem_proj*PROJ_PER_KERNEL*2x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; + deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; + deltaZ->x=Pz.x-P.x; deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; + + + *xyzorigin=P.to_float(); + *S=source.to_float(); +} // END computeDeltasCube + +void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ + size_t memfree; + size_t memtotal; + const int gpuids.GetLength(); + + for (int dev = 0; dev < deviceCount; dev++){ + cudaSetDevice(gpuids[dev]); + cudaMemGetInfo(&memfree,&memtotal); + if(dev==0) *mem_GPU_global=memfree; + if(memfree -#include -#include +#include +#include #include "voxel_backprojection.hpp" #include "voxel_backprojection_parallel.hpp" @@ -57,10 +58,10 @@ // https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror #define cudaCheckErrors(msg) \ do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ + hipError_t __err = hipGetLastError(); \ + if (__err != hipSuccess) { \ mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",hipGetErrorString(__err));\ } \ } while (0) @@ -92,7 +93,7 @@ do { \ * * **/ -void CreateTextureParallel( float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool allocate); +void CreateTextureParallel( float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, bool allocate); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call @@ -135,7 +136,7 @@ __constant__ float projSinCosArrayDevParallel[3*PROJ_PER_KERNEL]; // Description: Main FDK backprojection kernel 
//______________________________________________________________________________ -__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,cudaTextureObject_t tex) +__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,hipTextureObject_t tex) { // Old kernel call signature: @@ -286,9 +287,9 @@ __global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* im int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) { if (gpuids.GetLength() == 0) { - cudaSetDevice(0); + hipSetDevice(0); } else { - cudaSetDevice(gpuids[0]); + hipSetDevice(gpuids[0]); } /* @@ -298,10 +299,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re //If it is the first time, lets make sure our image is zeroed. int nStreamDevice=2; int nStreams=nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + hipStream_t* stream=(hipStream_t*)malloc(nStreams*sizeof(hipStream_t));; for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i]); + hipStreamCreate(&stream[i]); } @@ -310,10 +311,10 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
int isHostRegisterSupported = 0; #if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); + hipDeviceGetAttribute(&isHostRegisterSupported,hipDeviceAttributeHostRegisterSupported,gpuids[0]); #endif if (isHostRegisterSupported){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + hipHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),hipHostRegisterPortable); } cudaCheckErrors("Error pinning memory"); @@ -321,22 +322,22 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // Allocate result image memory size_t num_bytes = geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ * sizeof(float); float* dimage; - cudaMalloc((void**)&dimage, num_bytes); - cudaMemset(dimage,0,num_bytes); - cudaCheckErrors("cudaMalloc fail"); + hipMalloc((void**)&dimage, num_bytes); + hipMemset(dimage,0,num_bytes); + cudaCheckErrors("hipMalloc fail"); Point3D* projParamsArrayHostParallel; - cudaMallocHost((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); + hipHostMalloc((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); float* projSinCosArrayHostParallel; - cudaMallocHost((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); + hipHostMalloc((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); // Texture buffer objects - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(2*sizeof(cudaArray*)); + hipTextureObject_t *texProj; + hipArray **d_cuArrTex; + texProj =(hipTextureObject_t*)malloc(2*sizeof(hipTextureObject_t)); + d_cuArrTex =(hipArray**)malloc(2*sizeof(hipArray*)); @@ -389,7 +390,7 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re (proj_block_split<2));// 
Only allocate if its the first 2 calls - cudaStreamSynchronize(stream[0+1]); + hipStreamSynchronize(stream[0+1]); @@ -464,9 +465,9 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArrayDevParallel, projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projParamsArrayDevParallel, projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projSinCosArrayDevParallel), projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[0]); + hipMemcpyToSymbolAsync(HIP_SYMBOL(projParamsArrayDevParallel), projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,hipMemcpyHostToDevice,stream[0]); + hipStreamSynchronize(stream[0]); kernelPixelBackprojection_parallel<<>>(geo,dimage,i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)]); } // END for @@ -475,9 +476,9 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re // END Main reconstruction loop: go through projections (rotation angles) and backproject ////////////////////////////////////////////////////////////////////////////////////// } - cudaDeviceSynchronize(); - cudaMemcpy(result, dimage, num_bytes, cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy result fail"); + hipDeviceSynchronize(); + hipMemcpy(result, dimage, num_bytes, hipMemcpyDeviceToHost); + cudaCheckErrors("hipMemcpy result fail"); free(partial_projection); free(proj_split_size); @@ -486,23 +487,23 @@ int voxel_backprojection_parallel(float * projections, Geometry geo, float* re for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) if (!two_buffers_used && i==1) break; - cudaDestroyTextureObject(texProj[i]); - 
cudaFreeArray(d_cuArrTex[i]); + hipDestroyTextureObject(texProj[i]); + hipFreeArray(d_cuArrTex[i]); } free(texProj); free(d_cuArrTex); - cudaFreeHost(projSinCosArrayHostParallel); - cudaFreeHost(projParamsArrayHostParallel); + hipHostFree(projSinCosArrayHostParallel); + hipHostFree(projParamsArrayHostParallel); - cudaFree(dimage); + hipFree(dimage); if (isHostRegisterSupported){ - cudaHostUnregister(projections); + hipHostUnregister(projections); } for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); + hipStreamDestroy(stream[i]); -// cudaDeviceReset(); +// hipDeviceReset(); return 0; } // END voxel_backprojection @@ -583,45 +584,45 @@ void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* } // END computeDeltasCube -void CreateTextureParallel(float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool alloc) +void CreateTextureParallel(float* projectiondata,Geometry geo,hipArray** d_cuArrTex,unsigned int nangles, hipTextureObject_t *texImage,hipStream_t* stream, bool alloc) { - //cudaArray Descriptor + //hipArray Descriptor #if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecV, geo.nDetecU, nangles); #else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); + const hipExtent extent =make_hipExtent(geo.nDetecU, geo.nDetecV, nangles); #endif - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array if (alloc){ - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); cudaCheckErrors("Texture memory allocation fail"); } - cudaMemcpy3DParms copyParams = {0}; + hipMemcpy3DParms copyParams = {0}; //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, 
extent.width*sizeof(float), extent.width, extent.height); + copyParams.srcPtr = make_hipPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); copyParams.dstArray = d_cuArrTex[0]; copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[0+1]); + copyParams.kind = hipMemcpyHostToDevice; + hipMemcpy3DAsync(©Params,stream[0+1]); cudaCheckErrors("Texture memory data copy fail"); //Array creation End - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; + hipResourceDesc texRes; + memset(&texRes, 0, sizeof(hipResourceDesc)); + texRes.resType = hipResourceTypeArray; texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); + hipTextureDesc texDescr; + memset(&texDescr, 0, sizeof(hipTextureDesc)); texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + texDescr.filterMode = hipFilterModeLinear; + texDescr.addressMode[0] = hipAddressModeBorder; + texDescr.addressMode[1] = hipAddressModeBorder; + texDescr.addressMode[2] = hipAddressModeBorder; + texDescr.readMode = hipReadModeElementType; + hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); cudaCheckErrors("Texture object creation fail"); } \ No newline at end of file diff --git a/Common/CUDA/voxel_backprojection_parallel.cu.prehip b/Common/CUDA/voxel_backprojection_parallel.cu.prehip new file mode 100644 index 00000000..03703576 --- /dev/null +++ b/Common/CUDA/voxel_backprojection_parallel.cu.prehip @@ -0,0 +1,627 @@ +/*------------------------------------------------------------------------- + * + * CUDA function 
for backrpojection for parallel beam + * + * + * CODE by Ander Biguri + * Optimized and modified by RB + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + + +#define PI_2 1.57079632679489661923 +#include +#include +#include +#include "voxel_backprojection.hpp" +#include "voxel_backprojection_parallel.hpp" + +#include "TIGRE_common.hpp" +#include + +// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror +#define cudaCheckErrors(msg) \ +do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + mexPrintf("%s \n",msg);\ + mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ + } \ +} while (0) + + +#define MAXTREADS 1024 + /*GEOMETRY DEFINITION + * + * Detector plane, behind + * |-----------------------------| + * | | + * | | + * | | + * | | + * | +--------+ | + * | / /| | + * A Z | / / |*D | + * | | +--------+ | | + * | | | | | | + * | | | *O | + | + * *--->y | | | / | + * / | | |/ | + * V X | +--------+ | + * |-----------------------------| + * + * *S + * + * + * + * + * + **/ +void CreateTextureParallel( float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool allocate); + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// The optimal values of two constants obtained by RB on NVIDIA Quadro K2200 (4 GB RAM, 640 CUDA cores) for 512^3 volume and 512^3 projections (512 proj, each 512 x 512) were: +// PROJ_PER_KERNEL = 32 or 16 (very similar times) +// VOXELS_PER_THREAD = 8 +// Speedup of the 
entire FDK backprojection (not only kernel run, also memcpy etc.) was nearly 4x relative to the original (single projection, single voxel per thread) code. +// (e.g. 16.2 s vs. ~62 s). + +const int PROJ_PER_KERNEL = 32; // Number of 2D projections to be analyzed by a single thread. This can be tweaked to see what works best. 32 was the optimal value in the paper by Zinsser and Keck. +const int VOXELS_PER_THREAD = 8; // Number of voxels to be computed by s single thread. Can be tweaked to see what works best. 4 was the optimal value in the paper by Zinsser and Keck. + +// We have PROJ_PER_KERNEL projections and we need 6 parameters for each projection: +// deltaX, deltaY, deltaZ, xyzOrigin, offOrig, offDetec +// So we need to keep PROJ_PER_KERNEL*6 values in our deltas array FOR EACH CALL to our main kernel +// (they will be updated in the main loop before each kernel call). + +__constant__ Point3D projParamsArrayDevParallel[6*PROJ_PER_KERNEL]; // Dev means it is on device + +// We also need a corresponding array on the host side to be filled before each kernel call, then copied to the device (array in constant memory above) +// Point3D projParamsArrayHostParallel[6*PROJ_PER_KERNEL]; // Host means it is host memory + +// Now we also need to store sinAlpha and cosAlpha for each projection (two floats per projection) +__constant__ float projSinCosArrayDevParallel[3*PROJ_PER_KERNEL]; + +// float projSinCosArrayHostParallel[3*PROJ_PER_KERNEL]; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// END RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + +//______________________________________________________________________________ +// +// Function: kernelPixelBackprojectionFDK +// 
+// Description: Main FDK backprojection kernel +//______________________________________________________________________________ + +__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,cudaTextureObject_t tex) +{ + + // Old kernel call signature: + // kernelPixelBackprojectionFDK<<>>(geo,dimage,i,deltaX,deltaY,deltaZ,xyzOrigin,offOrig,offDetec,sinalpha,cosalpha); + // We just read in most of the params from the constant memory instead of getting them from the param list. + // This is because we now have MANY params, since single kernel processes more than one projection! + /* __global__ void kernelPixelBackprojectionFDK(const Geometry geo, + * float* image, + * const int indAlpha, + * const Point3D deltaX , + * const Point3D deltaY, + * const Point3D deltaZ, + * const Point3D xyzOrigin, + * const Point3D xyzOffset, + * const Point3D uv0Offset, + * const float sinalpha, + * const float cosalpha){ + */ + unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; + unsigned long long indX = blockIdx.x * blockDim.x + threadIdx.x; + // unsigned long startIndZ = blockIdx.z * blockDim.z + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle + unsigned long long startIndZ = blockIdx.z * VOXELS_PER_THREAD + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle + //Make sure we don't go out of bounds + if (indX>=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) + return; + + // We'll keep a local auxiliary array of values of a column of voxels that this thread will update + float voxelColumn[VOXELS_PER_THREAD]; + + // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then + // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes + + unsigned long colIdx; + + 
for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. + } // END copy 3D volume voxels to local array + + // Now iterate through projections + for(unsigned long projNumber=0; projNumber=totalNoOfProjections) + break; + + Point3D deltaX = projParamsArrayDevParallel[6*projNumber]; // 6*projNumber because we have 6 Point3D values per projection + Point3D deltaY = projParamsArrayDevParallel[6*projNumber+1]; + Point3D deltaZ = projParamsArrayDevParallel[6*projNumber+2]; + Point3D xyzOrigin = projParamsArrayDevParallel[6*projNumber+3]; + Point3D xyzOffset = projParamsArrayDevParallel[6*projNumber+4]; + Point3D S = projParamsArrayDevParallel[6*projNumber+5]; + + float DSD = projSinCosArrayDevParallel[3*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection + float DSO = projSinCosArrayDevParallel[3*projNumber+1]; + float COR = projSinCosArrayDevParallel[3*projNumber+2]; + + // Geometric trasnformations: + //Source, scaled XYZ coordinates + + // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. 
+ Point3D P; + S.x=DSO; + P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); + P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-COR/geo.dDetecU; + P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); + S.y=P.y;S.z=P.z; + + // This is the vector defining the line from the source to the Voxel + float vectX,vectY,vectZ; + vectX=(P.x -S.x); + vectY=(P.y -S.y); + vectZ=(P.z -S.z); + + // Get the coordinates in the detector UV where the mid point of the voxel is projected. + float t=(DSO-DSD /*-DOD*/ - S.x)/vectX; + float y,z; + y=vectY*t+S.y; + z=vectZ*t+S.z; + float u,v; + u=y+geo.nDetecU/2.0f-0.5f; + v=z+geo.nDetecV/2.0f-0.5f; + + + + // Get Value in the computed (U,V) and multiply by the corresponding weight. + // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) +#if IS_FOR_MATLAB_TIGRE + voxelColumn[colIdx]+=tex3D(tex, v+0.5f, u+0.5f ,indAlpha+0.5f); +#else + voxelColumn[colIdx]+=tex3D(tex, u+0.5f, v+0.5f ,indAlpha+0.5f); +#endif + + } // END iterating through column of voxels + + } // END iterating through multiple projections + + // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) + for(colIdx=0; colIdx=geo.nVoxelZ) + break; // break the loop. + + unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; + image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) + // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. + // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. + // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is + // better for avoiding memory congestion. 
+ } // END copy updated voxels from local array to our 3D volume + +} // END kernelPixelBackprojectionFDK + + + + +//______________________________________________________________________________ +// +// Function: voxel_backprojection_parallel +// +// Description: Main host function for FDK backprojection (invokes the kernel) +//______________________________________________________________________________ + +int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) +{ + if (gpuids.GetLength() == 0) { + cudaSetDevice(0); + } else { + cudaSetDevice(gpuids[0]); + } + + /* + * Allocate texture memory on the device + */ + // copy data to CUDA memory + //If it is the first time, lets make sure our image is zeroed. + int nStreamDevice=2; + int nStreams=nStreamDevice; + cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; + + for (int i = 0; i < nStreamDevice; ++i){ + cudaStreamCreate(&stream[i]); + + + } + //Pagelock memory for synchronous copy. + // Lets try to make the host memory pinned: + // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
+ int isHostRegisterSupported = 0; +#if CUDART_VERSION >= 9020 + cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); +#endif + if (isHostRegisterSupported){ + cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); + } + cudaCheckErrors("Error pinning memory"); + + + // Allocate result image memory + size_t num_bytes = geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ * sizeof(float); + float* dimage; + cudaMalloc((void**)&dimage, num_bytes); + cudaMemset(dimage,0,num_bytes); + cudaCheckErrors("cudaMalloc fail"); + + + Point3D* projParamsArrayHostParallel; + cudaMallocHost((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); + float* projSinCosArrayHostParallel; + cudaMallocHost((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); + + + // Texture buffer objects + cudaTextureObject_t *texProj; + cudaArray **d_cuArrTex; + texProj =(cudaTextureObject_t*)malloc(2*sizeof(cudaTextureObject_t)); + d_cuArrTex =(cudaArray**)malloc(2*sizeof(cudaArray*)); + + + + unsigned int proj_split_overlap_number; + unsigned int split_projections=1; + // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop + // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution + // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, + // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. + unsigned long long proj_linear_idx_start; + unsigned int current_proj_split_size,current_proj_overlap_split_size; + size_t num_bytes_img_curr; + size_t img_linear_idx_start; + + + current_proj_split_size=nalpha; + // We are going to split it in the same amount of kernels we need to execute. 
+ proj_split_overlap_number=(current_proj_split_size+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL; + + + // Create pointer to pointers of projections and precompute their location and size. + + float ** partial_projection=(float**)malloc(current_proj_split_size*sizeof(float*)); + size_t * proj_split_size=(size_t*)malloc(current_proj_split_size*sizeof(size_t*)); + + for(unsigned int proj_block_split=0; proj_block_split=proj_split_size[proj_block_split]) + break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. + + if(currProjNumber_global>=nalpha) + break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. + + Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, /*offDetec,*/source; + float sinalpha,cosalpha; + + geo.alpha=-alphas[currProjNumber_global*3]; + geo.theta=-alphas[currProjNumber_global*3+1]; + geo.psi =-alphas[currProjNumber_global*3+2]; + + //sinalpha=sin(geo.alpha); +// cosalpha=cos(geo.alpha); + + projSinCosArrayHostParallel[3*j]=geo.DSD[currProjNumber_global]; // 3*j because we have 3 float (sin or cos angle) values per projection + projSinCosArrayHostParallel[3*j+1]=geo.DSO[currProjNumber_global]; + projSinCosArrayHostParallel[3*j+2]=geo.COR[currProjNumber_global]; + + //computeDeltasCubeParallel(geo,geo.alpha,currProjNumber,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); + computeDeltasCubeParallel(geo,currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); + + offOrig.x=geo.offOrigX[currProjNumber_global]; + offOrig.y=geo.offOrigY[currProjNumber_global]; + + + projParamsArrayHostParallel[6*j]=deltaX; // 6*j because we have 6 Point3D values per projection + projParamsArrayHostParallel[6*j+1]=deltaY; + projParamsArrayHostParallel[6*j+2]=deltaZ; + projParamsArrayHostParallel[6*j+3]=xyzOrigin; + projParamsArrayHostParallel[6*j+4]=offOrig; + projParamsArrayHostParallel[6*j+5]=source; + } // END 
for (preparing params for kernel call) + + // Copy the prepared parameter arrays to constant memory to make it available for the kernel + + cudaMemcpyToSymbolAsync(projSinCosArrayDevParallel, projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); + cudaMemcpyToSymbolAsync(projParamsArrayDevParallel, projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); + cudaStreamSynchronize(stream[0]); + + kernelPixelBackprojection_parallel<<>>(geo,dimage,i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)]); + } // END for + + ////////////////////////////////////////////////////////////////////////////////////// + // END Main reconstruction loop: go through projections (rotation angles) and backproject + ////////////////////////////////////////////////////////////////////////////////////// + } + cudaDeviceSynchronize(); + cudaMemcpy(result, dimage, num_bytes, cudaMemcpyDeviceToHost); + cudaCheckErrors("cudaMemcpy result fail"); + + free(partial_projection); + free(proj_split_size); + + bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; + for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) + if (!two_buffers_used && i==1) + break; + cudaDestroyTextureObject(texProj[i]); + cudaFreeArray(d_cuArrTex[i]); + } + free(texProj); + + free(d_cuArrTex); + cudaFreeHost(projSinCosArrayHostParallel); + cudaFreeHost(projParamsArrayHostParallel); + + cudaFree(dimage); + if (isHostRegisterSupported){ + cudaHostUnregister(projections); + } + for (int i = 0; i < nStreams; ++i) + cudaStreamDestroy(stream[i]); + +// cudaDeviceReset(); + return 0; + +} // END voxel_backprojection + +void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D *S) +{ + + Point3Ddouble P, Px,Py,Pz; + // Get coords of Img(0,0,0) + 
P.x=-(geo.sVoxelX/2-geo.dVoxelX/2)+geo.offOrigX[i]; + P.y=-(geo.sVoxelY/2-geo.dVoxelY/2)+geo.offOrigY[i]; + P.z=-(geo.sVoxelZ/2-geo.dVoxelZ/2)+geo.offOrigZ[i]; + + // Get coors from next voxel in each direction + Px.x=P.x+geo.dVoxelX; Py.x=P.x; Pz.x=P.x; + Px.y=P.y; Py.y=P.y+geo.dVoxelY; Pz.y=P.y; + Px.z=P.z; Py.z=P.z; Pz.z=P.z+geo.dVoxelZ; + + + + // Rotate image around X axis (this is equivalent of rotating the source and detector) RZ RY RZ + eulerZYZT(geo,&P); + eulerZYZT(geo,&Px); + eulerZYZT(geo,&Py); + eulerZYZT(geo,&Pz); + + //detector offset + P.z =P.z-geo.offDetecV[i]; P.y =P.y-geo.offDetecU[i]; + Px.z =Px.z-geo.offDetecV[i]; Px.y =Px.y-geo.offDetecU[i]; + Py.z =Py.z-geo.offDetecV[i]; Py.y =Py.y-geo.offDetecU[i]; + Pz.z =Pz.z-geo.offDetecV[i]; Pz.y =Pz.y-geo.offDetecU[i]; + + //Detector Roll pitch Yaw + // + // + // first, we need to offset everything so (0,0,0) is the center of the detector + // Only X is required for that + P.x=P.x+(geo.DSD[i]-geo.DSO[i]); + Px.x=Px.x+(geo.DSD[i]-geo.DSO[i]); + Py.x=Py.x+(geo.DSD[i]-geo.DSO[i]); + Pz.x=Pz.x+(geo.DSD[i]-geo.DSO[i]); + + rollPitchYawT(geo,i,&P); + rollPitchYawT(geo,i,&Px); + rollPitchYawT(geo,i,&Py); + rollPitchYawT(geo,i,&Pz); + + P.x=P.x-(geo.DSD[i]-geo.DSO[i]); + Px.x=Px.x-(geo.DSD[i]-geo.DSO[i]); + Py.x=Py.x-(geo.DSD[i]-geo.DSO[i]); + Pz.x=Pz.x-(geo.DSD[i]-geo.DSO[i]); + + + Point3Ddouble source; + source.x=0; + source.y=-geo.offDetecU[i]; + source.z=-geo.offDetecV[i]; + + rollPitchYawT(geo,i,&source); + source.x=source.x-(geo.DSD[i]-geo.DSO[i]); + + P.z =P.z /geo.dDetecV; P.y =P.y/geo.dDetecU; + Px.z=Px.z/geo.dDetecV; Px.y=Px.y/geo.dDetecU; + Py.z=Py.z/geo.dDetecV; Py.y=Py.y/geo.dDetecU; + Pz.z=Pz.z/geo.dDetecV; Pz.y=Pz.y/geo.dDetecU; + + source.z=source.z/geo.dDetecV; source.y=source.y/geo.dDetecU; + + // get deltas of the changes in voxels + deltaX->x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; + deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; + deltaZ->x=Pz.x-P.x; 
deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; + + + // cast the results from the double precision calculations back to float + *xyzorigin=P.to_float(); + *S=source.to_float(); + + +} // END computeDeltasCube +void CreateTextureParallel(float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool alloc) +{ + //cudaArray Descriptor +#if IS_FOR_MATLAB_TIGRE + const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); +#else + const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); +#endif + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); + //cuda Array + if (alloc){ + cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + cudaCheckErrors("Texture memory allocation fail"); + } + cudaMemcpy3DParms copyParams = {0}; + + + //Array creation + copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); + copyParams.dstArray = d_cuArrTex[0]; + copyParams.extent = extent; + copyParams.kind = cudaMemcpyHostToDevice; + cudaMemcpy3DAsync(©Params,stream[0+1]); + cudaCheckErrors("Texture memory data copy fail"); + //Array creation End + + cudaResourceDesc texRes; + memset(&texRes, 0, sizeof(cudaResourceDesc)); + texRes.resType = cudaResourceTypeArray; + texRes.res.array.array = d_cuArrTex[0]; + cudaTextureDesc texDescr; + memset(&texDescr, 0, sizeof(cudaTextureDesc)); + texDescr.normalizedCoords = false; + texDescr.filterMode = cudaFilterModeLinear; + texDescr.addressMode[0] = cudaAddressModeBorder; + texDescr.addressMode[1] = cudaAddressModeBorder; + texDescr.addressMode[2] = cudaAddressModeBorder; + texDescr.readMode = cudaReadModeElementType; + cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); + cudaCheckErrors("Texture object creation fail"); + +} \ No newline at end of file diff --git a/Common/CUDA/voxel_backprojection_parallel.hpp.prehip 
b/Common/CUDA/voxel_backprojection_parallel.hpp.prehip new file mode 100644 index 00000000..92b72023 --- /dev/null +++ b/Common/CUDA/voxel_backprojection_parallel.hpp.prehip @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * Header CUDA function for backrpojection for parallel beam + * + * + * CODE by Ander Biguri + * Optimized and modified by RB + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ +#include "types_TIGRE.hpp" +#include "GpuIds.hpp" + + +#ifndef BACKPROJECTION_PARALLEL_HPP +#define BACKPROJECTION_PARALLEL_HPP + +int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas,int nalpha, const GpuIds& gpuids); +void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D *S); +void createGeoArrayParallel(unsigned int image_splits, Geometry geo,Geometry* geoArray, unsigned int nangles); +// void computeDeltasCube(Geometry geo, float alpha,int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ); +#endif \ No newline at end of file diff --git a/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip b/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip new file mode 100644 index 00000000..650a9815 --- /dev/null +++ b/MATLAB/Utilities/GPU/getGpuCount_mex.cpp.prehip @@ -0,0 +1,21 @@ +#include +#include +#include +#include + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + if (nrhs != 0) { + mexErrMsgIdAndTxt("MATLAB:getGpuCount_mex", "No input requred."); + return; + } + if (nlhs != 1) { + mexErrMsgIdAndTxt("MATLAB:getGpuCount_mex", "Too many 
output arguments. Returns one integer."); + return; + } + int iCount = GetGpuCount(); + size_t dims[2] = {1,1}; + plhs[0] = mxCreateNumericArray(2, dims, mxUINT32_CLASS, mxREAL); + *((int*)mxGetData(plhs[0])) = iCount; +} diff --git a/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip b/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip new file mode 100644 index 00000000..c56ca29b --- /dev/null +++ b/MATLAB/Utilities/GPU/getGpuName_mex.cpp.prehip @@ -0,0 +1,29 @@ +#include +#include + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + // Usage: name = getGpuName_mex(int iId) + if (nrhs != 1) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:invalidNumInputs", "One input required."); + return; + } else if(nlhs > 1) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:maxlhs", "Too many output arguments."); + return; + } + + int iId = 0; + if (mxIsDouble(prhs[0])) { + mexErrMsgIdAndTxt( "MATLAB:getGpuName_mex:inputNotInt", "Input must be an integer."); + return; + } else { + iId = *((int*)mxGetData(prhs[0])); + } + int iCount = GetGpuCount(); + char* pcName = (char*)mxCalloc(128, sizeof(char)); + if (iId < iCount) { + GetGpuName(iId, pcName); + } + plhs[0] = mxCreateString(pcName); +} diff --git a/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip b/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip new file mode 100644 index 00000000..670c2d3e --- /dev/null +++ b/MATLAB/Utilities/IO/VarianCBCT/XimPara.hpp.prehip @@ -0,0 +1,28 @@ +#define _CRT_SECURE_NO_WARNINGS + +#include +#include + +// Purpose: To fast read .xim files +// Method: based on ReadXim.m by Fredrik Nordström 2015 +// Date: 2017.07 +// Author: Yi Du, yi.du@hotmail.com + +#ifndef STR_XIM +#define STR_XIM +//struct XimPara +typedef struct XimPara +{ + char FileName[256]; + int ImgWidth; // Image Width + int ImgHeight; // Image Height + int PixelNO; + + int BytesPerPixel; // Determine how to read the data + int Compression_Indicator; // Data number in Rec Image Matrix + + double GantryRtn; 
// Gantry rotation angle + int KVNormChamber; // KV norm chamber reading, date: 2022-05-23 +}XimPara; +#endif + diff --git a/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip b/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip new file mode 100644 index 00000000..453c4278 --- /dev/null +++ b/MATLAB/Utilities/IO/VarianCBCT/mexReadXim.cpp.prehip @@ -0,0 +1,357 @@ +#define _CRT_SECURE_NO_WARNINGS + +#include "io64.h" +#include +#include +#include +#include +#include +#include +//**** C data types are defined in tmwtypes.h +#include +#include "mex.h" +#include +#include "matrix.h" +#include "XimPara.hpp" + +#define GET_BIT(x,bit) ((x & (1 << bit)) >>bit) + +// Purpose: To fast read .xim files +// Method: based on ReadXim.m by Fredrik Nordström 2015 +// Date: 2017.07 +// Author: Yi Du, yi.du@hotmail.com + + +int cReadXim(char *XimFullFile, XimPara *XimStr, int *XimImg); + +void mexFunction( + int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + //check input variable + if (mxIsChar(prhs[0]) != 1) + mexErrMsgIdAndTxt( "MATLAB:revord:inputNotString", + "Input must be a string."); + + // .xim filename + char *filename; + filename = mxArrayToString(prhs[0]); + //mexPrintf("%s\n", filename); + + // file open + FILE *fid = fopen(filename, "rb"); + if(fid == NULL) + { + mexErrMsgIdAndTxt("%s fopen failed.\n", filename); + //getchar(); + //exit(1); + } + + // Parameter structure + XimPara *para = new XimPara[1]; + + // file pointer position + //fpos_t position = {0}; + + // Skip useless information + // 8 * sizeof(char) + sizeof(int32_t); + long int position = 8*sizeof(char) + sizeof(int32_T); + fseek ( fid , position , SEEK_SET ); +// setFilePos(fid, (fpos_t*) &position); + // Read ImgWidth & ImgHeight (int32) + fread(&(para->ImgWidth), sizeof(int32_T), 1, fid); + fread(&(para->ImgHeight), sizeof(int32_T), 1, fid); + fclose(fid); + + para->PixelNO = para->ImgWidth * para->ImgHeight; + + int *frame; + plhs[0] = mxCreateNumericMatrix(para->ImgWidth, 
para->ImgHeight, mxINT32_CLASS, mxREAL); + frame = (int*)mxGetPr(plhs[0]); + + // empty file return + if (para->PixelNO == 0) + { + plhs[1] = mxCreateDoubleScalar(10000); + mexPrintf("%s is an empty file\n", filename); + return; + } + + /******* Kernel Function *********/ + cReadXim(filename, para, frame); + + /**** KVSourceRtn is the only parameter-of-interest to return ****/ + // KVSourceRtn = GantryRtn + 90 deg; + double KVSourceRtn = para->GantryRtn + 90; + plhs[1] = mxCreateDoubleScalar(KVSourceRtn); + + double NormChamberReading = para->KVNormChamber * 1.0; + plhs[2] = mxCreateDoubleScalar(NormChamberReading); + +} + +/************* Kernel Funtion to read .xim ***************/ +// Kernel function +int cReadXim(char *XimFullFile, + XimPara *XimStr, + int *XimImg) +{ + // Read the .xim file name + +// char *ptr = strrchr(XimFullFile, '\\'); +// sprintf(XimStr->FileName, "%s", ptr + 1); + + // ****** Open .xim File Pointer ***********// + FILE *fid = fopen(XimFullFile, "rb"); + + // Syntax Parsing + if (fid == NULL) + { + mexErrMsgIdAndTxt("Error: file %s doesn't exist, at all\n", XimFullFile); + //getchar(); + //exit(1); + } + + // ******* Stage 1: Portal Image Data ****// + // Skip useless information + fseek(fid, 8 * sizeof(char) + sizeof(int32_T), SEEK_CUR); + + // Read ImgWidth & ImgHeight + fread(&(XimStr->ImgWidth), sizeof(int32_T), 1, fid); + fread(&(XimStr->ImgHeight), sizeof(int32_T), 1, fid); + XimStr->PixelNO = (XimStr->ImgWidth)*(XimStr->ImgHeight); + + // Skip the useless information: bits_per_pixel + fseek(fid, sizeof(int32_T), SEEK_CUR); + + // Load .xim file compression parameters + fread(&(XimStr->BytesPerPixel), sizeof(int32_T), 1, fid); + fread(&(XimStr->Compression_Indicator), sizeof(int32_T), 1, fid); + + // Load .xim Pixel Data + if (1 == XimStr->Compression_Indicator) + { + int LookUpTableSize = 0; + fread(&LookUpTableSize, sizeof(int), 1, fid); + + int *LookUpTable = new int[XimStr->ImgHeight * XimStr->ImgWidth]; + memset(LookUpTable, 0, 
XimStr->ImgHeight * XimStr->ImgWidth * sizeof(int)); + + // Load the LookUpTable data + for (int ii = 0; ii < LookUpTableSize; ii++) + { + // Load in the 8-bit date + // Updated: 2021-11-05, Yi Du + uint8_T tmp =0; + fread(&tmp, 1, 1, fid); + int Bit2[4] = { 0 }; + Bit2[0] = GET_BIT(tmp,0) + GET_BIT(tmp,1) *2; + Bit2[1] = GET_BIT(tmp,2) + GET_BIT(tmp,3) *2; + Bit2[2] = GET_BIT(tmp,4) + GET_BIT(tmp,5) *2; + Bit2[3] = GET_BIT(tmp,6) + GET_BIT(tmp,7) *2; + + // extract the lookup_table data + for (int jj = 0; jj < 4; jj++) + { + LookUpTable[ii * 4 + jj] = Bit2[jj]; + } + + /** Old Code with bug + int Bit2[4] = { 0 }; + + // extract the lookup_table data + for (int jj = 0; jj < 8; jj = jj +2) + { + Bit2[jj/2] = ((tmp & 1 << jj) != 0); + // It's 4, because 1 unsigned __int8 in tmp is represented by 4 ints in LookUpTable. + LookUpTable[ii * 4 + jj / 2] = Bit2[jj / 2]; + + //printf("Index = %d, LookUpTable = %d\n", ii * 4 + jj / 2, LookUpTable[ii * 4 + jj / 2]); + } + **/ + } + + // Skip compressed_pixel_buffer_size: passed + fseek(fid, sizeof(int32_T), SEEK_CUR); + + // Allocate memory for XimImg + fread(XimImg, sizeof(int32_T), (XimStr->ImgWidth) + 1, fid); + + // load the compressed pixel data + int delta = 0; + int LUT_Pos = 0; + + // Be very careful with all data types!!! 
+ int8_T tmp8 = 0; + int16_T tmp16 = 0; + int32_T tmp32 = 0; + + for (int ImgTag = XimStr->ImgWidth + 1; + ImgTag < (XimStr->ImgHeight) * (XimStr->ImgWidth); + ImgTag++) + { + if (0 == LookUpTable[LUT_Pos]) + { + fread(&tmp8, sizeof(int8_T), 1, fid); + delta = int(tmp8); + } + else if (1 == LookUpTable[LUT_Pos]) + { + fread(&tmp16, sizeof(int16_T), 1, fid); + delta = int(tmp16); + } + else + { + fread(&tmp32, sizeof(int32_T), 1, fid); + delta = int(tmp32); + } + + XimImg[ImgTag] = delta + XimImg[ImgTag - 1] + + XimImg[ImgTag - XimStr->ImgWidth] + - XimImg[ImgTag - XimStr->ImgWidth - 1]; + + LUT_Pos = LUT_Pos + 1; + } + + // Skip uncompressed_pixel_buffer_size + fseek(fid, sizeof(int32_T), SEEK_CUR); + + } + else + { + // Be careful: the code block for uncompressed pixel data readout hasn't been tested yet. + // Date: 2017-09-12 + int BufferSize = 0; + fread(&BufferSize, sizeof(int), 1, fid); + + switch (XimStr->BytesPerPixel) + { + case 1: + { + uint8_t *buffer8 = new uint8_t[XimStr->ImgWidth * XimStr->ImgHeight]; + memset(buffer8, 0, sizeof(uint8_t)* XimStr->ImgWidth * XimStr->ImgHeight); + fread(buffer8, sizeof(uint8_t), BufferSize, fid); + for (int ii = 0; ii < XimStr->ImgWidth * XimStr->ImgHeight;ii++) + { + XimImg[ii] = int(buffer8[ii]); + } + break; + } + case 2: + { + uint16_t *buffer16 = new uint16_t[XimStr->ImgWidth * XimStr->ImgHeight]; + memset(buffer16, 0, sizeof(uint16_t)* XimStr->ImgWidth * XimStr->ImgHeight); + fread(buffer16, sizeof(uint16_t), BufferSize / 2, fid); + for (int ii = 0; ii < XimStr->ImgWidth * XimStr->ImgHeight; ii++) + { + XimImg[ii] = int(buffer16[ii]); + } + break; + } + default: + { + fread(XimImg, sizeof(int), BufferSize / 4, fid); + break; + } + } + } + + + // ******* Stage 2: load the gantry angle from the residual property data ****// + // Skip histogram + int tmp = 0; + fread(&tmp, sizeof(int), 1, fid); + if (tmp > 0) + { + fseek(fid, tmp* sizeof(int), SEEK_CUR); + } + + // Decode .xim properties + int nProperties = 0; + 
fread(&nProperties, sizeof(int), 1, fid); + // Property structure is not NULL + if (nProperties > 0) + { + int pName_len = 0; + // Only load the property name rather than the content + char pName[128] = { 0 }; + int pType = 0; + for (int ii = 0; ii < nProperties; ii++) + { + // load property name length + fread(&pName_len, sizeof(int), 1, fid); + // load property name + fread(pName, sizeof(char)* pName_len, 1, fid); + // load property data type + fread(&pType, sizeof(int), 1, fid); + + //printf("%s\n", pName); + + // extract the Gantry Rotation Angle + if (!strcmp(pName, "GantryRtn")) + { + fread(&(XimStr->GantryRtn), sizeof(double), 1, fid); +// continue; + } + else if(!strcmp(pName, "KVNormChamber")) + { + //printf("KVNormChamber"); + fread(&(XimStr->KVNormChamber), sizeof(int), 1, fid); + break; + } + else + { + switch (pType) + { + case 0: + { + fseek(fid, sizeof(int), SEEK_CUR); + break; + } + case 1: + { + fseek(fid, sizeof(double), SEEK_CUR); + break; + } + case 2: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(char) * skiplen, SEEK_CUR); + break; + } + case 4: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(double) * skiplen /8, SEEK_CUR); + break; + } + case 5: + { + int skiplen = 0; + fread(&skiplen, sizeof(int), 1, fid); + fseek(fid, sizeof(int) * skiplen /4, SEEK_CUR); + break; + } + break; + } + } + // reset all the temporary variables + pName_len = 0; + memset(pName, 0, 128*sizeof(char)); + pType = 0; + } + + } + + // ********* END of XIM Reading: Close the File Pointer******* // + if (fclose(fid)) + { + printf("The file `crt_fopen.c' was not closed\n"); + getchar(); + exit(1); + } + + return 1; +} diff --git a/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip b/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip new file mode 100644 index 00000000..e38db7d9 --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip @@ -0,0 +1,126 @@ 
+/*------------------------------------------------------------------------- + * + * MATLAB MEX functions for Random Number Generator. Check inputs and parses + * MATLAB data to C++ data. + * + * + * CODE by Tomoyuki SADAKANE + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include +/** + * MEX gateway + * AddNoise(Im, mu, sigma, "gpuids", gpuids); + * poissrnd(Im)+randn(size(Im)).*sigma + mu; + */ + +void mexFunction(int nlhs, mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + size_t uiLen = 0; + float fGaussMu = 0; + float fGaussSigma = 0; + + GpuIds gpuids; + if (nrhs==5) { + size_t iM = mxGetM(prhs[4]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:RNG:unknown","5th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[4]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "CBCT:MEX:RNG:unknown","5th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[4]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } else { + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs < 3) { + mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "At least three input argumet required."); + } else if (nrhs==3 || nrhs==5){ + size_t mrows = mxGetM(prhs[1]); + size_t ncols = mxGetN(prhs[1]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "2nd parameter should be 1x1"); + } + mrows = mxGetM(prhs[2]); + ncols = mxGetN(prhs[2]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "3rd parameter should be 1x1"); + } + fGaussMu = (float)mxGetScalar(prhs[1]); + fGaussSigma = (float)mxGetScalar(prhs[2]); + } else if (nrhs>4) { + mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "Too many input arguments"); + } + /////////////// First input argumet. + // First input should be an array, whose elements are lambda. 
+ mxArray const * const image = prhs[0]; + float* pfLambdas = static_cast(mxGetData(image)); + mwSize const numDims = mxGetNumberOfDimensions(image); // get dim of image + const mwSize *size_img= mxGetDimensions(image); //get size of image + uiLen = size_img[0]; // calculate the total length + for (int iI = 1; iI < numDims; ++iI) { + uiLen *= size_img[iI]; + } + ////////////// + //prepare outputs + // Allocte output image + plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); + float *imgout =(float*) mxGetPr(plhs[0]); + // call CUDA rng + poisson_gaussian_1d(pfLambdas, uiLen, fGaussMu, fGaussSigma, imgout, gpuids); +} diff --git a/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip new file mode 100644 index 00000000..da78bfce --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip @@ -0,0 +1,367 @@ + +/*------------------------------------------------------------------------- + * + * MATLAB MEX gateway for backprojection + * + * This file gets the data from MATLAB, checks it for errors and then + * parses it to C and calls the relevant C/CUDA functions. + * + * CODE by Ander Biguri + * + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + + + +/** + * MEX gateway + * + * This function takes data from MATLAB and passes it to the MEX code. + * It checks and casts the inputs and prepares teh outputs for MATLAB. + * + * + */ + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]){ + + //Check amount of inputs + if (nrhs != 5) { + mexErrMsgIdAndTxt("CBCT:MEX:Atb:InvalidInput", "Wrong number of inputs provided"); + } + //////////////////////////// + // 5th argument is array of GPU-IDs. 
+ GpuIds gpuids; + { + size_t iM = mxGetM(prhs[4]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:Atb:unknown","5th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[4]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "CBCT:MEX:Atb:unknown","5th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[4]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } + + /* + ** 4th argument is matched or un matched. + */ + bool pseudo_matched=false; // Caled krylov, because I designed it for krylov case.... + /* copy the string data from prhs[0] into a C string input_ buf. */ + char *krylov = mxArrayToString(prhs[3]); + if (!strcmp(krylov,"matched")) // if its 0, they are the same + pseudo_matched=true; + + /* + ** Third argument: angle of projection. + */ + size_t mrows,nangles; + + mrows = mxGetM(prhs[2]); + nangles = mxGetN(prhs[2]); + + + mxArray const * const ptrangles=prhs[2]; + + + double const * const anglesM= static_cast(mxGetData(ptrangles)); + // just copy paste the data to a float array + float * angles= (float*)malloc(nangles*mrows*sizeof(float)); + for (int i=0;i1) && !(numDims==2 && nangles==1) ){ + mexErrMsgIdAndTxt("CBCT:MEX:Atb:InvalidInput", "Projection data is not the right size"); + } + if( !mxIsSingle(prhs[0])) { + mexErrMsgIdAndTxt("CBCT:MEX:Ax:InvalidInput", + "Input image must be a single noncomplex array."); + } + // Now that input is ok, parse it to C data types. + // NOTE: while Number of dimensions is the size of the matrix in Matlab, the data is 1D row-wise mayor. 
+ + // We need a float image, and, unfortunately, the only way of casting it is by value +// const mwSize *size_proj= mxGetDimensions(image); //get size of image +// mrows = mxGetM(image); +// nangles = mxGetN(image); +// size_t size_proj2; +// if (nangles==1) +// size_proj2=1; +// else +// size_proj2=size_proj[2]; + + + float * projections= static_cast(mxGetData(image)); + + + + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /** + * Second input: Geometry structure + */ + mxArray * geometryMex=(mxArray*)prhs[1]; + + // IMPORTANT-> Make sure Matlab creates the struct in this order. + const char *fieldnames[14]; + fieldnames[0] = "nVoxel"; + fieldnames[1] = "sVoxel"; + fieldnames[2] = "dVoxel"; + fieldnames[3] = "nDetector"; + fieldnames[4] = "sDetector"; + fieldnames[5] = "dDetector"; + fieldnames[6] = "DSD"; + fieldnames[7] = "DSO"; + fieldnames[8] = "offOrigin"; + fieldnames[9] = "offDetector"; + fieldnames[10]= "accuracy"; + fieldnames[11]= "mode"; + fieldnames[12]= "COR"; + fieldnames[13]= "rotDetector"; + // Make sure input is structure + + mxArray *tmp; + + // Now we know that all the input struct is good! Parse it from mxArrays to + // C structures that MEX can understand. 
+ + double * nVoxel, *nDetec; //we need to cast these to int + double * sVoxel, *dVoxel,*sDetec,*dDetec, *DSO, *DSD,*offOrig,*offDetec; + double *acc, *COR,*rotDetector; + const char* mode; + bool coneBeam=true; + Geometry geo; + int c; + geo.unitX=1;geo.unitY=1;geo.unitZ=1; + for(int ifield=0; ifield<14; ifield++) { + tmp=mxGetField(geometryMex,0,fieldnames[ifield]); + if(tmp==NULL){ + //tofix + continue; + } + switch(ifield){ + case 0: + nVoxel=(double *)mxGetData(tmp); + // copy data to MEX memory + geo.nVoxelX=(int)nVoxel[0]; + geo.nVoxelY=(int)nVoxel[1]; + geo.nVoxelZ=(int)nVoxel[2]; + break; + case 1: + sVoxel=(double *)mxGetData(tmp); + geo.sVoxelX=(float)sVoxel[0]; + geo.sVoxelY=(float)sVoxel[1]; + geo.sVoxelZ=(float)sVoxel[2]; + break; + case 2: + dVoxel=(double *)mxGetData(tmp); + geo.dVoxelX=(float)dVoxel[0]; + geo.dVoxelY=(float)dVoxel[1]; + geo.dVoxelZ=(float)dVoxel[2]; + break; + case 3: + nDetec=(double *)mxGetData(tmp); + geo.nDetecU=(int)nDetec[0]; + geo.nDetecV=(int)nDetec[1]; + break; + case 4: + sDetec=(double *)mxGetData(tmp); + geo.sDetecU=(float)sDetec[0]; + geo.sDetecV=(float)sDetec[1]; + break; + case 5: + dDetec=(double *)mxGetData(tmp); + geo.dDetecU=(float)dDetec[0]; + geo.dDetecV=(float)dDetec[1]; + break; + case 6: + geo.DSD=(float*)malloc(nangles * sizeof(float)); + DSD=(double *)mxGetData(tmp); + for (int i=0;i +#include +#include +#include +#include +#include +#include +#include +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ +///////// First check if the amount of inputs is right. 
+ int maxIter; + float alpha; + GpuIds gpuids; + if (nrhs==5) { + size_t iM = mxGetM(prhs[4]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[4]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[4]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } else { + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs==1){ + maxIter=100; + alpha=15.0f; + } else if (nrhs==2){ + mexErrMsgIdAndTxt("err", "Only 1 POCS hyperparameter inputted"); + } else if (nrhs==4 || nrhs==5){ + size_t mrows = mxGetM(prhs[1]); + size_t ncols = mxGetN(prhs[1]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1"); + } + mrows = mxGetM(prhs[2]); + ncols = mxGetN(prhs[2]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1"); + } + alpha= (float)(mxGetScalar(prhs[1])); + maxIter=(int)floor(mxGetScalar(prhs[2])+0.5); + } else { + mexErrMsgIdAndTxt("err", "Too many input arguments"); + } + float delta=(float)(mxGetScalar(prhs[3])); +////////////////////////// First input. + // First input should be x from (Ax=b), or the image. + mxArray const * const image = prhs[0]; + mwSize const numDims = mxGetNumberOfDimensions(image); + mwSize third_dim = 1; + + // Now that input is ok, parse it to C data types. 
+ float * img = static_cast(mxGetData(image)); + const mwSize *size_img= mxGetDimensions(image); //get size of image + + // Image should be dim 3 + if (numDims==3){ + third_dim = size_img[2]; + } + + // Allocte output image + plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); + float *imgout =(float*) mxGetPr(plhs[0]); + // call C function with the CUDA denoising + + const long imageSize[3]={size_img[0], size_img[1], third_dim }; + + aw_pocs_tv(img,imgout, alpha, imageSize, maxIter, delta, gpuids); + + //prepareotputs +} diff --git a/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip new file mode 100644 index 00000000..3c6f3670 --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip @@ -0,0 +1,338 @@ +/*------------------------------------------------------------------------- + * + * MATLAB MEX gateway for projection + * + * This file gets the data from MATLAB, checks it for errors and then + * parses it to C and calls the relevant C/CUDA functions. + * + * CODE by Ander Biguri + * + * --------------------------------------------------------------------------- + * --------------------------------------------------------------------------- + * Copyright (c) 2015, University of Bath and CERN- European Organization for + * Nuclear Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * --------------------------------------------------------------------------- + * + * Contact: tigre.toolbox@gmail.com + * Codes : https://github.com/CERN/TIGRE + * --------------------------------------------------------------------------- + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * MEX gateway + */ + + + +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ +// clock_t begin, end; +// begin = clock(); + + + //Check amount of inputs + if (nrhs != 5) { + mexErrMsgIdAndTxt("CBCT:MEX:Ax:InvalidInput", "Invalid number of inputs to MEX file."); + } + //////////////////////////// + // 5th argument is array of GPU-IDs. 
+ GpuIds gpuids; + { + size_t iM = mxGetM(prhs[4]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","5th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[4]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","5th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[4]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } + //////////////////////////// + // 4th argument is interpolated or ray-voxel/Siddon + bool rayvoxel=false; + if ( mxIsChar(prhs[3]) != 1) + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:InvalidInput","4rd input should be a string"); + + /* copy the string data from prhs[0] into a C string input_ buf. */ + char *krylov = mxArrayToString(prhs[3]); + if (strcmp(krylov,"interpolated") && strcmp(krylov,"Siddon") && strcmp(krylov,"ray-voxel")) + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:InvalidInput","4rd input should be either 'interpolated' or 'Siddon'"); + else + // If its not ray-voxel, its "interpolated" + if (strcmp(krylov,"Siddon") == 0 || strcmp(krylov,"ray-voxel") == 0) //strcmp returs 0 if they are equal + rayvoxel=true; + ///////////////////////// 3rd argument: angle of projection. + + size_t mrows = mxGetM(prhs[2]); + size_t nangles = mxGetN(prhs[2]); + + mxArray const * const ptrangles=prhs[2]; + + + double const * const anglesM= static_cast(mxGetData(ptrangles)); + // just copy paste the data to a float array + float * angles= (float*)malloc(nangles*mrows*sizeof(float)); + for (int i=0;i(mxGetData(image)); + // We need a float image, and, unfortunately, the only way of casting it is by value + const mwSize *size_img= mxGetDimensions(image); //get size of image + + + + ///////////////////// Second input argument, + // Geometry structure that has all the needed geometric data. + + + mxArray * geometryMex=(mxArray*)prhs[1]; + + // IMPORTANT-> Make sure Matlab creates the struct in this order. 
+ const char *fieldnames[14]; + fieldnames[0] = "nVoxel"; + fieldnames[1] = "sVoxel"; + fieldnames[2] = "dVoxel"; + fieldnames[3] = "nDetector"; + fieldnames[4] = "sDetector"; + fieldnames[5] = "dDetector"; + fieldnames[6] = "DSD"; + fieldnames[7] = "DSO"; + fieldnames[8] = "offOrigin"; + fieldnames[9] = "offDetector"; + fieldnames[10]= "accuracy"; + fieldnames[11]= "mode"; + fieldnames[12]= "COR"; + fieldnames[13]= "rotDetector"; + + // Now we know that all the input struct is good! Parse it from mxArrays to + // C structures that MEX can understand. + double * nVoxel, *nDetec; //we need to cast these to int + double * sVoxel, *dVoxel,*sDetec,*dDetec, *DSO, *DSD; + double *offOrig,*offDetec,*rotDetector; + double * acc, *COR; + const char* mode; + int c; + mxArray *tmp; + Geometry geo; + geo.unitX=1;geo.unitY=1;geo.unitZ=1; + bool coneBeam=true; +// mexPrintf("%d \n",nfields); + for(int ifield=0; ifield<14; ifield++) { + tmp=mxGetField(geometryMex,0,fieldnames[ifield]); + if(tmp==NULL){ + //tofix + continue; + } + switch(ifield){ + case 0: + nVoxel=(double *)mxGetData(tmp); + // copy data to MEX memory + geo.nVoxelX=(int)nVoxel[0]; + geo.nVoxelY=(int)nVoxel[1]; + geo.nVoxelZ=(int)nVoxel[2]; + break; + case 1: + sVoxel=(double *)mxGetData(tmp); + geo.sVoxelX=(float)sVoxel[0]; + geo.sVoxelY=(float)sVoxel[1]; + geo.sVoxelZ=(float)sVoxel[2]; + break; + case 2: + dVoxel=(double *)mxGetData(tmp); + geo.dVoxelX=(float)dVoxel[0]; + geo.dVoxelY=(float)dVoxel[1]; + geo.dVoxelZ=(float)dVoxel[2]; + break; + case 3: + nDetec=(double *)mxGetData(tmp); + geo.nDetecU=(int)nDetec[0]; + geo.nDetecV=(int)nDetec[1]; + break; + case 4: + sDetec=(double *)mxGetData(tmp); + geo.sDetecU=(float)sDetec[0]; + geo.sDetecV=(float)sDetec[1]; + break; + case 5: + dDetec=(double *)mxGetData(tmp); + geo.dDetecU=(float)dDetec[0]; + geo.dDetecV=(float)dDetec[1]; + break; + case 6: + geo.DSD=(float*)malloc(nangles * sizeof(float)); + DSD=(double *)mxGetData(tmp); + for (int i=0;i +#include +#include 
+#include +#include +#include +#include +#include +// #include +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ +///////// First check if the amount of imputs is rigth. + int maxIter; + float alpha; + float ratio; + GpuIds gpuids; + if (nrhs<5) + mexErrMsgIdAndTxt("TIGRE:minPICCS", "At least 2 inputs needed: Image and prior image"); + if (nrhs>6){ + mexErrMsgIdAndTxt("TIGRE:minPICCS", "Too many imput argumets"); + } + if (nrhs==6){ + size_t mrows = mxGetM(prhs[2]); + size_t ncols = mxGetN(prhs[2]); + if (mrows!=1 || ncols !=1) + mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); + mrows = mxGetM(prhs[3]); + ncols = mxGetN(prhs[3]); + if (mrows!=1 || ncols !=1) + mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); + mrows = mxGetM(prhs[4]); + ncols = mxGetN(prhs[4]); + if (mrows!=1 || ncols !=1) + mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); + alpha= (float)(mxGetScalar(prhs[2])); + maxIter=(int)floor(mxGetScalar(prhs[3])+0.5); + ratio= (float)(mxGetScalar(prhs[4])); + + size_t uiGpuCount = mxGetN(prhs[5]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "TIGRE:minPICCS","6th parameter must be a row vector"); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[5]); + gpuids.SetIds(uiGpuCount, piGpuIds); + }else{ + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs==2){ + maxIter=100; + alpha=15.0f; + ratio=0.5; + } + + +////////////////////////// First input. + // First input should be x from (Ax=b), or the image. 
+ mxArray const * const image = prhs[0]; + mwSize const numDims = mxGetNumberOfDimensions(image); + if (numDims!=3){ + mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image is not 3D"); + } + mxArray const * const prior_mex = prhs[1]; + mwSize const numDims_prior = mxGetNumberOfDimensions(image); + if (numDims_prior!=3){ + mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image is not 3D"); + } + if(numDims_prior!=numDims) + mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image and prior are not the same size"); + // Image should be dim 3 + + // Now that input is ok, parse it to C data types. + float const * const img = static_cast(mxGetData(image)); + float const * const prior = static_cast(mxGetData(prior_mex)); + const mwSize *size_img= mxGetDimensions(image); //get size of image + + + // Allocte output image + const long imageSize[3]={size_img[0] ,size_img[1],size_img[2] }; + plhs[0] = mxCreateNumericArray(3,size_img, mxSINGLE_CLASS, mxREAL); + float *imgout =(float*) mxGetPr(plhs[0]); + + + piccs_tv(img,prior,imgout, alpha,ratio, imageSize, maxIter,gpuids); + + + +} \ No newline at end of file diff --git a/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip b/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip new file mode 100644 index 00000000..da60446c --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip @@ -0,0 +1,132 @@ +/* +/*------------------------------------------------------------------------- + * + * MATLAB MEX gateway for Total variation minimization via Steepest descend + * + * This file gets the data from MATLAB, checks it for errors and then + * parses it to C and calls the relevant C/CUDA functions. + * + * CODE by Ander Biguri + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + + +#include +#include +#include +#include +#include +#include +#include +#include +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ +///////// First check if the amount of inputs is right. 
+ int maxIter; + float alpha; + GpuIds gpuids; + if (nrhs==4) { + size_t iM = mxGetM(prhs[3]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[3]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "TIGRE:minTV","4th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[3]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } else { + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs==1){ + maxIter=100; + alpha=15.0f; + } else if (nrhs==2){ + mexErrMsgIdAndTxt("minTV:mex", "Only 1 POCS hyperparameter inputted"); + } else if (nrhs==3 || nrhs==4){ + size_t mrows = mxGetM(prhs[1]); + size_t ncols = mxGetN(prhs[1]); + if (mrows!=1 || ncols !=1) + mexErrMsgIdAndTxt("minTV:mex", "POCS parameters should be 1x1"); + mrows = mxGetM(prhs[2]); + ncols = mxGetN(prhs[2]); + if (mrows!=1 || ncols !=1) + mexErrMsgIdAndTxt("minTV:mex", "POCS parameters should be 1x1"); + alpha= (float)(mxGetScalar(prhs[1])); + maxIter=(int)floor(mxGetScalar(prhs[2])+0.5); + } else { + mexErrMsgIdAndTxt("minTV:mex", "Too many input arguments"); + } + +////////////////////////// First input. + // First input should be x from (Ax=b), or the image. + mxArray const * const image = prhs[0]; + mwSize const numDims = mxGetNumberOfDimensions(image); + mwSize third_dim = 1; + + + // Now that input is ok, parse it to C data types. 
+ float * img = static_cast(mxGetData(image)); + const mwSize *size_img = mxGetDimensions(image); //get size of image + + // Image should be dim 3 + if (numDims==3){ + third_dim = size_img[2]; + } + + // Allocte output image + const long imageSize[3]={size_img[0] ,size_img[1], third_dim }; + plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); + float *imgout =(float*) mxGetPr(plhs[0]); + + pocs_tv(img,imgout, alpha, imageSize, maxIter, gpuids); +} diff --git a/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip new file mode 100644 index 00000000..1142a5f7 --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip @@ -0,0 +1,124 @@ +/*-------------------------------------------------------------------------- +-------------------------------------------------------------------------- + This file is part of the TIGRE Toolbox + + Copyright (c) 2015, University of Bath and + CERN-European Organization for Nuclear Research + All rights reserved. + + License: Open Source under BSD. + See the full license at + https://github.com/CERN/TIGRE/blob/master/LICENSE + + Contact: tigre.toolbox@gmail.com + Codes: https://github.com/CERN/TIGRE/ + Coded by: Stefanie Kaser, Benjamin Kirchmayer +--------------------------------------------------------------------------*/ + +#include "mex.h" +#include "CUDA/improvedForwardProjections.hpp" +#include +#include +#include + + +void mexFunction(int nlhs, mxArray *plhs[], int nrhs,const mxArray *prhs[]){ + + if (nrhs =! 
7){ + mexErrMsgIdAndTxt("CS Projections:", "Check Number of Input arguments!"); + } + + float *posIn, *posOut, *dirIn, *dirOut; + float *Wepl, *pixelSize, *detectorDistanceIn, *detectorDistanceOut, *initEnergy; + + //Load parameters + posIn = (float *)(mxGetPr(prhs[0])); + posOut = (float *)mxGetPr(prhs[1]); + dirIn = (float *)mxGetPr(prhs[2]); + dirOut = (float *)mxGetPr(prhs[3]); + Wepl = (float*) mxGetPr(prhs[4]); + initEnergy = (float*) mxGetPr(prhs[5]); + + //Get Number of Protons contained in the root files + int numOfProtons = (int) mxGetM(prhs[4]); + + mxArray * geometryMex=(mxArray*)prhs[6]; + + const char *fieldnames_geo[7]; + fieldnames_geo[0] = "dDetector"; + fieldnames_geo[1] = "DSD"; + fieldnames_geo[2] = "DSID"; + fieldnames_geo[3] = "DSO"; + fieldnames_geo[4] = "hull"; + fieldnames_geo[5] = "sDetector"; + fieldnames_geo[6] = "mode"; + + double * pix0, *dsd0, *dsid0, *hull0, *det0, *dso0; + float pix[2], dsd, dsid, dso, hull[4], det[2]; + const char* mode; + bool coneBeam = true; + mxArray *tmp; + for (int ifield=0; ifield<7; ifield++){ + tmp=mxGetField(geometryMex,0,fieldnames_geo[ifield]); + switch(ifield){ + case 0: + pix0 =(double *)mxGetData(tmp); + pix[0] = (float)pix0[0]; + pix[1] = (float)pix0[1]; + break; + case 1: + dsd0 =(double *)mxGetData(tmp); + dsd = (float)dsd0[0]; + break; + case 2: + dsid0 =(double *)mxGetData(tmp); + dsid = (float)dsid0[0]; + break; + case 3: + dso0 =(double *)mxGetData(tmp); + dso = (float)dso0[0]; + break; + case 4: + hull0 =(double *)mxGetData(tmp); + hull[0] = (float)hull0[0]; + hull[1] = (float)hull0[1]; + hull[2] = (float)hull0[2]; + hull[3] = (float)hull0[3]; + break; + case 5: + det0 =(double *)mxGetData(tmp); + det[0] = (float)det0[0]; + det[1] = (float)det0[1]; + break; + case 6: + mode=""; + mode=mxArrayToString(tmp); + if (!strcmp(mode,"parallel")) + coneBeam=false; + break; + } + } + + + if (hull[3] == 0){std::cout << "Info: Calculation of optimized proton radiographies will be performed without object 
hull!" << std::endl;} + + if (hull[2] > 6.28318530717958648){std::cout << "Info: Hull rotation angle exceeds 2 Pi. Please check the input! Continuing with calculation..." << std::endl;} + + mwSize outSize[2]; + outSize[0] = int(det[1]/pix[1]); + outSize[1] = int(det[0]/pix[0]); + plhs[0] = mxCreateNumericArray(2, outSize, mxSINGLE_CLASS, mxREAL); + float *outProjections = (float*)mxGetPr(plhs[0]); + + //For Calculation 2 historgrams are needed + // + if(coneBeam == false){ + std::cout << "Info: Parallel geometry selected..." << std::endl; + ParticleProjections(outProjections, posIn, posOut, dirIn, dirOut, Wepl, numOfProtons, int(det[0]/pix[0]), int(det[1]/pix[1]), pix, dsid-dso, dsd-dso, *initEnergy, hull); + } + else{ + std::cout << "Info: Cone beam geometry selected..." << std::endl; + ParticleProjectionsCone(outProjections, posIn, posOut, dirIn, dirOut, Wepl, numOfProtons, int(det[0]/pix[0]), int(det[1]/pix[1]), pix, dsid-dso, dsd-dso, -1*dso, *initEnergy, hull); + } + +} diff --git a/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip b/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip new file mode 100644 index 00000000..f905bcbd --- /dev/null +++ b/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip @@ -0,0 +1,147 @@ +/*------------------------------------------------------------------------- + * + * MATLAB MEX functions for TV image denoising. Check inputs and parses + * MATLAB data to C++ data. + * + * + * CODE by Ander Biguri + * +--------------------------------------------------------------------------- +--------------------------------------------------------------------------- +Copyright (c) 2015, University of Bath and CERN- European Organization for +Nuclear Research +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ --------------------------------------------------------------------------- + +Contact: tigre.toolbox@gmail.com +Codes : https://github.com/CERN/TIGRE +--------------------------------------------------------------------------- + */ + + + + + + +#include +#include +#include +#include +#include +#include +#include +#include +/** + * MEX gateway + */ +void mexFunction(int nlhs , mxArray *plhs[], + int nrhs, mxArray const *prhs[]) +{ + int maxIter; + float lambda; + GpuIds gpuids; + if (nrhs==4) { + size_t iM = mxGetM(prhs[3]); + if (iM != 1) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); + return; + } + size_t uiGpuCount = mxGetN(prhs[3]); + if (uiGpuCount == 0) { + mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); + return; + } + int* piGpuIds = (int*)mxGetData(prhs[3]); + gpuids.SetIds(uiGpuCount, piGpuIds); + } else { + int iGpuCount = GetGpuCount(); + int* piDev = (int*)malloc(iGpuCount * sizeof(int)); + for (int iI = 0; iI < iGpuCount; ++iI) { + piDev[iI] = iI; + } + gpuids.SetIds(iGpuCount, piDev); + free(piDev); piDev = 0; + } + if (nrhs == 0) { + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "At least one input argumet required."); + } else if (nrhs==1){ + maxIter=100; + lambda=15.0f; + } else if (nrhs==2){ + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Only 1 TV hyperparameter inputted"); + } else if (nrhs==3 || nrhs==4){ + size_t mrows = mxGetM(prhs[1]); + size_t ncols = mxGetN(prhs[1]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "TV parameters should be 1x1"); + } + mrows = mxGetM(prhs[2]); + ncols = mxGetN(prhs[2]); + if (mrows!=1 || ncols !=1) { + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "TV parameters should be 1x1"); + } + lambda= (float)(mxGetScalar(prhs[1])); + maxIter=(int)round(mxGetScalar(prhs[2])); + } else if (nrhs>4) { + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Too many input arguments"); + } + ////////////////////////// First input. 
+ // First input should be x from (Ax=b), or the image. + mxArray const * const image = prhs[0]; + mwSize const numDims = mxGetNumberOfDimensions(image); + + // Image should be dim 3 + if (numDims!=3){ + mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Image is not 3D"); + } + // Now that input is ok, parse it to C data types. + float * img = static_cast(mxGetData(image)); + // We need a float image, and, unfortunately, the only way of casting it is by value + const mwSize *size_img= mxGetDimensions(image); //get size of image + + ////////////// + //prepareotputs + plhs[0] = mxCreateNumericArray(3,size_img, mxSINGLE_CLASS, mxREAL); + float *imgout =(float*) mxGetPr(plhs[0]); + // Allocte output image + // call C function with the CUDA denoising + const float spacing[3]={1,1,1}; + const long imageSize[3]={size_img[0] ,size_img[1],size_img[2] }; + + tvdenoising(img,imgout, lambda, spacing, imageSize, maxIter, gpuids); + + + +// memcpy(mxImgout,imgout,size_img[0] *size_img[1] *size_img[2]*sizeof(float)); + //free memory +// free(img); +// free(imgout); + + +} From 29e4f6a3b29860cf2a130081075870dfdbc3ebcb Mon Sep 17 00:00:00 2001 From: purepani Date: Wed, 19 Mar 2025 19:38:13 -0500 Subject: [PATCH 2/3] Remove prehip files --- Common/CUDA/GD_AwTV.cu.prehip | 713 ---------- Common/CUDA/GD_AwTV.hpp.prehip | 62 - Common/CUDA/GD_TV.cu.prehip | 702 ---------- Common/CUDA/GD_TV.hpp.prehip | 61 - Common/CUDA/GpuIds.cpp.prehip | 70 - Common/CUDA/GpuIds.hpp.prehip | 17 - Common/CUDA/PICCS.cu.prehip | 398 ------ Common/CUDA/PICCS.hpp.prehip | 61 - Common/CUDA/RandomNumberGenerator.cu.prehip | 193 --- Common/CUDA/RandomNumberGenerator.hpp.prehip | 49 - Common/CUDA/Siddon_projection.cu.prehip | 859 ------------ Common/CUDA/Siddon_projection.hpp.prehip | 66 - .../CUDA/Siddon_projection_parallel.cu.prehip | 540 -------- .../Siddon_projection_parallel.hpp.prehip | 65 - Common/CUDA/TIGRE_common.cpp.prehip | 20 - Common/CUDA/TIGRE_common.hpp.prehip | 24 - Common/CUDA/errors.hpp.prehip | 10 - 
Common/CUDA/gpuUtils.cu.prehip | 70 - Common/CUDA/gpuUtils.hpp.prehip | 18 - .../CUDA/improvedForwardProjections.cu.prehip | 1032 -------------- .../improvedForwardProjections.hpp.prehip | 263 ---- .../improvedForwardProjections_cone.cu.prehip | 1230 ----------------- Common/CUDA/projection.cpp.prehip | 35 - Common/CUDA/projection.hpp.prehip | 9 - .../ray_interpolated_projection.cu.prehip | 843 ----------- .../ray_interpolated_projection.hpp.prehip | 66 - ...interpolated_projection_parallel.cu.prehip | 449 ------ ...nterpolated_projection_parallel.hpp.prehip | 65 - Common/CUDA/tv_proximal.cu.prehip | 693 ---------- Common/CUDA/tv_proximal.hpp.prehip | 57 - Common/CUDA/types_TIGRE.hpp.prehip | 109 -- Common/CUDA/voxel_backprojection.cu.prehip | 920 ------------ Common/CUDA/voxel_backprojection.hpp.prehip | 59 - Common/CUDA/voxel_backprojection2.cu.prehip | 844 ----------- Common/CUDA/voxel_backprojection2.hpp.prehip | 64 - .../voxel_backprojection_parallel.cu.prehip | 627 --------- .../voxel_backprojection_parallel.hpp.prehip | 57 - .../cuda_interface/AddNoise.cpp.prehip | 126 -- .../cuda_interface/Atb_mex.cpp.prehip | 367 ----- .../cuda_interface/AwminTV.cpp.prehip | 137 -- .../cuda_interface/Ax_mex.cpp.prehip | 338 ----- .../cuda_interface/minPICCS.cpp.prehip | 147 -- .../Utilities/cuda_interface/minTV.cpp.prehip | 132 -- .../pCTCubicSpline_mex.cpp.prehip | 124 -- .../cuda_interface/tvDenoise.cpp.prehip | 147 -- 45 files changed, 12938 deletions(-) delete mode 100644 Common/CUDA/GD_AwTV.cu.prehip delete mode 100644 Common/CUDA/GD_AwTV.hpp.prehip delete mode 100644 Common/CUDA/GD_TV.cu.prehip delete mode 100644 Common/CUDA/GD_TV.hpp.prehip delete mode 100644 Common/CUDA/GpuIds.cpp.prehip delete mode 100644 Common/CUDA/GpuIds.hpp.prehip delete mode 100644 Common/CUDA/PICCS.cu.prehip delete mode 100644 Common/CUDA/PICCS.hpp.prehip delete mode 100644 Common/CUDA/RandomNumberGenerator.cu.prehip delete mode 100644 Common/CUDA/RandomNumberGenerator.hpp.prehip delete mode 
100644 Common/CUDA/Siddon_projection.cu.prehip delete mode 100644 Common/CUDA/Siddon_projection.hpp.prehip delete mode 100644 Common/CUDA/Siddon_projection_parallel.cu.prehip delete mode 100644 Common/CUDA/Siddon_projection_parallel.hpp.prehip delete mode 100644 Common/CUDA/TIGRE_common.cpp.prehip delete mode 100644 Common/CUDA/TIGRE_common.hpp.prehip delete mode 100644 Common/CUDA/errors.hpp.prehip delete mode 100644 Common/CUDA/gpuUtils.cu.prehip delete mode 100644 Common/CUDA/gpuUtils.hpp.prehip delete mode 100644 Common/CUDA/improvedForwardProjections.cu.prehip delete mode 100644 Common/CUDA/improvedForwardProjections.hpp.prehip delete mode 100644 Common/CUDA/improvedForwardProjections_cone.cu.prehip delete mode 100644 Common/CUDA/projection.cpp.prehip delete mode 100644 Common/CUDA/projection.hpp.prehip delete mode 100644 Common/CUDA/ray_interpolated_projection.cu.prehip delete mode 100644 Common/CUDA/ray_interpolated_projection.hpp.prehip delete mode 100644 Common/CUDA/ray_interpolated_projection_parallel.cu.prehip delete mode 100644 Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip delete mode 100644 Common/CUDA/tv_proximal.cu.prehip delete mode 100644 Common/CUDA/tv_proximal.hpp.prehip delete mode 100644 Common/CUDA/types_TIGRE.hpp.prehip delete mode 100644 Common/CUDA/voxel_backprojection.cu.prehip delete mode 100644 Common/CUDA/voxel_backprojection.hpp.prehip delete mode 100644 Common/CUDA/voxel_backprojection2.cu.prehip delete mode 100644 Common/CUDA/voxel_backprojection2.hpp.prehip delete mode 100644 Common/CUDA/voxel_backprojection_parallel.cu.prehip delete mode 100644 Common/CUDA/voxel_backprojection_parallel.hpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/AwminTV.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip delete mode 100644 
MATLAB/Utilities/cuda_interface/minPICCS.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/minTV.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip delete mode 100644 MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip diff --git a/Common/CUDA/GD_AwTV.cu.prehip b/Common/CUDA/GD_AwTV.cu.prehip deleted file mode 100644 index d98c13c1..00000000 --- a/Common/CUDA/GD_AwTV.cu.prehip +++ /dev/null @@ -1,713 +0,0 @@ -/*------------------------------------------------------------------------- - * - * CUDA functions for Steepest descend in POCS-type algorithms. - * - * This file will iteratively minimize by steepest descend the total variation - * of the input image, with the parameters given, using GPUs. - * - * CODE by Ander Biguri - * - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - - - - - - - -#define MAXTHREADS 1024 -#define MAX_BUFFER 60 - -#include "GD_AwTV.hpp" - - - - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("CBCT:CUDA:GD_TV",cudaGetErrorString(__err));\ - } \ -} while (0) - -// CUDA kernels -//https://stackoverflow.com/questions/21332040/simple-cuda-kernel-optimization/21340927#21340927 - __global__ void divideArrayScalar(float* vec,float scalar,const size_t n){ - unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x; - for(; i= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) - return; - - - float df[3] ={0.f,0.f,0.f}; - float dfi[3]={0.f,0.f,0.f}; // dfi== \partial f_{i+1,j,k} - float dfj[3]={0.f,0.f,0.f}; - float dfk[3]={0.f,0.f,0.f}; - gradient(f,df ,z ,y ,x , depth,rows,cols); - gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); - 
gradient(f,dfj ,z ,y+1,x , depth,rows,cols); - gradient(f,dfk ,z+1,y ,x , depth,rows,cols); - float eps=0.00000001; //% avoid division by zero - - float wx=__expf(-(df[0]/delta)*(df[0]/delta)); - float wy=__expf(-(df[1]/delta)*(df[1]/delta)); - float wz=__expf(-(df[2]/delta)*(df[2]/delta)); - - float wxi=__expf(-(dfi[0]/delta)*(dfi[0]/delta)); - float wyi=__expf(-(dfi[1]/delta)*(dfi[1]/delta)); - float wzi=__expf(-(dfi[2]/delta)*(dfi[2]/delta)); - - float wxj=__expf(-(dfj[0]/delta)*(dfj[0]/delta)); - float wyj=__expf(-(dfj[1]/delta)*(dfj[1]/delta)); - float wzj=__expf(-(dfj[2]/delta)*(dfj[2]/delta)); - - float wxk=__expf(-(dfk[0]/delta)*(dfk[0]/delta)); - float wyk=__expf(-(dfk[1]/delta)*(dfk[1]/delta)); - float wzk=__expf(-(dfk[2]/delta)*(dfk[2]/delta)); - - - // this hsould do the trick I think - - dftv[idx]=(wx*df[0]+wy*df[1]+wz*df[2])/(sqrt(wx*df[0] *df[0] +wy*df[1] *df[1] +wz*df[2] *df[2])+eps) - -wzi*dfi[2]/(sqrt(wxi*dfi[0]*dfi[0]+wyi*dfi[1]*dfi[1]+wzi*dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. 
- -wyj*dfj[1]/(sqrt(wxj*dfj[0]*dfj[0]+wyj*dfj[1]*dfj[1]+wzj*dfj[2]*dfj[2]) +eps) - -wxk*dfk[0]/(sqrt(wxk*dfk[0]*dfk[0]+wyk*dfk[1]*dfk[1]+wzk*dfk[2]*dfk[2]) +eps); - - - return; - - } - - __device__ void warpReduce(volatile float *sdata, size_t tid) { - sdata[tid] += sdata[tid + 32]; - sdata[tid] += sdata[tid + 16]; - sdata[tid] += sdata[tid + 8]; - sdata[tid] += sdata[tid + 4]; - sdata[tid] += sdata[tid + 2]; - sdata[tid] += sdata[tid + 1]; - } - - __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - float value=0; - while (i < n) { - value=g_idata[i]; //avoid reading twice - mySum += value*value; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if (tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - - __global__ void reduceSum(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - // float value=0; - while (i < n) { - mySum += g_idata[i]; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - 
sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if (tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - - - - -// main function -void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int maxIter,const float delta, const GpuIds& gpuids){ - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPUselect","There are no available device(s) that support CUDA\n"); - } - // - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); - } - int dev; - - // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. 
- // check free memory - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - - - - // %5 of free memory should be enough, we have almost no variables in these kernels - size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; - size_t mem_slice_image = sizeof(float)* image_size[0] * image_size[1] ; - size_t mem_size_image = sizeof(float)* total_pixels; - size_t mem_auxiliary = sizeof(float)* (total_pixels + MAXTHREADS - 1) / MAXTHREADS; - - // Decide how are we handling the distribution of computation - size_t mem_img_each_GPU; - - unsigned int buffer_length=2; - //Does everything fit in the GPU? - unsigned int slices_per_split; - - // if it is a thin problem (no need to split), just use one GPU - if (image_size[2]<4){deviceCount=1;} - - unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. - if(mem_GPU_global> 3*mem_size_image+3*(deviceCount-1)*mem_slice_image*buffer_length+mem_auxiliary) { - // We only need to split if we have extra GPUs - slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; - mem_img_each_GPU=mem_slice_image*((slices_per_split+buffer_length*2)); - }else{ - // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all - size_t mem_free=mem_GPU_global-mem_auxiliary; - - splits=(unsigned int)(ceil(((float)(3*mem_size_image)/(float)(deviceCount))/mem_free)); - // Now, there is an overhead here, as each splits should have 2 slices more, to account for overlap of images. - // lets make sure these 2 slices fit, if they do not, add 1 to splits. 
- slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - - // if the new stuff does not fit in the GPU, it means we are in the edge case where adding that extra slice will overflow memory - if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ - // one more split should do the job, as its an edge case. - splits++; - //recompute for later - slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - } - - - // How many EXTRA buffer slices should be able to fit in here??!?! - // Only do it if there are splits needed. - if(splits>1){ - mem_free=mem_GPU_global-(3*mem_img_each_GPU+mem_auxiliary); - unsigned int extra_buff=(mem_free/mem_slice_image); - buffer_length=(extra_buff/2)/3; // we need double whatever this results in, rounded down. - buffer_length=max(buffer_length,2);// minimum 2 - buffer_length=min(MAX_BUFFER,buffer_length); - - mem_img_each_GPU=mem_slice_image*(slices_per_split+buffer_length*2); - - }else{ - buffer_length=2; - } - - // Assert - if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ - mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPU","Assertion Failed. Logic behind splitting flawed! Please tell: ander.biguri@gmail.com\n"); - } - } - - - // Assert - - if ((slices_per_split+buffer_length*2)*image_size[0]*image_size[1]* sizeof(float)!= mem_img_each_GPU){ - mexErrMsgIdAndTxt("minimizeAwTV:GD_AwTV:GPU","Assertion Failed. Memory needed calculation broken! 
Please tell: ander.biguri@gmail.com\n"); - } - - - - - - - float** d_image= (float**)malloc(deviceCount*sizeof(float*)); - float** d_dimgTV= (float**)malloc(deviceCount*sizeof(float*)); - float** d_norm2aux= (float**)malloc(deviceCount*sizeof(float*)); - float** d_norm2= (float**)malloc(deviceCount*sizeof(float*)); - - // allocate memory in each GPU - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - - cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); - cudaMemset( d_image[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); - cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); - cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); - cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); - cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); - cudaCheckErrors("Malloc error"); - - - } - unsigned long long buffer_pixels=buffer_length*image_size[0]*image_size[1]; - float* buffer; - if(splits>1){ - mexWarnMsgIdAndTxt("minimizeAwTV:GD_AwTV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); - }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); - } - - - - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
- int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // splits>2 is completely empirical observation - if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); - - - - // Create streams - int nStream_device=2; - int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); - } - } - cudaCheckErrors("Stream creation fail"); - - - // For the reduction - - double totalsum_prev; - double totalsum; - float sum_curr_spl; - float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); - - unsigned int curr_slices; - unsigned long long curr_pixels; - size_t linear_idx_start; - unsigned long long* offset_device=(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - unsigned long long* offset_host =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - unsigned long long* bytes_device =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - bool is_first_chunk; - bool is_last_chunk; - for(unsigned int i=0;i1){ - totalsum_prev=0; - } - for(unsigned int sp=0;sp1 & i>0){ - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); - - - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - } - 
cudaCheckErrors("Memcpy failure on multi split"); - - for(unsigned int ib=0; (ib<(buffer_length-1)) && ((i+ib)>>(d_image[dev],d_dimgTV[dev],(long)(curr_slices+buffer_length*2-1), image_size[1],image_size[0],delta); - - } - - - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); - - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); - } - else { - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); - } - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Reduction error"); - - - // Accumulate the norm accross devices - sum_curr_spl=0; - // this is CPU code - for (dev = 0; dev < deviceCount; dev++){ - sum_curr_spl+=sumnorm2[dev]; - } - sum_curr_spl+=0.0000001f; // avoid division by zero - - // If we have more than one splits, lets use the result from prior calls - if(i>0 && splits>1){ - // this is already stored: - //totalsum=totalsum_prev; - }else{ - totalsum=sum_curr_spl; - } - - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,(float)sqrt(totalsum),total_pixels); - //MULTIPLY HYPERPARAMETER - multiplyArrayScalar<<<60,MAXTHREADS,0,stream[dev*nStream_device]>>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); - } - for (dev = 0; dev < deviceCount; dev++){ - 
cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Scalar operations error"); - - //SUBSTRACT GRADIENT - ////////////////////////////////////////////// - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_image[dev]+buffer_pixels,d_dimgTV[dev]+buffer_pixels, total_pixels); - } - } - - // Synchronize mathematics, make sure bounding pixels are correct - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - - if(splits==1){ - for(dev=0; dev0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); - } - } - }else{ - - // We need to take it out :( - for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); - } - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; - cudaCheckErrors("Memory free"); -// cudaDeviceReset(); - } - -void checkFreeMemory(const GpuIds& gpuids, size_t *mem_GPU_global){ - size_t memfree; - size_t memtotal; - const int deviceCount = gpuids.GetLength(); - for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); - if(dev==0) *mem_GPU_global=memfree; - if(memfree= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) - return; - - - float df[3] ={0.f,0.f,0.f}; - float dfi[3]={0.f,0.f,0.f}; // dfi== \partial f_{i+1,j,k} - float dfj[3]={0.f,0.f,0.f}; - float dfk[3]={0.f,0.f,0.f}; - gradient(f,df ,z ,y ,x , depth,rows,cols); - gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); - gradient(f,dfj ,z ,y+1,x , depth,rows,cols); - gradient(f,dfk ,z+1,y ,x , depth,rows,cols); - float eps=0.00000001; //% avoid division by zero - - dftv[idx]=(df[0]+df[1]+df[2])/(sqrt(df[0] *df[0] +df[1] *df[1] +df[2] 
*df[2])+eps) - -dfi[2]/(sqrt(dfi[0]*dfi[0]+dfi[1]*dfi[1]+dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. - -dfj[1]/(sqrt(dfj[0]*dfj[0]+dfj[1]*dfj[1]+dfj[2]*dfj[2]) +eps) - -dfk[0]/(sqrt(dfk[0]*dfk[0]+dfk[1]*dfk[1]+dfk[2]*dfk[2]) +eps); - return; - - } - - __device__ void warpReduce(volatile float *sdata, size_t tid) { - sdata[tid] += sdata[tid + 32]; - sdata[tid] += sdata[tid + 16]; - sdata[tid] += sdata[tid + 8]; - sdata[tid] += sdata[tid + 4]; - sdata[tid] += sdata[tid + 2]; - sdata[tid] += sdata[tid + 1]; - } - - __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - float value=0; - while (i < n) { - value=g_idata[i]; //avoid reading twice - mySum += value*value; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if (tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - - __global__ void reduceSum(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - // float value=0; - 
while (i < n) { - mySum += g_idata[i]; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if (tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - - - - -// main function - void pocs_tv(float* img,float* dst,float alpha,const long* image_size, int maxIter, const GpuIds& gpuids){ - - - - - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("GD_TV:GPU","There are no available device(s) that support CUDA\n"); - } - // - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("minimizeTV:GD_TV:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); - } - - int dev; - - // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. 
- - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - - - - // %5 of free memory should be enough, we have almost no variables in these kernels - size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; - size_t mem_slice_image = sizeof(float)* image_size[0] * image_size[1] ; - size_t mem_size_image = sizeof(float)* total_pixels; - size_t mem_auxiliary = sizeof(float)* (total_pixels + MAXTHREADS - 1) / MAXTHREADS; - - // Decide how are we handling the distribution of computation - size_t mem_img_each_GPU; - - unsigned int buffer_length=2; - //Does everything fit in the GPU? - unsigned int slices_per_split; - - // if it is a thin problem (no need to split), just use one GPU - if (image_size[2]<4){deviceCount=1;} - - unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. - if(mem_GPU_global> 3*mem_size_image+3*(deviceCount-1)*mem_slice_image*buffer_length+mem_auxiliary){ - // We only need to split if we have extra GPUs - slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; - mem_img_each_GPU=mem_slice_image*((slices_per_split+buffer_length*2)); - }else{ - // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all - size_t mem_free=mem_GPU_global-mem_auxiliary; - - splits=(unsigned int)(ceil(((float)(3*mem_size_image)/(float)(deviceCount))/mem_free)); - // Now, there is an overhead here, as each splits should have 2 slices more, to accoutn for overlap of images. - // lets make sure these 2 slices fit, if they do not, add 1 to splits. 
- slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - - // if the new stuff does not fit in the GPU, it measn we are in the edge case where adding that extra slice will overflow memory - if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ - // one more split should do the job, as its an edge case. - splits++; - //recompute for later - slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - } - - - // How many EXTRA buffer slices should be able to fit in here??!?! - // Only do it if there are splits needed. - if(splits>1){ - mem_free=mem_GPU_global-(3*mem_img_each_GPU+mem_auxiliary); - unsigned int extra_buff=(mem_free/mem_slice_image); - buffer_length=(extra_buff/2)/3; // we need double whatever this results in, rounded down. - buffer_length=max(buffer_length,2);// minimum 2 - buffer_length=min(MAX_BUFFER,buffer_length); - - mem_img_each_GPU=mem_slice_image*(slices_per_split+buffer_length*2); - - }else{ - buffer_length=2; - } - - // Assert - if (mem_GPU_global< 3*mem_img_each_GPU+mem_auxiliary){ - mexErrMsgIdAndTxt("GD_TV:GPU","Assertion Failed. Logic behind splitting flawed! Please tell: ander.biguri@gmail.com\n"); - } - } - - - // Assert - - if ((slices_per_split+buffer_length*2)*image_size[0]*image_size[1]* sizeof(float)!= mem_img_each_GPU){ - mexErrMsgIdAndTxt("GD_TV:GPU","Assertion Failed. Memory needed calculation broken! 
Please tell: ander.biguri@gmail.com\n"); - } - - - - - - - float** d_image= (float**)malloc(deviceCount*sizeof(float*)); - float** d_dimgTV= (float**)malloc(deviceCount*sizeof(float*)); - float** d_norm2aux= (float**)malloc(deviceCount*sizeof(float*)); - float** d_norm2= (float**)malloc(deviceCount*sizeof(float*)); - - // allocate memory in each GPU - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - - cudaMalloc((void**)&d_image[dev] , mem_img_each_GPU); - cudaMemset( d_image[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_dimgTV[dev] , mem_img_each_GPU); - cudaMemset( d_dimgTV[dev],0 , mem_img_each_GPU); - cudaMalloc((void**)&d_norm2[dev] , slices_per_split*mem_slice_image); - cudaMemset( d_norm2[dev],0 , slices_per_split*mem_slice_image); - cudaMalloc((void**)&d_norm2aux[dev] , mem_auxiliary); - cudaMemset( d_norm2aux[dev],0 , mem_auxiliary); - cudaCheckErrors("Malloc error"); - - - } - unsigned long long buffer_pixels=buffer_length*image_size[0]*image_size[1]; - float* buffer; - if(splits>1){ - mexWarnMsgIdAndTxt("minimizeTV:GD_TV:Image_split","Your image can not be fully split between the available GPUs. The computation of minTV will be significantly slowed due to the image size.\nApproximated mathematics turned on for computational speed."); - }else{ - cudaMallocHost((void**)&buffer,buffer_length*image_size[0]*image_size[1]*sizeof(float)); - } - - - - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
- int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // splits>2 is completely empirical observation - if (isHostRegisterSupported & splits>2){ - cudaHostRegister(img ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); - - - - // Create streams - int nStream_device=2; - int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); - } - } - cudaCheckErrors("Stream creation fail"); - - - // For the reduction - - double totalsum_prev; - double totalsum; - float sum_curr_spl; - float * sumnorm2; - cudaMallocHost((void**)&sumnorm2,deviceCount*sizeof(float)); - - unsigned int curr_slices; - unsigned long long curr_pixels; - size_t linear_idx_start; - unsigned long long* offset_device=(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - unsigned long long* offset_host =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - unsigned long long* bytes_device =(unsigned long long*)malloc(deviceCount*sizeof(unsigned long long)); - bool is_first_chunk; - bool is_last_chunk; - for(unsigned int i=0;i1){ - totalsum_prev=0; - } - for(unsigned int sp=0;sp1 & i>0){ - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev]+offset_device[dev], dst+offset_host[dev] , bytes_device[dev]*sizeof(float), cudaMemcpyHostToDevice,stream[dev*nStream_device+1]); - - - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - } - 
cudaCheckErrors("Memcpy failure on multi split"); - - for(unsigned int ib=0; (ib<(buffer_length-1)) && ((i+ib)>>(d_image[dev],d_dimgTV[dev],(long)(curr_slices+buffer_length*2-1), image_size[1],image_size[0]); - - } - - - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split> >(d_norm2[dev], d_norm2aux[dev], total_pixels); - - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); - } - else { - cudaStreamSynchronize(stream[dev*nStream_device]); - cudaMemcpyAsync(&sumnorm2[dev], d_norm2aux[dev], sizeof(float), cudaMemcpyDeviceToHost,stream[dev*nStream_device+1]); - } - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Reduction error"); - - - // Accumulate the norm accross devices - sum_curr_spl=0; - // this is CPU code - for (dev = 0; dev < deviceCount; dev++){ - sum_curr_spl+=sumnorm2[dev]; - } - sum_curr_spl+=0.0000001f; // avoid division by zero - - // If we have more than one splits, lets use the result from prior calls - if(i>0 && splits>1){ - // this is already stored: - //totalsum=totalsum_prev; - }else{ - totalsum=sum_curr_spl; - } - - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_dimgTV[dev]+buffer_pixels,(float)sqrt(totalsum),total_pixels); - //MULTIPLY HYPERPARAMETER - multiplyArrayScalar<<<60,MAXTHREADS,0,stream[dev*nStream_device]>>>(d_dimgTV[dev]+buffer_pixels,alpha, total_pixels); - } - for (dev = 0; dev < deviceCount; dev++){ - 
cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Scalar operations error"); - - //SUBSTRACT GRADIENT - ////////////////////////////////////////////// - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_image[dev]+buffer_pixels,d_dimgTV[dev]+buffer_pixels, total_pixels); - } - } - - // Synchronize mathematics, make sure bounding pixels are correct - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - - if(splits==1){ - for(dev=0; dev0){ - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer, d_image[dev-1]+total_pixels+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost); - cudaSetDevice(gpuids[dev]); - cudaMemcpyAsync(d_image[dev],buffer, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice); - } - } - }else{ - - // We need to take it out :( - for(dev=0; dev2){ - cudaHostUnregister(img); - cudaHostUnregister(dst); - } - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Memory free"); - cudaDeviceReset(); - } - -void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ - size_t memfree; - size_t memtotal; - int deviceCount = gpuids.GetLength(); - for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); - if(dev==0) *mem_GPU_global=memfree; - if(memfree -#include -#include - -GpuIds::~GpuIds() { - free(m_piDeviceIds); m_piDeviceIds = nullptr; - m_iCount = 0; -} -GpuIds::GpuIds() : m_piDeviceIds (nullptr), m_iCount(0) { - -} -void GpuIds::SetIds(int iCount, int* piDeviceIds) { - if (iCount > 0 && piDeviceIds != 0) { - if (m_piDeviceIds) { - free(m_piDeviceIds); m_piDeviceIds = nullptr; - m_iCount = 0; - } - m_piDeviceIds = (int*)malloc(iCount * sizeof(int)); - if (m_piDeviceIds) { - for 
(int iI = 0; iI < iCount; ++iI) { - m_piDeviceIds[iI] = piDeviceIds[iI]; - } - m_iCount = iCount; - } - } -} - -int GpuIds::GetLength() const { - return m_iCount; -} -int& GpuIds::operator[](int iIndex){ - return m_piDeviceIds[iIndex]; -} -int GpuIds::operator[](int iIndex) const { - return m_piDeviceIds[iIndex]; -} - -void GpuIds::SetAllGpus(int iTotalDeviceCount) { - // Set all GPUs for compatibility - // Makeup valid GpuIds. - int* aiIds = nullptr; - if (iTotalDeviceCount == 0) { - (int*)malloc(iTotalDeviceCount*sizeof(int)); - for (int iI = 0; iI < iTotalDeviceCount; ++iI) { - aiIds[iI] = iI; - } - } - SetIds(iTotalDeviceCount, aiIds); - free(aiIds); aiIds = 0; -} - -bool GpuIds::AreEqualDevices() const { - int deviceCount = this->GetLength(); - const int devicenamelength = 256; // The length 256 is fixed by spec of cudaDeviceProp::name - char devicename[devicenamelength]; - cudaDeviceProp deviceProp; - for (int dev = 0; dev < deviceCount; dev++) { - // cudaSetDevice(m_piDeviceIds[dev]); - cudaGetDeviceProperties(&deviceProp, m_piDeviceIds[dev]); - if (dev>0) { - if (strcmp(devicename, deviceProp.name) != 0) { - return false; - } - } - memset(devicename, 0, devicenamelength); - strcpy(devicename, deviceProp.name); - } - return true; -} diff --git a/Common/CUDA/GpuIds.hpp.prehip b/Common/CUDA/GpuIds.hpp.prehip deleted file mode 100644 index e0223f86..00000000 --- a/Common/CUDA/GpuIds.hpp.prehip +++ /dev/null @@ -1,17 +0,0 @@ - -#ifndef GPUIDS_H -#define GPUIDS_H -struct GpuIds { - int* m_piDeviceIds; - int m_iCount; - ~GpuIds(); - GpuIds(); - void SetIds(int iCount, int* piDeviceIds); - int GetLength() const; - void SetAllGpus(int iTotalDeviceCount); - int& operator[](int iIndex); - int operator[](int iIndex) const; - bool AreEqualDevices() const; -}; -#endif - diff --git a/Common/CUDA/PICCS.cu.prehip b/Common/CUDA/PICCS.cu.prehip deleted file mode 100644 index 481ede08..00000000 --- a/Common/CUDA/PICCS.cu.prehip +++ /dev/null @@ -1,398 +0,0 @@ 
-/*------------------------------------------------------------------------- - * - * CUDA functions for Steepest descend in POCS-type algorithms. - * - * This file will iteratively minimize by stepest descend the total variation - * of the input image, with the parameters given, using GPUs. - * - * CODE by Ander Biguri - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - - - - -#define MAXTHREADS 1024 - -#include "PICCS.hpp" - - - - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("ERROR in: %s \n",msg);\ - mexErrMsgIdAndTxt("err",cudaGetErrorString(__err));\ - } \ -} while (0) - -// CUDA kernels -//https://stackoverflow.com/questions/21332040/simple-cuda-kernel-optimization/21340927#21340927 - __global__ void divideArrayScalar(float* vec,float scalar,const size_t n) - { - unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x; - for(; i= 0 && z= 0 && y= 0 && x= cols || y >= rows || z >= depth ) - return; - - float df[3] ={0,0,0}; - float dfi[3]={0,0,0}; // dfi== \partial f_{i+1,j,k} - float dfj[3]={0,0,0}; - float dfk[3]={0,0,0}; - gradient(f,df ,z ,y ,x , depth,rows,cols); - gradient(f,dfi ,z ,y ,x+1, depth,rows,cols); - gradient(f,dfj ,z ,y+1,x , depth,rows,cols); - gradient(f,dfk ,z+1,y ,x , depth,rows,cols); - float eps=0.000001; //% avoid division by zero - dftv[idx]=(df[0]+df[1]+df[2])/(sqrt(df[0] *df[0] +df[1] *df[1] +df[2] *df[2])+eps) - -dfi[2]/(sqrt(dfi[0]*dfi[0]+dfi[1]*dfi[1]+dfi[2]*dfi[2]) +eps) // I wish I coudl precompute this, but if I do then Id need to recompute the gradient. 
- -dfj[1]/(sqrt(dfj[0]*dfj[0]+dfj[1]*dfj[1]+dfj[2]*dfj[2]) +eps) - -dfk[0]/(sqrt(dfk[0]*dfk[0]+dfk[1]*dfk[1]+dfk[2]*dfk[2]) +eps); - - } - - __device__ void warpReduce(volatile float *sdata, size_t tid) { - sdata[tid] += sdata[tid + 32]; - sdata[tid] += sdata[tid + 16]; - sdata[tid] += sdata[tid + 8]; - sdata[tid] += sdata[tid + 4]; - sdata[tid] += sdata[tid + 2]; - sdata[tid] += sdata[tid + 1]; - } - - __global__ void reduceNorm2(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - float value=0; - while (i < n) { - value=g_idata[i]; //avoid reading twice - mySum += value*value; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if (tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - __global__ void reduceSum(float *g_idata, float *g_odata, size_t n){ - extern __shared__ volatile float sdata[]; - //http://stackoverflow.com/a/35133396/1485872 - size_t tid = threadIdx.x; - size_t i = blockIdx.x*blockDim.x + tid; - size_t gridSize = blockDim.x*gridDim.x; - float mySum = 0; - // float value=0; - while (i < n) { - mySum += g_idata[i]; - i += gridSize; - } - sdata[tid] = mySum; - __syncthreads(); - - if (tid < 512) - sdata[tid] += sdata[tid + 512]; - __syncthreads(); - if 
(tid < 256) - sdata[tid] += sdata[tid + 256]; - __syncthreads(); - - if (tid < 128) - sdata[tid] += sdata[tid + 128]; - __syncthreads(); - - if (tid < 64) - sdata[tid] += sdata[tid + 64]; - __syncthreads(); - - -#if (__CUDART_VERSION >= 9000) - if ( tid < 32 ) - { - mySum = sdata[tid] + sdata[tid + 32]; - for (int offset = warpSize/2; offset > 0; offset /= 2) { - mySum += __shfl_down_sync(0xFFFFFFFF, mySum, offset,32); - } - } -#else - if (tid < 32) { - warpReduce(sdata, tid); - mySum = sdata[0]; - } -#endif - if (tid == 0) g_odata[blockIdx.x] = mySum; - } - - -bool isnan_cuda(float* vec, size_t size){ - bool*d_nan; - bool h_nan; - cudaMalloc((void **)&d_nan, sizeof (bool)); - isnan_device<<<60,MAXTHREADS>>>(vec,size,d_nan); - cudaMemcpy(&h_nan, d_nan, sizeof(bool), cudaMemcpyDeviceToHost); - return h_nan; - -} - -// main function - void piccs_tv(const float* img,const float* prior, float* dst,float alpha,float ratio, const long* image_size, int maxIter, const GpuIds& gpuids){ - - - - - size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; - size_t mem_size = sizeof(float) * total_pixels; - - float *d_image,*d_prior,*d_dpiccsTV, *d_dimgTV,*d_aux_small,*d_aux_image, *d_norm2; - // memory for image - cudaMalloc(&d_image, mem_size); - cudaMalloc(&d_prior, mem_size); - - cudaCheckErrors("Malloc Image error"); - cudaMemcpy(d_image, img, mem_size, cudaMemcpyHostToDevice); - cudaMemcpy(d_prior, prior, mem_size, cudaMemcpyHostToDevice); - cudaCheckErrors("Memory Malloc and Memset: SRC"); - // memory for df - cudaMalloc(&d_dimgTV, mem_size); - cudaMalloc(&d_dpiccsTV, mem_size); - cudaCheckErrors("Memory Malloc and Memset: TV"); - cudaMalloc(&d_norm2, mem_size); - cudaCheckErrors("Memory Malloc and Memset: TV"); - cudaMalloc(&d_aux_image, mem_size); - cudaCheckErrors("Memory Malloc and Memset: TV"); - - // memory for L2norm auxiliar - cudaMalloc(&d_aux_small, sizeof(float)*(total_pixels + MAXTHREADS - 1) / MAXTHREADS); - cudaCheckErrors("Memory Malloc and 
Memset: NORMAux"); - - - - // For the gradient - dim3 blockGrad(10, 10, 10); - dim3 gridGrad((image_size[0]+blockGrad.x-1)/blockGrad.x, (image_size[1]+blockGrad.y-1)/blockGrad.y, (image_size[2]+blockGrad.z-1)/blockGrad.z); - - // For the reduction - float sumnorm2; - size_t dimblockRed = MAXTHREADS; - size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; - - - for(unsigned int i=0;i>>(d_image,d_dimgTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); - cudaCheckErrors("Gradient"); -// mexPrintf("Gradient is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); - - - multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,(1-ratio), total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("Multiplication error"); - - substractArrays<<<60,MAXTHREADS>>>(d_aux_image,d_prior, total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("Substraction error"); - - gradientTV<<>>(d_aux_image,d_dpiccsTV,image_size[2], image_size[1],image_size[0]); - cudaDeviceSynchronize(); - cudaCheckErrors("Gradient"); -// mexPrintf("Gradient piccs is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); - - multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dpiccsTV,ratio, total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("Multiplication error"); -// mexPrintf("Multiplication is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? "true" : "false"); - - - addArrays<<<60,MAXTHREADS>>>(d_dimgTV,d_dpiccsTV,total_pixels); - cudaDeviceSynchronize(); - //NOMRALIZE via reduction - //mexPrintf("Pre-norm2 is nan: %s\n",isnan_cuda(d_dimgTV,total_pixels) ? 
"true" : "false"); - cudaMemcpy(d_norm2, d_dimgTV, mem_size, cudaMemcpyDeviceToDevice); - cudaCheckErrors("Copy from gradient call error"); - reduceNorm2 << > >(d_norm2, d_aux_small, total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("reduce1"); - if (dimgridRed > 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float) >> >(d_aux_small, d_norm2, dimgridRed); - cudaDeviceSynchronize(); - cudaCheckErrors("reduce2"); - cudaMemcpy(&sumnorm2, d_norm2, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); - - } - else { - cudaMemcpy(&sumnorm2, d_aux_small, sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy"); - } -// mexPrintf("alpha/sqrt(sumnorm2): %f\n",alpha/sqrt(sumnorm2)); - //MULTIPLY HYPERPARAMETER sqrt(sumnorm2) - multiplyArrayScalar<<<60,MAXTHREADS>>>(d_dimgTV,alpha/sqrt(sumnorm2), total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("Multiplication error"); - //SUBSTRACT GRADIENT - substractArrays <<<60,MAXTHREADS>>>(d_image,d_dimgTV, total_pixels); - cudaDeviceSynchronize(); - cudaCheckErrors("Substraction error"); -// mexPrintf("Final update is nan: %s\n",isnan_cuda(d_image,total_pixels) ? "true" : "false"); -// mexPrintf("\n"); - sumnorm2=0; - } - - cudaCheckErrors("TV minimization"); - - cudaMemcpy(dst, d_image, mem_size, cudaMemcpyDeviceToHost); - cudaCheckErrors("Copy result back"); - - cudaFree(d_image); - cudaFree(d_dpiccsTV); - cudaFree(d_aux_image); - cudaFree(d_aux_small); - cudaFree(d_prior); - cudaFree(d_norm2); - - - cudaCheckErrors("Memory free"); - cudaDeviceReset(); - } - diff --git a/Common/CUDA/PICCS.hpp.prehip b/Common/CUDA/PICCS.hpp.prehip deleted file mode 100644 index e3592dbb..00000000 --- a/Common/CUDA/PICCS.hpp.prehip +++ /dev/null @@ -1,61 +0,0 @@ -/*------------------------------------------------------------------------- - * - * Header for CUDA functions for Steepest descend in POCS-type algorithms. 
- * - * This file has the required headers for POCS_TV.cu - * - * CODE by Ander Biguri - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - - - - -#ifndef GD_TV_HPP -#define GD_TV_HPP -#include "TIGRE_common.hpp" -#include "GpuIds.hpp" - -void piccs_tv(const float* img,const float* prior, float* dst,float alpha, float ratio, const long* image_size, int maxIter, const GpuIds& gpuids); - - -#endif \ No newline at end of file diff --git a/Common/CUDA/RandomNumberGenerator.cu.prehip b/Common/CUDA/RandomNumberGenerator.cu.prehip deleted file mode 100644 index d7d1224a..00000000 --- a/Common/CUDA/RandomNumberGenerator.cu.prehip +++ /dev/null @@ -1,193 +0,0 @@ -/*------------------------------------------------------------------------- - * - * CUDA functions for random number generator - * - * Adds noise of Poisson and normal distribution to the input. - * - * CODE by Tomoyuki SADAKANE - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - -#include -#include -#include -#include -#include - -#include "gpuUtils.hpp" -#include "RandomNumberGenerator.hpp" - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("RandomNumberGenerator:",cudaGetErrorString(__err));\ - } \ -} while (0) - - -__global__ void setup_kernel(curandState *state) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - /* Each thread gets same seed, a different sequence number, no offset */ - curand_init(1234, idx, 0, &state[idx]); -} - -__global__ void GeneratePoisson(curandState *state, const float* pfIn, size_t uiLen, float* pfOut) { - int idx = threadIdx.x + blockIdx.x * 
blockDim.x; - /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; - int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); - for (int iI = 0; iI < iIter; ++iI) { - size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; - if (uiPos < uiLen) { - /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); - pfOut[uiPos] = (float)uiPoisson; - } - } - /* Copy state back to global memory */ - state[idx] = localState; -} - -__global__ void GeneratePoissonAddGaussian(curandState *state, - const float* pfIn, - size_t uiLen, - float fGaussMu, - float fGaussSigma, - float* pfOut) -{ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - /* Copy state to local memory for efficiency */ - curandState localState = state[idx]; - int iIter = (uiLen + blockDim.x*gridDim.x - 1)/(blockDim.x*gridDim.x); - for (int iI = 0; iI < iIter; ++iI) { - size_t uiPos = (size_t)blockDim.x*gridDim.x*iI+idx; - if (uiPos < uiLen) { - /* Poisson */ - unsigned int uiPoisson = curand_poisson(&localState, pfIn[uiPos]); - /* Gaussian */ - float fNormal = curand_normal(&localState) * fGaussSigma + fGaussMu; - pfOut[uiPos] = fNormal + (float)uiPoisson; - } - } - /* Copy state back to global memory */ - state[idx] = localState; -} - - -template -void GetMinMax(const T_value* pfIn, size_t uiLen, T_value& tvMin, T_value& tvMax) { - tvMin = pfIn[0]; - tvMax = pfIn[0]; - T_value tvVal; - for (int iI = 1; iI < uiLen; ++iI) { - tvVal = pfIn[iI]; - if (tvMax < tvVal) { tvMax = tvVal; continue;} - if (tvMin > tvVal) { tvMin = tvVal; continue;} - } -} -void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpuids) { - // printf("poisson_1d(pfIn = %p, uiLen = %zd, pfOut = %p)\n", pfIn, uiLen, pfOut); - float* d_pfIn = nullptr; - float* d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - 
cudaCheckErrors("poisson_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_1d fail cudaMemcpy 1"); - - // float fMin, fMax; - // GetMinMax(pfIn, uiLen, fMin, fMax); - // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; - const int kiBlockDim = 1024; // Threads per Block - const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; - cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); - cudaCheckErrors("poisson_1d fail cudaMalloc 3"); - setup_kernel<<>>(curandStates); - GeneratePoisson<<>>(curandStates, d_pfIn, uiLen, d_pfOut); - cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_1d fail cudaMemcpy 2"); - // GetMinMax(pfOut, uiLen, fMin, fMax); - // printf("fMin, fMax = %f, %f\n", fMin, fMax); - - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; -} - -void poisson_gaussian_1d(const float* pfIn, - size_t uiLen, - float fGaussMu, - float fGaussSigma, - float* pfOut, - GpuIds& gpuids) -{ - // printf("poisson_gaussian_1d(pfIn = %p, uiLen = %zd, fGaussMu = %+f, fGaussSigma = %f, pfOut = %p)\n", pfIn, uiLen, fGaussMu, fGaussSigma, pfOut); - float* d_pfIn = nullptr; - float* d_pfOut = nullptr; - cudaMalloc((void **)&d_pfIn, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 1"); - cudaMalloc((void **)&d_pfOut, uiLen * sizeof(float)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 2"); - cudaMemcpy(d_pfIn, pfIn, uiLen*sizeof(float), cudaMemcpyHostToDevice); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 1"); - - // float fMin, fMax; - // GetMinMax(pfIn, uiLen, fMin, fMax); - // printf("fMin, fMax = %f, %f\n", fMin, fMax); - curandState *curandStates = nullptr; - const int kiBlockDim = 64; // Threads per Block - const int kiGridDim = 64;//(uiLen+kiBlockDim-1)/kiBlockDim; 
- cudaMalloc((void **)&curandStates, kiGridDim * kiBlockDim * sizeof(curandState)); - cudaCheckErrors("poisson_gaussian_1d fail cudaMalloc 3"); - setup_kernel<<>>(curandStates); - GeneratePoissonAddGaussian<<>>(curandStates, d_pfIn, uiLen, fGaussMu, fGaussSigma, d_pfOut); - cudaMemcpy(pfOut, d_pfOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrors("poisson_gaussian_1d fail cudaMemcpy 2"); - // GetMinMax(pfOut, uiLen, fMin, fMax); - // printf("fMin, fMax = %f, %f\n", fMin, fMax); - - - cudaFree(d_pfIn); d_pfIn = nullptr; - cudaFree(d_pfOut); d_pfOut = nullptr; - cudaFree(curandStates); curandStates = nullptr; -} diff --git a/Common/CUDA/RandomNumberGenerator.hpp.prehip b/Common/CUDA/RandomNumberGenerator.hpp.prehip deleted file mode 100644 index 4ba68d8d..00000000 --- a/Common/CUDA/RandomNumberGenerator.hpp.prehip +++ /dev/null @@ -1,49 +0,0 @@ -/*------------------------------------------------------------------------- - * - * Header CUDA functions for random number generator - * - * Adds noise of Poisson and normal distribution to the input. - * - * CODE by Tomoyuki SADAKANE - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - -#include "TIGRE_common.hpp" -#include "GpuIds.hpp" -void poisson_1d(const float* pfIn, size_t uiLen, float* pfOut, const GpuIds& gpuids); -void poisson_gaussian_1d(const float* pfPoissonL, size_t uiLen, float fGaussMu, float fGaussSigma, float* pfOut, GpuIds& gpuids); diff --git a/Common/CUDA/Siddon_projection.cu.prehip b/Common/CUDA/Siddon_projection.cu.prehip deleted file mode 100644 index 2a025f8c..00000000 --- a/Common/CUDA/Siddon_projection.cu.prehip +++ /dev/null @@ -1,859 +0,0 @@ -/*------------------------------------------------------------------------- - * - * CUDA functions for ray-voxel intersection based projection - * - * This file has the necessary fucntiosn to perform X-ray CBCT projection - * operation given a geaometry, 
angles and image. It usesthe so-called - * Jacobs algorithm to compute efficiently the length of the x-rays over - * voxel space. - * - * CODE by Ander Biguri - * Sepideh Hatamikia (arbitrary rotation) - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - -#include -#include -#include -#include "Siddon_projection.hpp" -#include "TIGRE_common.hpp" -#include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("Ax:Siddon_projection",cudaGetErrorString(__err));\ - } \ -} while (0) - - -#define MAXTREADS 1024 -#define PROJ_PER_BLOCK 9 -#define PIXEL_SIZE_BLOCK 9 - /*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * --->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ - - void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc); - -__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device - - -__global__ void vecAddInPlace(float *a, float *b, unsigned long n) -{ - int idx = blockIdx.x*blockDim.x+threadIdx.x; - // Make sure we do not go out of bounds - if (idx < n) - 
a[idx] = a[idx] + b[idx]; -} - -__global__ void kernelPixelDetector( Geometry geo, - float* detector, - const int currProjSetNumber, - const int totalNoOfProjections, - cudaTextureObject_t tex){ - - - unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; - unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long projNumber=threadIdx.z; - - - if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) - return; - -#if IS_FOR_MATLAB_TIGRE - size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#else - size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#endif - unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array (for a given GPU) - - if(indAlpha>=totalNoOfProjections) - return; - - Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaU = projParamsArrayDev[4*projNumber+1]; - Point3D deltaV = projParamsArrayDev[4*projNumber+2]; - Point3D source = projParamsArrayDev[4*projNumber+3]; - - /////// Get coordinates XYZ of pixel UV - unsigned long pixelV = geo.nDetecV-v-1; - unsigned long pixelU = u; - Point3D pixel1D; - pixel1D.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); - pixel1D.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); - pixel1D.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); - /////// - // Siddon's ray-voxel intersection, optimized as in doi=10.1.1.55.7516 - ////// - // Also called Jacobs algorithms - Point3D ray; - // vector of Xray - ray.x=pixel1D.x-source.x; - ray.y=pixel1D.y-source.y; - ray.z=pixel1D.z-source.z; - float eps=0.001; - ray.x=(fabsf(ray.x) Nvoxel+1 - - axm=fminf(__fdividef(-source.x,ray.x),__fdividef(geo.nVoxelX-source.x,ray.x)); - 
aym=fminf(__fdividef(-source.y,ray.y),__fdividef(geo.nVoxelY-source.y,ray.y)); - azm=fminf(__fdividef(-source.z,ray.z),__fdividef(geo.nVoxelZ-source.z,ray.z)); - axM=fmaxf(__fdividef(-source.x,ray.x),__fdividef(geo.nVoxelX-source.x,ray.x)); - ayM=fmaxf(__fdividef(-source.y,ray.y),__fdividef(geo.nVoxelY-source.y,ray.y)); - azM=fmaxf(__fdividef(-source.z,ray.z),__fdividef(geo.nVoxelZ-source.z,ray.z)); - - float am=fmaxf(fmaxf(axm,aym),azm); - float aM=fminf(fminf(axM,ayM),azM); - - // line intersects voxel space -> am=aM) - detector[idx]=0; - - // Compute max/min image INDEX for intersection eq(11-19) - // Discussion about ternary operator in CUDA: https://stackoverflow.com/questions/7104384/in-cuda-why-is-a-b010-more-efficient-than-an-if-else-version - float imin,imax,jmin,jmax,kmin,kmax; - // for X - if( source.x(tex, i, j, k); - i=i+iu; - ac=ax; - ax+=axu; - }else if(ay==aminc){ - sum+=(ay-ac)*tex3D(tex, i, j, k); - j=j+ju; - ac=ay; - ay+=ayu; - }else if(az==aminc){ - sum+=(az-ac)*tex3D(tex, i, j, k); - k=k+ku; - ac=az; - az+=azu; - } - aminc=fminf(fminf(ax,ay),az); - } - detector[idx]=sum*maxlength; -} - - -int siddon_ray_projection(float* img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("Ax:Siddon_projection:GPUselect","There are no available device(s) that support CUDA\n"); - } - // - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("Ax:Siddon_projection:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters 
used. If the code errors you might need to change the way GPU selection is performed."); - } - int dev; - - // Check free memory - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - - size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); - size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); - - // Does everything fit in the GPUs? - const bool fits_in_memory = mem_image+2*PROJ_PER_BLOCK*mem_proj= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to - // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); - } -#endif - cudaCheckErrors("Error pinning memory"); - - - - // auxiliary variables - Point3D source, deltaU, deltaV, uvOrigin; - Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); - cudaCheckErrors("Error allocating auxiliary constant memory"); - - // Create Streams for overlapping memcopy and compute - int nStreams=deviceCount*2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; - - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i+dev*2]); - - } - } - cudaCheckErrors("Stream creation fail"); - - int nangles_device=(nangles+deviceCount-1)/deviceCount; - int nangles_last_device=(nangles-(deviceCount-1)*nangles_device); - unsigned int noOfKernelCalls = 
(nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK - unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. - int projection_this_block; - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; - - for (unsigned int sp=0;sp=nangles) - break; - if ((i*PROJ_PER_BLOCK+j)>=nangles_device) - break; - geoArray[sp].alpha=angles[proj_global*3]; - geoArray[sp].theta=angles[proj_global*3+1]; - geoArray[sp].psi =angles[proj_global*3+2]; - - - //precomute distances for faster execution - //Precompute per angle constant stuff for speed - computeDeltas_Siddon(geoArray[sp],proj_global, &uvOrigin, &deltaU, &deltaV, &source); - //Ray tracing! - projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection - projParamsArrayHost[4*j+1]=deltaU; - projParamsArrayHost[4*j+2]=deltaV; - projParamsArrayHost[4*j+3]=source; - - } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*2]); - cudaStreamSynchronize(stream[dev*2]); - cudaCheckErrors("kernel fail"); - kernelPixelDetector<<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); - } - - - // Now that the computation is happening, we need to either prepare the memory for - // combining of the projections (splits>1) and start removing previous results. - - - // If our image does not fit in memory then we need to make sure we accumulate previous results too. 
- // This is done in 2 steps: - // 1)copy previous results back into GPU - // 2)accumulate with current results - // The code to take them out is the same as when there are no splits needed - if( !fits_in_memory&&sp>0) - { - // 1) grab previous results and put them in the auxiliary variable dProjection_accum - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - - // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... - if(i+1==noOfKernelCalls) //is it the last block? - projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - else - projection_this_block=PROJ_PER_BLOCK; - - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); - } - // 2) take the results from current compute call and add it to the code in execution. - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - - // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... - if(i+1==noOfKernelCalls) //is it the last block? 
- projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - else - projection_this_block=PROJ_PER_BLOCK; - - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished - vecAddInPlace<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); - } - } // end accumulation case, where the image needs to be split - - // Now, lets get out the projections from the previous execution of the kernels. - if (i>0){ - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on previous set on this GPU - proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; - if (dev+1==deviceCount) { //is it the last device? - // projections assigned to this device is >=nangles_device-(deviceCount-1) and < nangles_device - if (i-1 < noOfKernelCallsLastDev) { - // The previous set(block) was not empty. - projection_this_block=min(PROJ_PER_BLOCK, nangles-proj_global); - } - else { - // The previous set was empty. - // This happens if deviceCount > PROJ_PER_BLOCK+1. - // e.g. PROJ_PER_BLOCK = 9, deviceCount = 11, nangles = 199. - // e.g. PROJ_PER_BLOCK = 1, deviceCount = 3, nangles = 7. - break; - } - } - else { - projection_this_block=PROJ_PER_BLOCK; - } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); - } - } - // Make sure Computation on kernels has finished before we launch the next batch. 
- for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); - } - } - - - // We still have the last set of projections to get out of GPUs - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - // How many projections are left here? - projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. - cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); - } - // Make sure everyone has done their bussiness before the next image split: - cudaDeviceSynchronize(); - } // End image split loop. 
- - cudaCheckErrors("Main loop fail"); - /////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////// - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); - } - delete[] texImg; texImg = 0; - delete[] d_cuArrTex; d_cuArrTex = 0; - // Freeing Stage - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); - - } - free(dProjection); - - if(!fits_in_memory){ - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); - - } - free(dProjection_accum); - } - freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); - - - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & (splits>1 |deviceCount>1)){ - cudaHostUnregister(img); - } - cudaCheckErrors("cudaFree fail"); -#endif - //cudaDeviceReset(); - return 0; -} - - - - -void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool alloc) -{ - //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - const unsigned int num_devices = gpuids.GetLength(); - if(alloc){ - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); - } - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, 
extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[dev]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); - - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - cudaCheckErrors("Texture object creation fail"); -} - -/* This code generates the geometries needed to split the image properly in - * cases where the entire image does not fit in the memory of the GPU - **/ -void splitImage(unsigned int splits,Geometry geo,Geometry* geoArray, unsigned int nangles){ - - unsigned long splitsize=(geo.nVoxelZ+splits-1)/splits;// ceil if not divisible - for(unsigned int sp=0;spx; - auxPoint.y=point->y; - auxPoint.z=point->z; - - point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - 
cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->z=-sin(geo.dPitch[i])*auxPoint.x - +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y - +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; - -} -void eulerZYZ(Geometry geo, Point3D* point){ - Point3D auxPoint; - auxPoint.x=point->x; - auxPoint.y=point->y; - auxPoint.z=point->z; - - point->x=(+cos(geo.alpha)*cos(geo.theta)*cos(geo.psi)-sin(geo.alpha)*sin(geo.psi))*auxPoint.x+ - (-cos(geo.alpha)*cos(geo.theta)*sin(geo.psi)-sin(geo.alpha)*cos(geo.psi))*auxPoint.y+ - cos(geo.alpha)*sin(geo.theta)*auxPoint.z; - - point->y=(+sin(geo.alpha)*cos(geo.theta)*cos(geo.psi)+cos(geo.alpha)*sin(geo.psi))*auxPoint.x+ - (-sin(geo.alpha)*cos(geo.theta)*sin(geo.psi)+cos(geo.alpha)*cos(geo.psi))*auxPoint.y+ - sin(geo.alpha)*sin(geo.theta)*auxPoint.z; - - point->z=-sin(geo.theta)*cos(geo.psi)*auxPoint.x+ - sin(geo.theta)*sin(geo.psi)*auxPoint.y+ - cos(geo.theta)*auxPoint.z; - - -} -//______________________________________________________________________________ -// -// Function: freeGeoArray -// -// Description: Frees the memory from the geometry array for multiGPU. -//______________________________________________________________________________ -void freeGeoArray(unsigned int splits,Geometry* geoArray){ - for(unsigned int sp=0;sp -#include -#include -#include "Siddon_projection_parallel.hpp" -#include "TIGRE_common.hpp" -#include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:CUDA:Ax",cudaGetErrorString(__err));\ - } \ -} while (0) - - -// Declare the texture reference. 
-void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); - - -#define MAXTREADS 1024 -#define PROJ_PER_BLOCK 9 -#define PIXEL_SIZE_BLOCK 9 -/*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * --->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ - - -__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device - - -__global__ void kernelPixelDetector_parallel( Geometry geo, - float* detector, const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex){ - - unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; - unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long projNumber=threadIdx.z; - - if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) - return; - - unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array - - -#if IS_FOR_MATLAB_TIGRE - size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#else - size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#endif - - if(indAlpha>=totalNoOfProjections) - return; - - Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaU = projParamsArrayDev[4*projNumber+1]; - Point3D deltaV = projParamsArrayDev[4*projNumber+2]; - Point3D source = projParamsArrayDev[4*projNumber+3]; - - - /////// Get coordinates XYZ of pixel UV - unsigned long pixelV = geo.nDetecV-v-1; - 
unsigned long pixelU = u; - Point3D pixel1D; - pixel1D.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); - pixel1D.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); - pixel1D.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); - - - source.x=(source.x+pixelU*deltaU.x+pixelV*deltaV.x); - source.y=(source.y+pixelU*deltaU.y+pixelV*deltaV.y); - source.z=(source.z+pixelU*deltaU.z+pixelV*deltaV.z); - /////// - // Siddon's ray-voxel intersection, optimized as in doi=10.1.1.55.7516 - ////// - Point3D ray; - // vector of Xray - ray.x=pixel1D.x-source.x; - ray.y=pixel1D.y-source.y; - ray.z=pixel1D.z-source.z; - // This variables are ommited because - // bx,by,bz ={0,0,0} - // dx,dy,dz ={1,1,1} - // compute parameter values for x-ray parametric equation. eq(3-10) - float axm,aym,azm; - float axM,ayM,azM; - - /************************************** - * - * - * Problem. In paralel beam, often ray.y or ray.x=0; - * This leads to infinities progpagating and breaking everything. - * - * We need to fix it. - * - ***************************************/ - - // In the paper Nx= number of X planes-> Nvoxel+1 - axm=fminf(-source.x/ray.x,(geo.nVoxelX-source.x)/ray.x); - aym=fminf(-source.y/ray.y,(geo.nVoxelY-source.y)/ray.y); -// azm=min(-source.z/ray.z,(geo.nVoxelZ-source.z)/ray.z); - axM=fmaxf(-source.x/ray.x,(geo.nVoxelX-source.x)/ray.x); - ayM=fmaxf(-source.y/ray.y,(geo.nVoxelY-source.y)/ray.y); -// azM=max(-source.z/ray.z,(geo.nVoxelZ-source.z)/ray.z); - float am=(fmaxf(axm,aym)); - float aM=(fminf(axM,ayM)); - - // line intersects voxel space -> am=aM) - detector[idx]=0.0f; - - // Compute max/min image INDEX for intersection eq(11-19) - // Discussion about ternary operator in CUDA: https://stackoverflow.com/questions/7104384/in-cuda-why-is-a-b010-more-efficient-than-an-if-else-version - float imin,imax,jmin,jmax; - // for X - if( source.x(tex, i, j, k);//(ax-ac)* - i=i+iu; - ac=ax; - ax+=axu; - }else if(ay==aminc){ - sum+=(ay-ac)*tex3D(tex, i, j, k);//(ay-ac)* - j=j+ju; - ac=ay; - 
ay+=ayu; -// }else if(az==aminc){ -// sum+=(az-ac)*tex3D(tex, i+0.5, j+0.5, k+0.5); -// k=k+ku; -// ac=az; -// az+=azu; - } - aminc=fminf(ay,ax); - } - detector[idx]=maxlength*sum; -} - - -int siddon_ray_projection_parallel(float* img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ - - - - - - size_t num_bytes = (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)PROJ_PER_BLOCK* (size_t)sizeof(float); - float** dProjection=(float **)malloc(2*sizeof(float *)); - for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); - } - int nStreams=2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); - - for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); - } - - - - // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg =(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); - - CreateTextureParallel(img,geo,&d_cuArrTex[0], &texImg [0],stream); - cudaCheckErrors("Texture allocation fail"); - //Done! Image put into texture memory. - - - - Point3D source, deltaU, deltaV, uvOrigin; - - - Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); - - // 16x16 gave the best performance empirically - // Funnily that makes it compatible with most GPUs..... 
- int divU,divV,divangle; - divU=PIXEL_SIZE_BLOCK; - divV=PIXEL_SIZE_BLOCK; - - dim3 numBlocks((geo.nDetecU+divU-1)/divU,(geo.nDetecV+divV-1)/divV,1); - - dim3 threadsPerBlock(divU,divV,PROJ_PER_BLOCK); - - unsigned int proj_global; - unsigned int noOfKernelCalls = (nangles+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK - unsigned int i; - for ( i=0; i=nangles) - break; - geo.alpha=angles[proj_global*3]; - geo.theta=angles[proj_global*3+1]; - geo.psi =angles[proj_global*3+2]; - if(geo.alpha==0.0 || abs(geo.alpha-1.5707963267949)<0.0000001){ - geo.alpha=geo.alpha+1.1920929e-07; - } - - //precomute distances for faster execution - //Precompute per angle constant stuff for speed - computeDeltas_Siddon_parallel(geo,geo.alpha,proj_global, &uvOrigin, &deltaU, &deltaV, &source); - //Ray tracing! - projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection - projParamsArrayHost[4*j+1]=deltaU; - projParamsArrayHost[4*j+2]=deltaV; - projParamsArrayHost[4*j+3]=source; - - } - - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); - kernelPixelDetector_parallel<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); - // copy result to host - if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); - } - cudaDeviceSynchronize(); - - int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); - - - - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); - free(texImg); texImg = 0; - free(d_cuArrTex); d_cuArrTex = 0; - cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); - 
free(dProjection); - cudaFreeHost(projParamsArrayHost); - cudaCheckErrors("cudaFree d_imagedata fail"); - - - for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); - } -// cudaDeviceReset(); - return 0; -} - - - -/* This code precomputes The location of the source and the Delta U and delta V (in the warped space) - * to compute the locations of the x-rays. While it seems verbose and overly-optimized, - * it does saves about 30% of each of the kernel calls. Thats something! - **/ -void computeDeltas_Siddon_parallel(Geometry geo, float angles,int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source){ - Point3D S; - - S.x =geo.DSO[i]; S.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); S.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - - //End point - Point3D P,Pu0,Pv0; - - P.x =-(geo.DSD[i]-geo.DSO[i]); P.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); P.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - Pu0.x=-(geo.DSD[i]-geo.DSO[i]); Pu0.y= geo.dDetecU*(1-((float)geo.nDetecU/2)+0.5); Pu0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - Pv0.x=-(geo.DSD[i]-geo.DSO[i]); Pv0.y= geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); Pv0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-1); - // Geometric trasnformations: - P.x=0;Pu0.x=0;Pv0.x=0; - - // Roll pitch yaw - rollPitchYaw(geo,i,&P); - rollPitchYaw(geo,i,&Pu0); - rollPitchYaw(geo,i,&Pv0); - //Now lets translate the points where they should be: - P.x=P.x-(geo.DSD[i]-geo.DSO[i]); - Pu0.x=Pu0.x-(geo.DSD[i]-geo.DSO[i]); - Pv0.x=Pv0.x-(geo.DSD[i]-geo.DSO[i]); - - S.x=0; - // Roll pitch yaw - rollPitchYaw(geo,i,&S); - //Now lets translate the points where they should be: - S.x=S.x+geo.DSO[i]; - - //1: Offset detector - - //P.x - P.y =P.y +geo.offDetecU[i]; P.z =P.z +geo.offDetecV[i]; - Pu0.y=Pu0.y+geo.offDetecU[i]; Pu0.z=Pu0.z+geo.offDetecV[i]; - Pv0.y=Pv0.y+geo.offDetecU[i]; Pv0.z=Pv0.z+geo.offDetecV[i]; - //S doesnt need to chagne - - - //3: Rotate (around z)! 
- Point3D Pfinal, Pfinalu0, Pfinalv0; - - Pfinal.x =P.x*cos(geo.alpha)-P.y*sin(geo.alpha); Pfinal.y =P.y*cos(geo.alpha)+P.x*sin(geo.alpha); Pfinal.z =P.z; - Pfinalu0.x=Pu0.x*cos(geo.alpha)-Pu0.y*sin(geo.alpha); Pfinalu0.y=Pu0.y*cos(geo.alpha)+Pu0.x*sin(geo.alpha); Pfinalu0.z=Pu0.z; - Pfinalv0.x=Pv0.x*cos(geo.alpha)-Pv0.y*sin(geo.alpha); Pfinalv0.y=Pv0.y*cos(geo.alpha)+Pv0.x*sin(geo.alpha); Pfinalv0.z=Pv0.z; - - Point3D S2; - S2.x=S.x*cos(geo.alpha)-S.y*sin(geo.alpha); - S2.y=S.y*cos(geo.alpha)+S.x*sin(geo.alpha); - S2.z=S.z; - - //2: Offset image (instead of offseting image, -offset everything else) - - Pfinal.x =Pfinal.x-geo.offOrigX[i]; Pfinal.y =Pfinal.y-geo.offOrigY[i]; Pfinal.z =Pfinal.z-geo.offOrigZ[i]; - Pfinalu0.x=Pfinalu0.x-geo.offOrigX[i]; Pfinalu0.y=Pfinalu0.y-geo.offOrigY[i]; Pfinalu0.z=Pfinalu0.z-geo.offOrigZ[i]; - Pfinalv0.x=Pfinalv0.x-geo.offOrigX[i]; Pfinalv0.y=Pfinalv0.y-geo.offOrigY[i]; Pfinalv0.z=Pfinalv0.z-geo.offOrigZ[i]; - S2.x=S2.x-geo.offOrigX[i]; S2.y=S2.y-geo.offOrigY[i]; S2.z=S2.z-geo.offOrigZ[i]; - - // As we want the (0,0,0) to be in a corner of the image, we need to translate everything (after rotation); - Pfinal.x =Pfinal.x+geo.sVoxelX/2; Pfinal.y =Pfinal.y+geo.sVoxelY/2; Pfinal.z =Pfinal.z +geo.sVoxelZ/2; - Pfinalu0.x=Pfinalu0.x+geo.sVoxelX/2; Pfinalu0.y=Pfinalu0.y+geo.sVoxelY/2; Pfinalu0.z=Pfinalu0.z+geo.sVoxelZ/2; - Pfinalv0.x=Pfinalv0.x+geo.sVoxelX/2; Pfinalv0.y=Pfinalv0.y+geo.sVoxelY/2; Pfinalv0.z=Pfinalv0.z+geo.sVoxelZ/2; - S2.x =S2.x+geo.sVoxelX/2; S2.y =S2.y+geo.sVoxelY/2; S2.z =S2.z +geo.sVoxelZ/2; - - //4. 
Scale everything so dVoxel==1 - Pfinal.x =Pfinal.x/geo.dVoxelX; Pfinal.y =Pfinal.y/geo.dVoxelY; Pfinal.z =Pfinal.z/geo.dVoxelZ; - Pfinalu0.x=Pfinalu0.x/geo.dVoxelX; Pfinalu0.y=Pfinalu0.y/geo.dVoxelY; Pfinalu0.z=Pfinalu0.z/geo.dVoxelZ; - Pfinalv0.x=Pfinalv0.x/geo.dVoxelX; Pfinalv0.y=Pfinalv0.y/geo.dVoxelY; Pfinalv0.z=Pfinalv0.z/geo.dVoxelZ; - S2.x =S2.x/geo.dVoxelX; S2.y =S2.y/geo.dVoxelY; S2.z =S2.z/geo.dVoxelZ; - - - - //5. apply COR. Wherever everything was, now its offesetd by a bit - float CORx, CORy; - CORx=-geo.COR[i]*sin(geo.alpha)/geo.dVoxelX; - CORy= geo.COR[i]*cos(geo.alpha)/geo.dVoxelY; - Pfinal.x+=CORx; Pfinal.y+=CORy; - Pfinalu0.x+=CORx; Pfinalu0.y+=CORy; - Pfinalv0.x+=CORx; Pfinalv0.y+=CORy; - S2.x+=CORx; S2.y+=CORy; - - // return - - *uvorigin=Pfinal; - - deltaU->x=Pfinalu0.x-Pfinal.x; - deltaU->y=Pfinalu0.y-Pfinal.y; - deltaU->z=Pfinalu0.z-Pfinal.z; - - deltaV->x=Pfinalv0.x-Pfinal.x; - deltaV->y=Pfinalv0.y-Pfinal.y; - deltaV->z=Pfinalv0.z-Pfinal.z; - - *source=S2; -} -void CreateTextureParallel(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - - - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); - - - cudaMemcpy3DParms copyParams = {0}; - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[0]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); - - - //Array creation End - - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - 
memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModePoint; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); - -} - -#ifndef PROJECTION_HPP - -float maxDistanceCubeXY(Geometry geo, float alpha,int i){ - /////////// - // Compute initial "t" so we access safely as less as out of bounds as possible. - ////////// - - - float maxCubX,maxCubY; - // Forgetting Z, compute max distance: diagonal+offset - maxCubX=(geo.sVoxelX/2+ abs(geo.offOrigX[i]))/geo.dVoxelX; - maxCubY=(geo.sVoxelY/2+ abs(geo.offOrigY[i]))/geo.dVoxelY; - - return geo.DSO[i]/geo.dVoxelX-sqrt(maxCubX*maxCubX+maxCubY*maxCubY); - -} - -#endif diff --git a/Common/CUDA/Siddon_projection_parallel.hpp.prehip b/Common/CUDA/Siddon_projection_parallel.hpp.prehip deleted file mode 100644 index c9c6fc77..00000000 --- a/Common/CUDA/Siddon_projection_parallel.hpp.prehip +++ /dev/null @@ -1,65 +0,0 @@ -/*------------------------------------------------------------------------- - * - * Header CUDA functions for ray-voxel intersection based projection - * - * - * CODE by Ander Biguri - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. 
Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - - -#include "ray_interpolated_projection.hpp" -#include "types_TIGRE.hpp" -#include "GpuIds.hpp" - -#ifndef PROJECTION_PARALLEL_HPP_SIDDON -#define PROJECTION_PARALLEL_HPP_SIDDON -int siddon_ray_projection_parallel(float * img, Geometry geo, float** result,float const * const alphas,int nalpha, const GpuIds& gpuids); - -//double computeMaxLength(Geometry geo, double alpha); -void computeDeltas_Siddon_parallel(Geometry geo, float alpha,int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source); - -//double maxDistanceCubeXY(Geometry geo, double alpha,int i); - -// below, not used -//Geometry nomralizeGeometryImage(Geometry geo); -#endif \ No newline at end of file diff --git a/Common/CUDA/TIGRE_common.cpp.prehip b/Common/CUDA/TIGRE_common.cpp.prehip deleted file mode 100644 index cf98e4b9..00000000 --- a/Common/CUDA/TIGRE_common.cpp.prehip +++ /dev/null @@ -1,20 +0,0 @@ -#if defined(IS_FOR_PYTIGRE) -#include -#include -#include -#include "TIGRE_common.hpp" -void mexPrintf(const char* format, ...) 
{ - PRINT_HERE(""); - va_list argpointer; - va_start(argpointer, format); - vprintf(format, argpointer); - va_end(argpointer); -} -void mexErrMsgIdAndTxt(const char* pcTag, const char* pcMsg) { - PRINT_HERE("%s %s\n", pcTag, pcMsg); - exit(1); -} -void mexWarnMsgIdAndTxt(const char* pcTag, const char* pcMsg) { - PRINT_HERE("%s %s\n", pcTag, pcMsg); -} -#endif // IS_FOR_PYTIGRE diff --git a/Common/CUDA/TIGRE_common.hpp.prehip b/Common/CUDA/TIGRE_common.hpp.prehip deleted file mode 100644 index faf8d7ab..00000000 --- a/Common/CUDA/TIGRE_common.hpp.prehip +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _COMMON_HPP_20201017_ -#define _COMMON_HPP_20201017_ - -#define STRINGIFY(n) #n -#define TOSTRING(n) STRINGIFY(n) -#define __HERE__ __FILE__ " (" TOSTRING(__LINE__) "): " -#define PRINT_HERE printf(__HERE__);printf -// #define PRINT_HERE (void*)0 - -#if defined(IS_FOR_PYTIGRE) -#ifndef IS_FOR_MATLAB_TIGRE - #define IS_FOR_MATLAB_TIGRE 0 -#endif // IS_FOR_MATLAB_TIGRE -void mexPrintf(const char*, ...); -void mexErrMsgIdAndTxt(const char* pcTag, const char* pcMsg); -void mexWarnMsgIdAndTxt(const char* pcTag, const char* pcMsg); -#else -#ifndef IS_FOR_MATLAB_TIGRE - #define IS_FOR_MATLAB_TIGRE 1 -#endif // IS_FOR_MATLAB_TIGRE -#include "mex.h" -#include "tmwtypes.h" -#endif // IS_TIGRE_FOR_PYTHON -#endif // _COMMON_HPP_20201017_ diff --git a/Common/CUDA/errors.hpp.prehip b/Common/CUDA/errors.hpp.prehip deleted file mode 100644 index 05518b20..00000000 --- a/Common/CUDA/errors.hpp.prehip +++ /dev/null @@ -1,10 +0,0 @@ -#define CUDA_SUCCESS 0 -#define ERR_CUDA 1 - -#define ERR_NO_CAPABLE_DEVICES 2 -#define ERR_NO_FREE_DEVICES 3 -#define ERR_BAD_ASSERT 4 -#define ERR_ASSERT_FAIL 5 - - - diff --git a/Common/CUDA/gpuUtils.cu.prehip b/Common/CUDA/gpuUtils.cu.prehip deleted file mode 100644 index 8f2754e4..00000000 --- a/Common/CUDA/gpuUtils.cu.prehip +++ /dev/null @@ -1,70 +0,0 @@ - -#include "gpuUtils.hpp" -#include -#include -#include -#include - -int GetGpuIdArray(const char* 
kacGPUName, int* piDeviceIds, int iIdCountMax, char* pcMessage) { - if (pcMessage) { - for (int iI = 0; iI < 65535; ++iI) { - pcMessage[iI] = '\0'; - } - } - if (piDeviceIds == 0 || iIdCountMax == 0) { - return 0; - } - int iMessagePos = 0; - // Count installed GPUs. - int iCudaDeviceCount = GetGpuCount(); - iMessagePos += sprintf(pcMessage + iMessagePos, "Found GPUs: %d\n", iCudaDeviceCount); - if (iCudaDeviceCount == 0) { - // printf("No GPU found\n"); - return 0; - } - - iCudaDeviceCount = min(iCudaDeviceCount, iIdCountMax); - iMessagePos += sprintf(pcMessage + iMessagePos, "Max GPUs: %d\n", iCudaDeviceCount); - if (strlen(kacGPUName) == 0) { - // Semi-compatible mode: - // Return all GPUs - for (int iI = 0; iI < iCudaDeviceCount; ++iI) { - piDeviceIds[iI] = iI; - } - return iCudaDeviceCount; - } - - cudaError_t err; - cudaDeviceProp propDevice; - int nMatch = 0; - for (int iId = 0; iId < iCudaDeviceCount; ++iId) { - err = cudaGetDeviceProperties(&propDevice, iId); - iMessagePos += sprintf(pcMessage + iMessagePos, "propDevice.name = %s\n", propDevice.name); - if (strcmp(propDevice.name, kacGPUName) == 0) { - piDeviceIds[nMatch] = iId; - ++nMatch; - } - } - - for (int iI = 0; iI < nMatch; ++iI) { - iMessagePos += sprintf(pcMessage + iMessagePos, "%d, ", piDeviceIds[iI]); - } - return nMatch; - -} - -void GetGpuName(int iDeviceId, char* pcName) { - memset(pcName, 0, 128); - cudaError_t err; - cudaDeviceProp propDevice; - int id = iDeviceId; - err = cudaGetDeviceProperties(&propDevice, id); - memcpy(pcName, propDevice.name, strlen(propDevice.name)*sizeof(char)); -} - - -int GetGpuCount() { - int iCudaDeviceCount = 0; - cudaGetDeviceCount(&iCudaDeviceCount); - return iCudaDeviceCount; -} diff --git a/Common/CUDA/gpuUtils.hpp.prehip b/Common/CUDA/gpuUtils.hpp.prehip deleted file mode 100644 index 38b518cf..00000000 --- a/Common/CUDA/gpuUtils.hpp.prehip +++ /dev/null @@ -1,18 +0,0 @@ - -#ifndef GPUUTILS_HPP -#define GPUUTILS_HPP -//! 
@brief # of installed GPUs -int GetGpuCount(); - -//! @brief IDs of GPUs whose name is kacGPUName. -//! @note Call GetGpuCount and allocate sufficient memory for piDeviceIds. -//! @param [in] kacGPUName -//! @param [in, out] piDeviceIds. -//! @param [in] iIdCountMax. Return value of GetGpuCount() -int GetGpuIdArray(const char* kacGPUName, int* piDeviceIds, int iIdCountMax, char* pcMessage); - -//! @brief GPU name of index iDeviceId. Allocate 128bytes for pcName before call. -void GetGpuName(int iDeviceId, char* pcName); - -#endif // GPUUTILS_HPP - diff --git a/Common/CUDA/improvedForwardProjections.cu.prehip b/Common/CUDA/improvedForwardProjections.cu.prehip deleted file mode 100644 index 0f32be72..00000000 --- a/Common/CUDA/improvedForwardProjections.cu.prehip +++ /dev/null @@ -1,1032 +0,0 @@ -/*------------------------------------------------------------------------- - * CUDA function for optimized proton CT radiographies - * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) - * and based on the method of Collins-Fekete (https://doi.org/10.1088/0031-9155/61/23/8232) - */ - -/*-------------------------------------------------------------------------- - This file is part of the TIGRE Toolbox - - Copyright (c) 2015, University of Bath and - CERN-European Organization for Nuclear Research - All rights reserved. - - License: Open Source under BSD. 
- See the full license at - https://github.com/CERN/TIGRE/blob/master/LICENSE - - Contact: tigre.toolbox@gmail.com - Codes: https://github.com/CERN/TIGRE/ - Coded by: Stefanie Kaser, Benjamin Kirchmayer ---------------------------------------------------------------------------*/ - -#include -#include "mex.h" -#include -#include "improvedForwardProjections.hpp" -#include -#include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ - } \ -} while (0) - - -__device__ int SolvePolynomial(float*x, float a, float b, float c){ - // Calculates real roots of a third-order polynomial function using Vieta's method and Cardano's method - // We obtain a polynomial of the form x³ + ax² + bx + c = 0 and reduce it to z³+pz+q = 0 - // Herefore, we have to make a substitution: x = z - a/3 - float p = b - a*a / 3.0; - float q = 2*a*a*a/27.0 - a*b / 3.0 + c; - float disc = q*q/4.0 + p*p*p/27.0; - if(disc > 0){ - float u = cbrt(-0.5*q + sqrt(disc)); - float v = cbrt(-0.5*q - sqrt(disc)); - x[0] = u + v - a/3.0; // don't forget to substitute back z --> x - return 1; - } - else if(disc == 0 && p == 0){ - x[0] = -a/3.0; // don't forget to substitute back z --> x - return 1; - } - else if(disc == 0 && p != 0){ - x[0] = 3.0*q/p - a/3.0; // don't forget to substitute back z --> x - x[1] = -3.0*q/(2.0*p) - a/3.0; - return 2; - } - else{ - x[0] = -sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p))) + pi/3.0) - a/3.0; // don't forget to substitute back z --> x - x[1] = sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p)))) - a/3.0; - x[2] = -sqrt(-4.0 * p / 3.0) * cos(1./3. 
* acos(-0.5*q*sqrt(-27./(p*p*p))) - pi/3.0) - a/3.0; - return 3; - } -} - -__device__ float cspline(float t, float a, float b, float c, float d){ - - return a*(t*t*t) + b*(t*t) + c*t +d; - -} - -__device__ void SimpleSort(float* arr, int size_arr){ - // Insertion sorting method - float curr_elem; - int j; - - for (int i=1; i=0 && curr_elem0){ - - float z_1 = -p/2.0 + sqrt(disc); - float z_2 = -p/2.0 - sqrt(disc); - float z_solve; - - if(in_or_out == 1){ - z_solve = min(z_1, z_2); - } - else { - z_solve = max(z_1, z_2); - } - - float x_solve = kx*z_solve + dx; - - float ky = direction[1]; - float dy = position[1] - ky*detOff; - float y_solve = ky*z_solve + dy; - - if(-h/2 <= y_solve && y_solve <= h/2){ - - HullIntercept[0] = x_solve; - HullIntercept[1] = y_solve; - HullIntercept[2] = z_solve; - - return 0; - } - else{ - float z1_h = (1.0/ky) * (0.5*h-dy); - float z2_h = (1.0/ky) * (-0.5*h-dy); - - if(in_or_out == 1){ - z_solve = min(z1_h, z2_h); - if(dy > 0){y_solve = -h*0.5;} - else{y_solve = h*0.5;} - x_solve = kx*z_solve + dx; - } - else { - z_solve = max(z1_h, z2_h); - if(dy < 0){y_solve = -h*0.5;} - else{y_solve = h*0.5;} - x_solve = kx*z_solve + dx; - } - - if(min(z_1, z_2) <= z_solve && z_solve <= max(z_1, z_2)){ - - HullIntercept[0] = x_solve; - HullIntercept[1] = y_solve; - HullIntercept[2] = z_solve; - - return 0; - } - - else{return 1;}} - } -else{return 1;} -} - - -__device__ int MinMax(float* solutions, float a, float b, float c){ - float p = 2*b/(3*a); - float q = c / (3*a); - float disc = 0.25*p*p - q; - if (disc > 0){ - solutions[0] = -0.5*p + sqrt(disc); - solutions[1] = -0.5*p - sqrt(disc); - return 0; - } - solutions[0] = -1; - solutions[1] = -1; - return 1; -} - - -__device__ int calcInterceptsLinear(float* LinInterceptsVec, float* start, float* stop, float* direction, float* pix, int maxIntercep, bool* protFlag){ - float boundary; - int counter = 0; - int nx, ny; - nx = int(abs(stop[0] - start[0])/pix[0]); - ny = int(abs(stop[1] - 
start[1])/pix[1]); - if(nx+ny>=maxIntercep){ - *protFlag = false; - return 1;} - - if (int(stop[0]/pix[0]) == int(start[0]/pix[0]) && int(stop[1]/pix[1]) == int(start[1]/pix[1])) { - *protFlag = true; - return 0; - } - - if (int(stop[0]/pix[0]) != int(start[0]/pix[0])) { - float k = direction[0]; - float d = start[0] - k*start[2]; - boundary = trunc( ((stop[0] > start[0]) ? stop[0]:start[0])/pix[0])*pix[0]; - - for (int ix=0; ix start[2] && intercept < stop[2]){ - LinInterceptsVec[ix] = intercept; - counter++; - if (counter >= maxIntercep){ - *protFlag = false; - return counter;} - } - } - } - - if (int(stop[1]/pix[1]) != int(start[1]/pix[1])) { - float k = direction[1]; - float d = start[1] - k*start[2]; - boundary = trunc( ((stop[1] > start[1]) ? stop[1]:start[1])/pix[1])*pix[1]; - for (int iy=nx; iy start[2] && intercept < stop[2]){ - LinInterceptsVec[iy] = intercept; - counter++; - if(counter >= maxIntercep){ - *protFlag = false; - return counter;} - } - } - } - int diff = maxIntercep - counter; - for(int j = 0; j 0){ - float cand = a[0] * solutions[0]*solutions[0]*solutions[0] + b[0] * solutions[0]*solutions[0] + c[0] * solutions[0] + d[0]; - if (cand > d[0] && cand > pos1[0]){ - (oneX > zeroX) ? oneX:zeroX=cand; - } - else if(cand < d[0] && cand < pos1[0]){ - (oneX < zeroX) ? oneX:zeroX=cand; - } - } - - if (solutions[1] < 1 && solutions[1] > 0){ - float cand = a[0] * solutions[1]*solutions[1]*solutions[1] + b[0] * solutions[1]*solutions[1] + c[0] * solutions[1] + d[0]; - if (cand > oneX && cand > zeroX){ - (oneX > zeroX) ? oneX:zeroX=cand; - } - else if(cand < oneX && cand < zeroX){ - (oneX < zeroX) ? oneX:zeroX=cand; - } - } - } - - - test = MinMax(solutions, a[1], b[1], c[1]); - if (test == 0){ - if (solutions[0] < 1 && solutions[0] > 0){ - float cand = a[1] * solutions[0]*solutions[0]*solutions[0] + b[1] * solutions[0]*solutions[0] + c[1] * solutions[0] + d[1]; - if (cand > d[1] && cand > pos1[1]){ - (oneY > zeroY) ? 
oneY:zeroY=cand; - } - else if(cand < d[1] && cand < pos1[1]){ - (oneY < zeroY) ? oneY:zeroY=cand; - } - } - - if (solutions[1] < 1 && solutions[1] > 0){ - float cand = a[1] * solutions[1]*solutions[1]*solutions[1] + b[1] * solutions[1]*solutions[1] + c[1] * solutions[1] + d[1]; - if (cand > oneY && cand > zeroY){ - (oneY > zeroY) ? oneY:zeroY=cand; - } - else if(cand < oneY && cand < zeroY){ - (oneY < zeroY) ? oneY:zeroY=cand; - } - } - } - - nx = int(abs(oneX - zeroX) / pixelSize[0]); - ny = int(abs(oneY - zeroY) / pixelSize[1]); - if (nx + ny == 0) { - *protFlag = true; - return 0; - } - - if ((nx + ny) <= maxIntercep){ - - if (int(oneX/pixelSize[0]) != int(zeroX/pixelSize[0])) { - boundary = trunc( ((oneX > zeroX) ? oneX:zeroX)/pixelSize[0])*pixelSize[0]; - for (int ix=0; ix 0. ){ - if (counter >=maxIntercep){break;} - InterceptsVec[counter] = IntercepX[kx]; - counter++; - } - }//kx - if (counter >=maxIntercep){break;} - } - } - - if ( int(oneY/pixelSize[1]) != int(zeroY/pixelSize[1])) { - boundary = trunc( ((oneY > zeroY) ? oneY:zeroY)/pixelSize[1])*pixelSize[1]; - for (int iy=0; iy 0.) ){ - if (counter >=maxIntercep){break;} - InterceptsVec[counter] = IntercepY[ky]; - counter++; - } - }//ky - if (counter >=maxIntercep){break;} - } - } - - if (counter >= maxIntercep){ // || counter == 0){ - *protFlag = false; - return counter; - }else{ - - - int diff = maxIntercep - counter; - for(int j = 0; j this is too slow! 7 s instead of 1.5 s - tInterceptsVec = new float[customsize]; - delete[] tInterceptsVec;*/ - /*float *ptr; ---> this is too slow! 
7.3s instead of 1.5 s - ptr = (float*) malloc(customsize * sizeof(float)); - free(ptr);*/ - - unsigned int protonIndex = blockIdx.x*blockDim.x + threadIdx.x; - float dimX, dimY, lk, lenX, lenY; - float lenZ = abs(*detectDistIn) + abs(*detectDistOut); - dimX = (float) *detectSizeX; - dimY = (float) *detectSizeY; - - //Dereference input parameters - int entries, dSizeX, dSizeY; - // float pix; - - entries = *numOfEntries; - dSizeX = *detectSizeX; - dSizeY = *detectSizeY; - // pix = *pixelSize; - - - if(hull[3] == 0){ - lenX = sqrt((devicePosOut[protonIndex] - devicePosIn[protonIndex]) * (devicePosOut[protonIndex] - devicePosIn[protonIndex]) \ - + lenZ*lenZ); - lenY = sqrt((devicePosOut[protonIndex + entries] - devicePosIn[protonIndex + entries]) * (devicePosOut[protonIndex + entries] - devicePosIn[protonIndex + entries]) \ - + lenZ*lenZ); - - float lambda0, lambda1, ref_wepl; - ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); - lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl) * (p_wepl[protonIndex]/ref_wepl); - lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl) * (p_wepl[protonIndex]/ref_wepl); - - float a[2], b[2], c[2], d[2], pos1[2]; - - //Allocate memory for all pointers - // Calculate optimized xdir_in - devicedirIn[protonIndex] = devicedirIn[protonIndex] \ - / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! - devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; - - // Calculate optimized ydir_in - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ - / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; - - // Calculate optimized xdir_out - devicedirOut[protonIndex] = devicedirOut[protonIndex] \ - / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! 
- devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; - - // Calculate optimized ydir_out - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ - / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; - - // Calculate spline parameters - a[0] = devicePosIn[protonIndex]*2. + devicedirIn[protonIndex] - 2.*devicePosOut[protonIndex] + devicedirOut[protonIndex]; - a[1] = devicePosIn[protonIndex + entries]*2. + devicedirIn[protonIndex + entries] - \ - 2.*devicePosOut[protonIndex + entries] + devicedirOut[protonIndex + entries]; - - b[0] = -3.*devicePosIn[protonIndex] -2.*devicedirIn[protonIndex] + 3.*devicePosOut[protonIndex] - devicedirOut[protonIndex]; - b[1] = -3.*devicePosIn[protonIndex + entries] -2.* devicedirIn[protonIndex + entries] \ - + 3.*devicePosOut[protonIndex + entries] - devicedirOut[protonIndex + entries]; - - c[0] = devicedirIn[protonIndex]; - c[1] = devicedirIn[protonIndex + entries]; - - d[0] = devicePosIn[protonIndex]; - d[1] = devicePosIn[protonIndex + entries]; - - pos1[0] = devicePosOut[protonIndex]; - pos1[1] = devicePosOut[protonIndex + entries]; - - /* --------------------------------------------------------------------------------- */ - /* ------------------------ Start without Hull (CS only) -------------------------- */ - /* --------------------------------------------------------------------------------- */ - int count; - bool status = false; - float InterceptsVec[vecSizeCS] = {0}; - - count = calcIntercepts(InterceptsVec, a, b, c, d, pos1, pix, &status, vecSizeCS); - - if (status) { - int indX, indY, linInd; - float tOld = 0.0; - if (count==0){ - indX = int(pos1[0]/pix[0]+dimX/2.); // REPLACE: pos1 by pos0 - indY = int(pos1[1]/pix[1]+dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - 
atomicAdd(&dhist1[linInd], p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], 1.0f); - } - - } - else{ - for(int i= 0; i<=count; i++){ - lk = (InterceptsVec[i]- tOld)*lenZ; - if(tOld == 0){ - indX = int(d[0]/pix[0] +dimX/2); - indY = int(d[1]/pix[1] +dimY/2); - linInd = indY + indX*(dSizeY); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - - }else if(i == count){ - lk = lenZ - InterceptsVec[i-1]*lenZ; - indX = int(pos1[0]/pix[0] +dimX/2); - indY = int(pos1[1]/pix[1] +dimY/2); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); - } - - }else{ - indX = int(cspline(InterceptsVec[i] - eps, a[0], b[0], c[0], d[0])/pix[0] +dimX/2); - indY = int(cspline(InterceptsVec[i] - eps, a[1], b[1], c[1], d[1])/pix[1] +dimY/2); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], (lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], (lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - } - - }//i - }//if - Intercepts - } - else{ - atomicAdd(reject, 1.0); - } -/* ------------------------ End no Hull calculation (CS only) -------------------------- */ - } - -else{ - // WEIGHTING FACTORS FOR CHANNELS I - float weight_air_in = 0.00479; - float weight_air_out = 0.00479; - - float HullIn[3], HullOut[3], initpos[3], exitpos[3]; - float initdir[2], exitdir[2]; - - initpos[0] = devicePosIn[protonIndex]; - initpos[1] = devicePosIn[protonIndex + entries]; - initpos[2] = *detectDistIn; - - exitpos[0] = devicePosOut[protonIndex]; - exitpos[1] = devicePosOut[protonIndex + entries]; - exitpos[2] = *detectDistOut; 
- - initdir[0] = devicedirIn[protonIndex]; - initdir[1] = devicedirIn[protonIndex + entries]; - - exitdir[0] = devicedirOut[protonIndex]; - exitdir[1] = devicedirOut[protonIndex + entries]; - - int check = hullEntryExit(HullIn, initpos, initdir, 1, hull, *detectDistIn); - - if(check == 0){ - check = hullEntryExit(HullOut, exitpos, exitdir, 0, hull, *detectDistOut); - } - - if(check == 0 && HullOut[2] > HullIn[2]){ - /* --------------------------------------------------------------------------------- */ - /* ------------------------ Start with Hull + SL outside -------------------------- */ - /* --------------------------------------------------------------------------------- */ - const int hullIntercep = int(vecSizeCS); - const int airIntercepIn = int(vecSizeIn); - const int airIntercepOut = int(vecSizeOut); - bool status1 = false; - bool status2 = false; - bool status3 = false; - - int countIn, countHull, countOut; - float InterceptsVecOut[airIntercepOut] = {0}; - float InterceptsVecIn[airIntercepIn] = {0}; - float InterceptsVecHull[hullIntercep] = {0}; - lenX = sqrt((HullOut[0] - HullIn[0])*(HullOut[0] - HullIn[0]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); - lenY = sqrt((HullOut[1] - HullIn[1])*(HullOut[1] - HullIn[1]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); - - countIn = calcInterceptsLinear(InterceptsVecIn, initpos, HullIn, initdir, pix, airIntercepIn, &status1); - countOut = calcInterceptsLinear(InterceptsVecOut, HullOut, exitpos, exitdir, pix, airIntercepOut, &status2); - - /* ------------ CUBIC SPLINE PREPARATIONS ---------------- */ - float lambda0, lambda1, ref_wepl; - ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); - lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); - lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); - - float a[2], b[2], c[2], d[2], pos1[2]; - - //Allocate memory for all pointers - // Calculate optimized xdir_in - devicedirIn[protonIndex] = 
devicedirIn[protonIndex] \ - / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! - devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; - - // Calculate optimized ydir_in - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ - / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; - - // Calculate optimized xdir_out - devicedirOut[protonIndex] = devicedirOut[protonIndex] \ - / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! - devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; - - // Calculate optimized ydir_out - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ - / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; - - // Calculate spline parameters - a[0] = HullIn[0]*2. + devicedirIn[protonIndex] - 2.*HullOut[0] + devicedirOut[protonIndex]; - a[1] = HullIn[1]*2. + devicedirIn[protonIndex + entries] - \ - 2.*HullOut[1] + devicedirOut[protonIndex + entries]; - - b[0] = -3.*HullIn[0] -2.*devicedirIn[protonIndex] + 3.*HullOut[0] - devicedirOut[protonIndex]; - b[1] = -3.*HullIn[1] -2.* devicedirIn[protonIndex + entries] \ - + 3.*HullOut[1] - devicedirOut[protonIndex + entries]; - - c[0] = devicedirIn[protonIndex]; - c[1] = devicedirIn[protonIndex + entries]; - - d[0] = HullIn[0]; - d[1] = HullIn[1]; - - pos1[0] = HullOut[0]; - pos1[1] = HullOut[1]; - - countHull = calcIntercepts(InterceptsVecHull, a, b, c, d, pos1, pix, &status3, hullIntercep); - /* -------------------- End CS Preparations! 
-------------- */ - - if(status1 && status2 && status3){ - float tOld = initpos[2]; - int indX, indY, linInd; - - // WEIGHTING FACTORS FOR CHANNELS II - float weight_water = 1; // p_wepl[protonIndex]/(len_b*weight_air_in); - - // ---------------------------------------- Start with SL from detector to hull - if (countIn == 0){ - indX = int(initpos[0]/pix[0] + dimX/2.); - indY = int(initpos[1]/pix[1] + dimY/2.); - lk = HullIn[2] - initpos[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - for(int i= 0; i<=countIn; i++){ - lk = InterceptsVecIn[i] - tOld; - if(i == 0){ - indX = int(initpos[0]/pix[0] + dimX/2.); - indY = int(initpos[1]/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecIn[i]; - } - } - else if(i == countIn){ - lk = HullIn[2] - InterceptsVecIn[i-1]; - indX = int(HullIn[0]/pix[0] + dimX/2.); - indY = int(HullIn[1]/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - indX = int(((initdir[0]*(InterceptsVecIn[i]-eps) + (initpos[0] - initdir[0] * initpos[2])))/pix[0] + dimX/2.); - indY = int(((initdir[1]*(InterceptsVecIn[i]-eps) + (initpos[1] - initdir[1] * initpos[2])))/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < 
lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecIn[i]; - } - } - } - } // end else - // --------------------------- CS within hull - - tOld = 0.0; - if (countHull==0){ - indX = int(HullIn[0]/pix[0] + dimX/2.); - indY = int(HullIn[1]/pix[1] + dimY/2.); - lk = HullOut[2] - HullIn[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - - } else{ - for(int i= 0; i<=countHull; i++){ - lk = (InterceptsVecHull[i] - tOld)*(HullOut[2] - HullIn[2]); - if(tOld == 0){ - indX = int(d[0]/pix[0] + dimX/2.); - indY = int(d[1]/pix[1] + dimY/2.); - linInd = indY + indX*(dSizeY); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVecHull[i]; - - }else if(i == countHull){ - lk = (HullOut[2] - HullIn[2]) - InterceptsVecHull[i-1]*(HullOut[2] - HullIn[2]); - indX = int(pos1[0]/pix[0] + dimX/2.); - indY = int(pos1[1]/pix[1] + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - - }else{ - indX = int(cspline(InterceptsVecHull[i] -eps, a[0], b[0], c[0], d[0])/pix[0] + dimX/2.); - indY = int(cspline(InterceptsVecHull[i] -eps, a[1], b[1], c[1], d[1])/pix[1] + dimY/2.); 
- - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVecHull[i]; - } - - }//i - } - - // --------------------------- SL from hull to detector - tOld = HullOut[2]; - if (countOut == 0){ - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - lk = exitpos[2] - HullOut[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - for(int i= 0; i<=countOut; i++){ - lk = abs(InterceptsVecOut[i] - tOld); - if(i == 0){ - indX = int(HullOut[0]/pix[0] + dimX/2.); - indY = int(HullOut[1]/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecOut[i]; - } - } - else if(i == countOut){ - lk = exitpos[2] - InterceptsVecOut[i-1]; - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - indX = int(((exitdir[0]*(InterceptsVecOut[i]-eps) + (HullOut[0] - exitdir[0] * HullOut[2])))/pix[0] + dimX/2.); - indY = 
int(((exitdir[1]*(InterceptsVecOut[i]-eps) + (HullOut[1] - exitdir[1] * HullOut[2])))/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecOut[i]; - } - } - } - } // end else - } - else{ - atomicAdd(reject, 1.0); - } - - /* --------------------------- End Hull + SL outside ------------------------------- */ - - } - - else{ - - /* --------------------------------------------------------------------------------- */ - /* ----------------------------- Start with SL only! ------------------------------ */ - /* --------------------------------------------------------------------------------- */ - int count; - bool status = false; - float InterceptsVec[vecSizeCS] = {0}; - - float initpos[3], exitpos[3]; - float mydir[2]; - initpos[0] = devicePosIn[protonIndex]; - initpos[1] = devicePosIn[protonIndex + entries]; - initpos[2] = *detectDistIn; - exitpos[0] = devicePosOut[protonIndex]; - exitpos[1] = devicePosOut[protonIndex + entries]; - exitpos[2] = *detectDistOut; - - mydir[0] = (exitpos[0] - initpos[0])/lenZ; - mydir[1] = (exitpos[1] - initpos[1])/lenZ; // dz = 1 - count = calcInterceptsLinear(InterceptsVec, initpos, exitpos, mydir, pix, vecSizeCS, &status); - - - if (status) { - int indX, indY, linInd; - float tOld = initpos[2]; - if (count==0){ - indX = int(initpos[0]/pix[0] + dimX/2.); - indY = int(initpos[1]/pix[1] + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*1.0f); - } - - } else{ - for(int i= 0; i<=count; i++){ - lk = InterceptsVec[i] - tOld; - if(tOld == initpos[2]){ - indX = int(initpos[0]/pix[0] 
+ dimX/2.); - indY = int(initpos[1]/pix[1] + dimY/2.); - linInd = indY + indX*(dSizeY); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - - }else if(i == count){ - lk = exitpos[2] - InterceptsVec[i-1]; - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - - }else{ - indX = int(((mydir[0]*(InterceptsVec[i]-eps) + (initpos[0] - mydir[0] * (initpos[2]))))/pix[0] + dimX/2.); - indY = int(((mydir[1]*(InterceptsVec[i]-eps) + (initpos[1] - mydir[1] * (initpos[2]))))/pix[1] + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - } - - } //i - }//if - Intercepts - } - else{ - // *reject += 1; - atomicAdd(reject, 1.0); - } - /* ------------------------------ End SL only! 
------ -------------------------- */ - } - } -} - -__global__ void sumHist(float* hist, float* histNorm){ - - unsigned int index = blockIdx.x*blockDim.x + threadIdx.x; - hist[index] = hist[index]/histNorm[index]; -} - -__host__ void ParticleProjections(float * outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, \ - float* p_wepl, int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, \ - float detectDistIn, float detectDistOut, float ein, float* ch_param){ - - /* - Detect Size = 400x400 - Prepare Input for GPU*/ - - const int sizeInputs = 2*numOfEntries*sizeof(float); - const int detectorMem = detectSizeX*detectSizeY*sizeof(float); - float reject = 0.0; - - float *dPosIn, *dPosOut, *ddirIn, *ddirOut, *dhist1, *dhist2, *d_wepl, *dHull; - int *dnumEntries, *ddetectorX, *ddetectorY; - float *dpixelSize, *dDetectDistIn, *dDetectDistOut, *dEin, *dReject; - - float *hist1, *hist2; - hist1 = new float[detectSizeX*detectSizeY]; - hist2 = new float[detectSizeX*detectSizeY]; - for(int i = 0; i>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ - dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); - cudaCheckErrors("Kernel fail!"); - - //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); - //dim3 block_sum(64,1,1); - //sumHist<<>>(dhist1, dhist2); - - //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); - cudaCheckErrors("Device to host transport failed!"); - - for(int j = 0; j -#include -#include -#ifndef improvedForwardProjections_H 
-#define improvedForwardProjections_H -#define pi 3.14159265359 -#define eps 1e-8 -#define vecSizeCS 220 -#define vecSizeOut 100 -#define vecSizeIn 10 -#define maxthreads 256 -//#include -//#include - -void ParticleProjections(float* outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, float* p_wepl, \ - int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, float detectDistIn, float detectDistOut, float ein, float* ch_param); - -__device__ int calcIntercepts(float* InterceptsVec ,float* a, float* b, \ - float* c, float* d, float* pos1, float pixelSize, bool* protFlag, int maxIntercep); - -__device__ int SolvePolynomial(float*x, float a, float b, float c); - -__device__ int MinMax(float* solutions, float a, float b, float c); - -__device__ void SimpleSort(float* arr, int size_arr); - -__global__ void ParticleKernel(float* dhist1, float* dhist2, float* devicePosIn, float* devicePosOut, float* devicedirIn, \ - float* devicedirOut ,float* p_wepl,int* numOfEntries, int* detectSizeX, int *detectSizeY, \ - float* pixelSize, float *detectDistIn, float *detectDistOut, float *ein, float *hull, float *reject); - -__device__ int hullEntryExit(float* HullIntercept, float* position, float* direction, int in_or_out, float *hullparams, float detOff); - -__device__ int calcInterceptsLinear(float* LinInterceptsVec, float* start, float* stop, float* direction, float pix, int maxIntercep, \ - bool* protFlag); - -void ParticleProjectionsCone(float* outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, float* p_wepl, \ - int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, float detectDistIn, float detectDistOut, float sourcePos, \ - float ein, float* ch_param); - -__device__ int calcInterceptsCone(float* InterceptsVec ,float* a, float* b, \ - float* c, float* d, float* pos1, float pixelSize, bool* protFlag, int maxIntercep, \ - float sourcePos, float din, float dout); - -__device__ int SolvePolynomialCone(float*x, 
float a, float b, float c); - -__device__ void SimpleSortCone(float* arr, int size_arr); - -__device__ int MinMaxCone(float* solutions, float a, float b, float c); - -__global__ void ParticleKernelCone(float* dhist1, float* dhist2, float* devicePosIn, float* devicePosOut, float* devicedirIn, \ - float* devicedirOut ,float* p_wepl,int* numOfEntries, int* detectSizeX, int *detectSizeY, \ - float* pixelSize, float *detectDistIn, float *detectDistOut, float *ein, float *hull, float *reject, \ - float* sourceDist); - -__device__ int hullEntryExitCone(float* HullIntercept, float* position, float* direction, int in_or_out, float *hullparams, float detOff); - -__device__ int calcInterceptsLinearCone(float* LinInterceptsVec, float* start, float* stop, float* direction, float pix, int maxIntercep, \ - bool* protFlag, float sourcePos); - -#endif - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/Common/CUDA/improvedForwardProjections_cone.cu.prehip b/Common/CUDA/improvedForwardProjections_cone.cu.prehip deleted file mode 100644 index 7a4f6b46..00000000 --- a/Common/CUDA/improvedForwardProjections_cone.cu.prehip +++ /dev/null @@ -1,1230 +0,0 @@ -/*------------------------------------------------------------------------- - * CUDA function for optimized proton CT radiographies - * The full method is described in Kaser et al.: Integration of proton imaging into the TIGRE toolbox (submitted to ZMP) - * and based on the method of Collins-Fekete (https://doi.org/10.1088/0031-9155/61/23/8232) - */ - -/*-------------------------------------------------------------------------- - This file is part of the TIGRE Toolbox - - Copyright (c) 2015, University of Bath and - 
CERN-European Organization for Nuclear Research - All rights reserved. - - License: Open Source under BSD. - See the full license at - https://github.com/CERN/TIGRE/blob/master/LICENSE - - Contact: tigre.toolbox@gmail.com - Codes: https://github.com/CERN/TIGRE/ - Coded by: Stefanie Kaser, Benjamin Kirchmayer ---------------------------------------------------------------------------*/ - - -#include -#include "mex.h" -#include -#include "improvedForwardProjections.hpp" -// #include -// #include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("ImprovedForwardProj:",cudaGetErrorString(__err));\ - } \ -} while (0) - - -__device__ int SolvePolynomialCone(float*x, float a, float b, float c){ - // Calculates real roots of a third-order polynomial function using Vieta's method and Cardano's method - // We obtain a polynomial of the form x³ + ax² + bx + c = 0 and reduce it to z³+pz+q = 0 - // Herefore, we have to make a substitution: x = z - a/3 - float p = b - a*a / 3.0; - float q = 2*a*a*a/27.0 - a*b / 3.0 + c; - float disc = q*q/4.0 + p*p*p/27.0; - if(disc > 0){ - float u = cbrt(-0.5*q + sqrt(disc)); - float v = cbrt(-0.5*q - sqrt(disc)); - x[0] = u + v - a/3.0; // don't forget to substitute back z --> x - return 1; - } - else if(disc == 0 && p == 0){ - x[0] = -a/3.0; // don't forget to substitute back z --> x - return 1; - } - else if(disc == 0 && p != 0){ - x[0] = 3.0*q/p - a/3.0; // don't forget to substitute back z --> x - x[1] = -3.0*q/(2.0*p) - a/3.0; - return 2; - } - else{ - x[0] = -sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p))) + pi/3.0) - a/3.0; // don't forget to substitute back z --> x - x[1] = sqrt(-4.0 * p / 3.0) * cos(1./3. * acos(-0.5*q*sqrt(-27./(p*p*p)))) - a/3.0; - x[2] = -sqrt(-4.0 * p / 3.0) * cos(1./3. 
* acos(-0.5*q*sqrt(-27./(p*p*p))) - pi/3.0) - a/3.0; - return 3; - } -} - -__device__ float csplineCone(float t, float a, float b, float c, float d){ - - return a*(t*t*t) + b*(t*t) + c*t +d; - -} - -__device__ void SimpleSortCone(float* arr, int size_arr){ - // Insertion sorting method - float curr_elem; - int j; - - for (int i=1; i=0 && curr_elem0){ - - float z_1 = -p/2.0 + sqrt(disc); - float z_2 = -p/2.0 - sqrt(disc); - float z_solve; - - if(in_or_out == 1){ - z_solve = min(z_1, z_2); - } - else { - z_solve = max(z_1, z_2); - } - - float x_solve = kx*z_solve + dx; - - float ky = direction[1]; - float dy = position[1] - ky*detOff; - float y_solve = ky*z_solve + dy; - - if(-h/2 <= y_solve && y_solve <= h/2){ - - HullIntercept[0] = x_solve; - HullIntercept[1] = y_solve; - HullIntercept[2] = z_solve; - - return 0; - } - else{ - float z1_h = (1.0/ky) * (0.5*h-dy); - float z2_h = (1.0/ky) * (-0.5*h-dy); - - if(in_or_out == 1){ - z_solve = min(z1_h, z2_h); - if(dy > 0){y_solve = -h*0.5;} - else{y_solve = h*0.5;} - x_solve = kx*z_solve + dx; - } - else { - z_solve = max(z1_h, z2_h); - if(dy < 0){y_solve = -h*0.5;} - else{y_solve = h*0.5;} - x_solve = kx*z_solve + dx; - } - - if(min(z_1, z_2) <= z_solve && z_solve <= max(z_1, z_2)){ - - HullIntercept[0] = x_solve; - HullIntercept[1] = y_solve; - HullIntercept[2] = z_solve; - - return 0; - } - - else{return 1;}} - } -else{return 1;} -} - - - -__device__ int calcInterceptsLinearCone(float* LinInterceptsVec, float* start, float* stop, float* direction, float* pix, int maxIntercep, bool* protFlag, - float sourcePos){ - float tan_alpha, d_channel; - int counter = 0; - int nx, ny; - float sdd = abs(stop[2] - sourcePos); // distance source detector - float sidd = abs(start[2] - sourcePos); // distance sourcce inital detector - int select; - - float pix_start_x = sidd * (pix[0]/sdd); - float pix_start_y = sidd * (pix[1]/sdd); - - nx = int(abs(stop[0]/pix[0] - start[0]/pix_start_x)); - ny = int(abs(stop[1]/pix[1] - 
start[1]/pix_start_y)); - if(nx+ny>=maxIntercep){ - *protFlag = false; - return 1;} - - if (int(stop[0]/pix[0]) == int(start[0]/pix_start_x) && int(stop[1]/pix[1]) == int(start[1]/pix_start_y)) { - *protFlag = true; - return 0; - } - - if (int(stop[0]/pix[0]) != int(start[0]/pix_start_x)) { - float k = direction[0]; - float d = start[0] - k*start[2]; - if(stop[0]/pix[0] > start[0]/pix_start_x){ - tan_alpha = (trunc(stop[0]/pix[0])*pix[0])/sdd; - d_channel = trunc(stop[0]/pix[0])*pix[0] - tan_alpha * stop[2]; - select = 0; - } - else{ - tan_alpha = (trunc(start[0]/pix_start_x)*pix_start_x)/sidd; - d_channel = trunc(start[0]/pix_start_x)*pix_start_x - tan_alpha * start[2]; - select = 1; - } - - for (int ix=0; ix start[2] && intercept < stop[2]){ - LinInterceptsVec[ix] = intercept; - counter++; - if (counter >= maxIntercep){ - *protFlag = false; - return counter;} - } - } - } - - if (int(stop[1]/pix[1]) != int(start[1]/pix_start_y)) { - float k = direction[1]; - float d = start[1] - k*start[2]; - if(stop[1]/pix[1] > start[1]/pix_start_y){ - tan_alpha = (trunc(stop[1]/pix[1])*pix[1])/sdd; - d_channel = trunc(stop[1]/pix[1])*pix[1] - tan_alpha * stop[2]; - select = 0; - } - else{ - tan_alpha = (trunc(start[1]/pix_start_y)*pix_start_y)/sidd; - d_channel = trunc(start[1]/pix_start_y)*pix_start_y - tan_alpha * start[2]; - select = 1; - } - - for (int iy=nx; iy start[2] && intercept < stop[2]){ - LinInterceptsVec[iy] = intercept; - counter++; - if (counter >= maxIntercep){ - *protFlag = false; - return counter;} - } - } - } - - int diff = maxIntercep - counter; - for(int j = 0; j 0){ - solutions[0] = -0.5*p + sqrt(disc); - solutions[1] = -0.5*p - sqrt(disc); - return 0; - } - solutions[0] = -1; - solutions[1] = -1; - return 1; -} - - - -__device__ int calcInterceptsCone(float* InterceptsVec ,float* a, float* b, \ - float* c, float* d, float* pos1, float* pixelSize, bool* protFlag, int maxIntercep, \ - float sourcePos, float din, float dout){ - - /*Calculates channel 
Intercepts and the lengths the proton (ion) has spent in the - corresponding channel. - Returns 1 if proton is accepted and 0 if it is rejected due to too many Intercepts - */ - float oneX, oneY, zeroX, zeroY, pix_oneX, pix_oneY, pix_zeroX, pix_zeroY; - float tan_alpha, d_channel; - float sdd_init = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! - float sidd_init = abs(din - sourcePos)/abs(dout-din); - float sdd_x = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! - float sidd_x = abs(din - sourcePos)/abs(dout-din); - float sdd_y = abs(dout - sourcePos)/abs(dout-din); // normalize to 1! - float sidd_y = abs(din - sourcePos)/abs(dout-din); - int select; - float pix_start_x = sidd_init * (pixelSize[0]/sdd_init); - float pix_start_y = sidd_init * (pixelSize[1]/sdd_init); - zeroX = d[0]; - oneX = pos1[0]; - zeroY = d[1]; - oneY = pos1[1]; - pix_zeroX = pix_start_x; - pix_zeroY = pix_start_y; - pix_oneX = pixelSize[0]; - pix_oneY = pixelSize[1]; - - - int status, nx, ny; - float IntercepX[3]; - float IntercepY[3]; - float solutions[2]; - // counter has to be implemented despite the initial discrimination because one can not state beforehand if - // the cubic spline has more than one Intercept with the channel boundary - int counter=0; - - int test = MinMaxCone(solutions, a[0], b[0], c[0]); - if (test == 0){ - if (solutions[0] < 1 && solutions[0] > 0){ - float cand = a[0] * solutions[0]*solutions[0]*solutions[0] + b[0] * solutions[0]*solutions[0] + c[0] * solutions[0] + d[0]; - float pix_cand = (sidd_init + solutions[0]) * (pixelSize[0]/sdd_init); - if (cand/pix_cand > d[0]/pix_start_x && cand/pix_cand > pos1[0]/pixelSize[0]){ - (oneX/pix_oneX > zeroX/pix_zeroX) ? oneX:zeroX=cand; - (oneX/pix_oneX > zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; - (oneX/pix_oneX > zeroX/pix_zeroX) ? 
sdd_x:sidd_x = solutions[0] - sourcePos/(dout-din); - } - else if(cand/pix_cand < d[0]/pix_start_x && cand/pix_cand < pos1[0]/pixelSize[0]){ - (oneX/pix_oneX < zeroX/pix_zeroX) ? oneX:zeroX=cand; - (oneX/pix_oneX < zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; - (oneX/pix_oneX < zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[0] - sourcePos/(dout-din); - } - } - - if (solutions[1] < 1 && solutions[1] > 0){ - float cand = a[0] * solutions[1]*solutions[1]*solutions[1] + b[0] * solutions[1]*solutions[1] + c[0] * solutions[1] + d[0]; - float pix_cand = (sidd_init + solutions[1]) * (pixelSize[0]/sdd_init); - if (cand/pix_cand > oneX/pix_oneX && cand/pix_cand > zeroX/pix_zeroX){ - (oneX/pix_oneX > zeroX/pix_zeroX) ? oneX:zeroX=cand; - (oneX/pix_oneX > zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; - (oneX/pix_oneX > zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[1] - sourcePos/(dout-din); - } - else if(cand/pix_cand < oneX/pix_oneX && cand/pix_cand < zeroX/pix_zeroX){ - (oneX/pix_oneX < zeroX/pix_zeroX) ? oneX:zeroX=cand; - (oneX/pix_oneX < zeroX/pix_zeroX) ? pix_oneX:pix_zeroX = pix_cand; - (oneX/pix_oneX < zeroX/pix_zeroX) ? sdd_x:sidd_x = solutions[1] - sourcePos/(dout-din); - } - } - } - - test = MinMaxCone(solutions, a[1], b[1], c[1]); - if (test == 0){ - if (solutions[0] < 1 && solutions[0] > 0){ - float cand = a[1] * solutions[0]*solutions[0]*solutions[0] + b[1] * solutions[0]*solutions[0] + c[1] * solutions[0] + d[1]; - float pix_cand = (sidd_init + solutions[0]) * (pixelSize[1]/sdd_init); - if (cand/pix_cand > d[1]/pix_start_y && cand/pix_cand > pos1[1]/pixelSize[1]){ - (oneY/pix_oneY > zeroY/pix_zeroY) ? oneY:zeroY=cand; - (oneY/pix_oneY > zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; - (oneY/pix_oneY > zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[0] - sourcePos/(dout-din); - } - else if(cand/pix_cand < d[1]/pix_start_y && cand/pix_cand < pos1[1]/pixelSize[1]){ - (oneY/pix_oneY < zeroY/pix_zeroY) ? oneY:zeroY=cand; - (oneY/pix_oneY < zeroY/pix_zeroY) ? 
pix_oneY:pix_zeroY = pix_cand; - (oneY/pix_oneY < zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[0] - sourcePos/(dout-din); - } - } - - if (solutions[1] < 1 && solutions[1] > 0){ - float cand = a[1] * solutions[1]*solutions[1]*solutions[1] + b[1] * solutions[1]*solutions[1] + c[1] * solutions[1] + d[1]; - float pix_cand = (sidd_init + solutions[1]) * (pixelSize[1]/sdd_init); - if (cand/pix_cand > oneY/pix_oneY && cand/pix_cand > zeroY/pix_zeroY){ - (oneY/pix_oneY > zeroY/pix_zeroY) ? oneY:zeroY=cand; - (oneY/pix_oneY > zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; - (oneY/pix_oneY > zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[1] - sourcePos/(dout-din); - } - else if(cand/pix_cand < oneY/pix_oneY && cand/pix_cand < zeroY/pix_zeroY){ - (oneY/pix_oneY < zeroY/pix_zeroY) ? oneY:zeroY=cand; - (oneY/pix_oneY < zeroY/pix_zeroY) ? pix_oneY:pix_zeroY = pix_cand; - (oneY/pix_oneY < zeroY/pix_zeroY) ? sdd_y:sidd_y = solutions[1] - sourcePos/(dout-din); - } - } - } - //Check how many Intercepts will occur approximately - nx = int(abs(oneX/pix_oneX - zeroX/pix_zeroX)); - ny = int(abs(oneY/pix_oneY - zeroY/pix_zeroY)); - - if (nx + ny == 0) { - *protFlag = true; - return 0; - } - if ((nx + ny) <= maxIntercep){ - - if (int(oneX/pix_oneX) != int(zeroX/pix_zeroX)) { - if(oneX/pix_oneX > zeroX/pix_zeroX){ - tan_alpha = (trunc(oneX/pix_oneX)*pix_oneX)/sdd_x; - d_channel = trunc(oneX/pix_oneX)*pix_oneX * (sidd_init/sdd_x); - select = 0; - } - else{ - tan_alpha = (trunc(zeroX/pix_zeroX)*pix_zeroX)/sidd_x; - d_channel = trunc(zeroX/pix_zeroX)*pix_zeroX * (sidd_init/sidd_x); - select = 1; - } - for (int ix=0; ix 0. 
){ - if (counter >=maxIntercep){break;} - InterceptsVec[counter] = IntercepX[kx]; - counter++; - } - }//kx - if (counter >=maxIntercep){break;} - } - } - - if ( int(oneY/pix_oneY) != int(zeroY/pix_zeroY)) { - if(oneY/pix_oneY > zeroY/pix_zeroY){ - tan_alpha = (trunc(oneY/pix_oneY)*pix_oneY)/sdd_y; - d_channel = trunc(oneY/pix_oneY)*pix_oneY * (sidd_init/sdd_y); - select = 0; - } - else{ - tan_alpha = (trunc(zeroY/pix_zeroY)*pix_zeroY)/sidd_y; - d_channel = trunc(zeroY/pix_zeroY)*pix_zeroY * (sidd_init/sidd_y); - select = 1; - } - for (int iy=0; iy 0. ){ - if (counter >=maxIntercep){break;} - InterceptsVec[counter] = IntercepY[ky]; - counter++; - } - }//kx - if (counter >=maxIntercep){break;} - } - } - - if (counter >= maxIntercep){ // || counter == 0){ - *protFlag = false; - return counter; - } - - else{ - int diff = maxIntercep - counter; - for(int j = 0; j HullIn[2]){ - /* --------------------------------------------------------------------------------- */ - /* ------------------------ Start with Hull + SL outside -------------------------- */ - /* --------------------------------------------------------------------------------- */ - const int hullIntercep = int(vecSizeCS); - const int airIntercepIn = int(vecSizeIn); - const int airIntercepOut = int(vecSizeOut); - bool status1 = false; - bool status2 = false; - bool status3 = false; - - int countIn, countHull, countOut; - float InterceptsVecOut[airIntercepOut] = {0}; - float InterceptsVecIn[airIntercepIn] = {0}; - float InterceptsVecHull[hullIntercep] = {0}; - lenX = sqrt((HullOut[0] - HullIn[0])*(HullOut[0] - HullIn[0]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); - lenY = sqrt((HullOut[1] - HullIn[1])*(HullOut[1] - HullIn[1]) + (HullOut[2] - HullIn[2])*(HullOut[2] - HullIn[2])); - - float newpix[2]; - newpix[0] = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - newpix[1] = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - countIn = 
calcInterceptsLinearCone(InterceptsVecIn, initpos, HullIn, initdir, newpix, airIntercepIn, &status1, *sourceDist); - countOut = calcInterceptsLinearCone(InterceptsVecOut, HullOut, exitpos, exitdir, pix, airIntercepOut, &status2, *sourceDist); - - /* ------------ CUBIC SPLINE PREPARATIONS ---------------- */ - float lambda0, lambda1, ref_wepl; - ref_wepl = 10 * 0.00244 * powf(*ein, 1.75); - lambda0 = 1.01 + 0.43 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); - lambda1 = 0.99 - 0.46 * (p_wepl[protonIndex]/ref_wepl)*(p_wepl[protonIndex]/ref_wepl); - - float a[2], b[2], c[2], d[2], pos1[2]; - - //Allocate memory for all pointers - // Calculate optimized xdir_in - devicedirIn[protonIndex] = devicedirIn[protonIndex] \ - / sqrt(devicedirIn[protonIndex]*devicedirIn[protonIndex] + 1.0); // ... dz = 1! - devicedirIn[protonIndex] = devicedirIn[protonIndex] * lenX * lambda0; - - // Calculate optimized ydir_in - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] \ - / sqrt(devicedirIn[protonIndex + entries]*devicedirIn[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirIn[protonIndex + entries] = devicedirIn[protonIndex + entries] * lenY * lambda0; - - // Calculate optimized xdir_out - devicedirOut[protonIndex] = devicedirOut[protonIndex] \ - / sqrt(devicedirOut[protonIndex]*devicedirOut[protonIndex] + 1.0); // ... dz = 1! - devicedirOut[protonIndex] = devicedirOut[protonIndex] * lenX * lambda1; - - // Calculate optimized ydir_out - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] \ - / sqrt(devicedirOut[protonIndex + entries]*devicedirOut[protonIndex + entries] + 1.0); // ... dz = 1! - devicedirOut[protonIndex + entries] = devicedirOut[protonIndex + entries] * lenY * lambda1; - - // Calculate spline parameters - a[0] = HullIn[0]*2. + devicedirIn[protonIndex] - 2.*HullOut[0] + devicedirOut[protonIndex]; - a[1] = HullIn[1]*2. 
+ devicedirIn[protonIndex + entries] - \ - 2.*HullOut[1] + devicedirOut[protonIndex + entries]; - - b[0] = -3.*HullIn[0] -2.*devicedirIn[protonIndex] + 3.*HullOut[0] - devicedirOut[protonIndex]; - b[1] = -3.*HullIn[1] -2.* devicedirIn[protonIndex + entries] \ - + 3.*HullOut[1] - devicedirOut[protonIndex + entries]; - - c[0] = devicedirIn[protonIndex]; - c[1] = devicedirIn[protonIndex + entries]; - - d[0] = HullIn[0]; - d[1] = HullIn[1]; - - pos1[0] = HullOut[0]; - pos1[1] = HullOut[1]; - - // float newpix[2]; - newpix[0] = abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - newpix[1] = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - countHull = calcInterceptsCone(InterceptsVecHull, a, b, c, d, pos1, newpix, &status3, hullIntercep, *sourceDist, HullIn[2], HullOut[2]); - /* -------------------- End CS Preparations! -------------- */ - - if(status1 && status2 && status3){ - float tOld = initpos[2]; - int indX, indY, linInd; - // WEIGHTING FACTORS FOR CHANNELS II - float weight_water = 1; - - // ---------------------------------------- Start with SL from detector to hull - float pix_start_x = abs(initpos[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - float pix_start_y = abs(initpos[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - float pix_end_x = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - float pix_end_y = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - if (countIn == 0){ - indX = int(initpos[0]/pix_start_x + dimX/2.); - indY = int(initpos[1]/pix_start_y + dimY/2.); - lk = HullIn[2] - initpos[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - for(int i= 0; i<=countIn; i++){ - lk = InterceptsVecIn[i] - tOld; - if(i 
== 0){ - indX = int(initpos[0]/pix_start_x + dimX/2.); - indY = int(initpos[1]/pix_start_y + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecIn[i]; - } - } - else if(i == countIn){ - lk = HullIn[2] - InterceptsVecIn[i-1]; - indX = int(HullIn[0]/pix_end_x + dimX/2.); - indY = int(HullIn[1]/pix_end_y + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - float curr_pix_x = abs((InterceptsVecIn[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - float curr_pix_y = abs((InterceptsVecIn[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - indX = int(((initdir[0]*(InterceptsVecIn[i]-eps) + (initpos[0] - initdir[0] * initpos[2] )))/curr_pix_x + dimX/2.); - indY = int(((initdir[1]*(InterceptsVecIn[i]-eps) + (initpos[1] - initdir[1] * initpos[2] )))/curr_pix_y + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullIn[2]-initpos[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_in*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecIn[i]; - } - } - } - } // end else - - // ---cone beam------------------------ CS within hull - - tOld = 0.0; - pix_start_x = abs(HullIn[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - pix_start_y = abs(HullIn[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - pix_end_x = 
abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - pix_end_y = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - if (countHull==0){ - indX = int(HullIn[0]/pix_start_x + dimX/2.); - indY = int(HullIn[1]/pix_start_y + dimY/2.); - lk = HullOut[2] - HullIn[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - - } else{ - for(int i= 0; i<=countHull; i++){ - lk = (InterceptsVecHull[i] - tOld)*(HullOut[2] - HullIn[2]); - if(tOld == 0){ - indX = int(d[0]/pix_start_x + dimX/2.); - indY = int(d[1]/pix_start_y + dimY/2.); - linInd = indY + indX*(dSizeY); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVecHull[i]; - - }else if(i == countHull){ - lk = (HullOut[2] - HullIn[2]) - InterceptsVecHull[i-1]*(HullOut[2] - HullIn[2]); - indX = int(pos1[0]/pix_end_x + dimX/2.); - indY = int(pos1[1]/pix_end_y + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - - }else{ - float curr_len = (InterceptsVecHull[i]-eps)*(HullOut[2]-HullIn[2]) + (HullIn[2] - *sourceDist); // abs(((InterceptsVecHull[i]-eps)*lenZ + *detectDistIn) - *sourceDist) - float curr_pix_x = curr_len * (pix[0]/abs(exitpos[2] - *sourceDist)); - float curr_pix_y = curr_len * (pix[1]/abs(exitpos[2] - *sourceDist)); - indX = 
int(csplineCone(InterceptsVecHull[i] - eps, a[0], b[0], c[0], d[0])/curr_pix_x + dimX/2.); - indY = int(csplineCone(InterceptsVecHull[i] - eps, a[1], b[1], c[1], d[1])/curr_pix_y + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (HullOut[2]-HullIn[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_water*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_water*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVecHull[i]; - } - - }//i - } - - // --------------------------- SL from hull to detector - tOld = HullOut[2]; - pix_start_x = abs(HullOut[2] - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - pix_start_y = abs(HullOut[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - if (countOut == 0){ - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - lk = exitpos[2] - HullOut[2]; - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - for(int i= 0; i<=countOut; i++){ - lk = abs(InterceptsVecOut[i] - tOld); - if(i == 0){ - indX = int(HullOut[0]/pix_start_x + dimX/2.); - indY = int(HullOut[1]/pix_start_y + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecOut[i]; - } - } - else if(i == countOut){ - lk = exitpos[2] - InterceptsVecOut[i-1]; - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < 
(exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - } - - else{ - float curr_pix_x = abs((InterceptsVecOut[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - float curr_pix_y = abs((InterceptsVecOut[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - indX = int(((exitdir[0]*(InterceptsVecOut[i]-eps) + (HullOut[0] - exitdir[0] * HullOut[2])))/curr_pix_x + dimX/2.); - indY = int(((exitdir[1]*(InterceptsVecOut[i]-eps) + (HullOut[1] - exitdir[1] * HullOut[2])))/curr_pix_y + dimY/2.); - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < (exitpos[2]-HullOut[2]))){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - tOld = InterceptsVecOut[i]; - } - } - } - } // end else - - } - else{ - atomicAdd(reject, 1.0); - } - - /* --------------------------- End Hull + SL outside ------------------------------- */ - - } - - else{ - - /* --------------------------------------------------------------------------------- */ - /* ----------------------------- Start with SL only! 
------------------------------ */ - /* --------------------------------------------------------------------------------- */ - int count; - bool status = false; - float InterceptsVec[vecSizeCS] = {0}; - //float InterceptsLengths[vecSizeCS+1] = {0}; - - float initpos[3], exitpos[3]; - float mydir[2]; - initpos[0] = devicePosIn[protonIndex]; - initpos[1] = devicePosIn[protonIndex + entries]; - initpos[2] = *detectDistIn; - exitpos[0] = devicePosOut[protonIndex]; - exitpos[1] = devicePosOut[protonIndex + entries]; - exitpos[2] = *detectDistOut; - - mydir[0] = (exitpos[0] - initpos[0])/lenZ; - mydir[1] = (exitpos[1] - initpos[1])/lenZ; // dz = 1 - count = calcInterceptsLinearCone(InterceptsVec, initpos, exitpos, mydir, pix, vecSizeCS, &status, *sourceDist); - - // for cone beam we need this - /*float lenZ_custom = 0.0; - float head[3], tail[3]; - for (int i=0; i<=count; i++){ - if (i == 0){ - head[0] = mydir[0]*InterceptsVec[i] + 0.5*(initpos[0] + exitpos[0]); - head[1] = mydir[1]*InterceptsVec[i] + 0.5*(initpos[1] + exitpos[1]); - head[2] = InterceptsVec[i]; - InterceptsLengths[i] = sqrt(powf(head[0] - initpos[0], 2.0) + powf(head[1] - initpos[1], 2.0) + powf(head[2] - initpos[2], 2.0)); - tail[0] = head[0]; - tail[1] = head[1]; - tail[2] = head[2]; - lenZ_custom += InterceptsLengths[i]; - } - else if (i == count){ - InterceptsLengths[i] = sqrt(powf(exitpos[0] - tail[0], 2.0) + powf(exitpos[1] - tail[1], 2.0) + powf(exitpos[2] - tail[2], 2.0)); - lenZ_custom += InterceptsLengths[i]; - } - else{ - head[0] = mydir[0]*InterceptsVec[i] + 0.5*(initpos[0] + exitpos[0]); - head[1] = mydir[1]*InterceptsVec[i] + 0.5*(initpos[1] + exitpos[1]); - head[2] = InterceptsVec[i]; - InterceptsLengths[i] = sqrt(powf(head[0] - tail[0], 2.0) + powf(head[1] - tail[1], 2.0) + powf(head[2] - tail[2], 2.0)); - tail[0] = head[0]; - tail[1] = head[1]; - tail[2] = head[2]; - lenZ_custom += InterceptsLengths[i]; - } - }*/ - - float pix_start_x = abs(initpos[2] - *sourceDist) * 
(pix[0]/abs(exitpos[2] - *sourceDist)); - float pix_start_y = abs(initpos[2] - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - - if (status) { - int indX, indY, linInd; - // exitpos[0] / (exitpos[2] - *sourceDir); - float tOld = initpos[2]; - if (count==0){ - indX = int(initpos[0]/pix_start_x + dimX/2.); - indY = int(initpos[1]/pix_start_y + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*1.0f); - } - - } else{ - for(int i= 0; i<=count; i++){ - lk = InterceptsVec[i] - tOld; - // lk = InterceptsLengths[i]; - if(i == 0){ - indX = int(initpos[0]/pix_start_x + dimX/2.); - indY = int(initpos[1]/pix_start_y + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - - }else if(i == count){ - lk = exitpos[2] - InterceptsVec[i-1]; - indX = int(exitpos[0]/pix[0] + dimX/2.); - indY = int(exitpos[1]/pix[1] + dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - - }else{ - float curr_pix_x = abs((InterceptsVec[i]-eps) - *sourceDist) * (pix[0]/abs(exitpos[2] - *sourceDist)); - float curr_pix_y = abs((InterceptsVec[i]-eps) - *sourceDist) * (pix[1]/abs(exitpos[2] - *sourceDist)); - indX = int(((mydir[0]*(InterceptsVec[i]-eps) + (initpos[0] - mydir[0] * (initpos[2]))))/curr_pix_x+dimX/2.); - indY = int(((mydir[1]*(InterceptsVec[i]-eps) + (initpos[1] - mydir[1] * 
(initpos[2]))))/curr_pix_y+dimY/2.); - - if ((0 <= indX) && (indX < dSizeX) && (0 <= indY) && (indY < dSizeY) && (0 < lk) && (lk < lenZ)){ - linInd = indY + indX*(dSizeY); - atomicAdd(&dhist1[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)*p_wepl[protonIndex]); - atomicAdd(&dhist2[linInd], weight_air_out*(lk/lenZ)*(lk/lenZ)); - } - tOld = InterceptsVec[i]; - } - - } //i - }//if - Intercepts - } - else{ - // *reject += 1; - atomicAdd(reject, 1.0); - } - /* ------------------------------ End SL only! ------ -------------------------- */ - } - } -} - -__global__ void sumHistCone(float* hist, float* histNorm){ - - unsigned int index = blockIdx.x*blockDim.x + threadIdx.x; - hist[index] = hist[index]/histNorm[index]; -} - -__host__ void ParticleProjectionsCone(float * outProjection, float* posIn, float* posOut, float* dirIn, float* dirOut, \ - float* p_wepl, int numOfEntries, int detectSizeX, int detectSizeY, float* pixelSize, \ - float detectDistIn, float detectDistOut, float sourcePos, \ - float ein, float* ch_param){ - - /* - Detect Size = 400x400 - Prepare Input for GPU*/ - - const int sizeInputs = 2*numOfEntries*sizeof(float); - const int detectorMem = detectSizeX*detectSizeY*sizeof(float); - float reject = 0.0; - - float *dPosIn, *dPosOut, *ddirIn, *ddirOut, *dhist1, *dhist2, *d_wepl, *dHull; - int *dnumEntries, *ddetectorX, *ddetectorY; - float *dpixelSize, *dDetectDistIn, *dDetectDistOut, *dSourceDist, *dEin, *dReject; - - float *hist1, *hist2; - hist1 = new float[detectSizeX*detectSizeY]; - hist2 = new float[detectSizeX*detectSizeY]; - for(int i = 0; i>>(dhist1, dhist2, dPosIn, dPosOut, ddirIn, ddirOut, d_wepl, dnumEntries, ddetectorX, ddetectorY, \ - dpixelSize, dDetectDistIn, dDetectDistOut, dEin, dHull, dReject, dSourceDist); - cudaError_t _err = cudaGetLastError(); - mexPrintf("%s \n", cudaGetErrorString(_err)); - cudaCheckErrors("Kernel fail!"); - - //dim3 grid_sum((int)floor(detectSizeX*detectSizeY/64),1,1); - //dim3 block_sum(64,1,1); - //sumHist<<>>(dhist1, 
dhist2); - - //Copy result from device to host - //cudaMemcpy(outProjection, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist1, dhist1,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(hist2, dhist2,detectorMem ,cudaMemcpyDeviceToHost); - cudaMemcpy(&reject, dReject,sizeof(float) ,cudaMemcpyDeviceToHost); - //cudaError_t _errcp = cudaGetLastError(); - //mexPrintf("%s \n", cudaGetErrorString(_errcp)); - cudaCheckErrors("Device to host transport failed!"); - - for(int j = 0; j -#include - -float maxDistanceCubeXY(Geometry geo, float alpha,int i){ - /////////// - // Compute initial "t" so we access safely as less as out of bounds as possible. - ////////// - - float maxCubX,maxCubY; - // Forgetting Z, compute max distance: diagonal+offset - maxCubX=(geo.sVoxelX/2+ abs(geo.offOrigX[i]))/geo.dVoxelX; - maxCubY=(geo.sVoxelY/2+ abs(geo.offOrigY[i]))/geo.dVoxelY; - - return geo.DSO[i]/geo.dVoxelX-sqrt(maxCubX*maxCubX+maxCubY*maxCubY); -} - -void rollPitchYaw(Geometry geo,int i, Point3D* point){ - Point3D auxPoint; - auxPoint.x=point->x; - auxPoint.y=point->y; - auxPoint.z=point->z; - - point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->z=-sin(geo.dPitch[i])*auxPoint.x - +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y - +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; -} \ No newline at end of file diff --git a/Common/CUDA/projection.hpp.prehip b/Common/CUDA/projection.hpp.prehip deleted file mode 100644 index 54597d92..00000000 --- 
a/Common/CUDA/projection.hpp.prehip +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef PROJECTION_HPP -#define PROJECTION_HPP - -#include "types_TIGRE.hpp" - -float maxDistanceCubeXY(Geometry geo, float alpha,int i); -void rollPitchYaw(Geometry geo,int i, Point3D* point); - -#endif diff --git a/Common/CUDA/ray_interpolated_projection.cu.prehip b/Common/CUDA/ray_interpolated_projection.cu.prehip deleted file mode 100644 index e71c5b59..00000000 --- a/Common/CUDA/ray_interpolated_projection.cu.prehip +++ /dev/null @@ -1,843 +0,0 @@ -/*------------------------------------------------------------------------- - * - * CUDA functions for texture-memory interpolation based projection - * - * This file has the necessary fucntiosn to perform X-ray CBCT projection - * operation given a geaometry, angles and image. It uses the 3D texture - * memory linear interpolation to uniformily sample a path to integrate the - * X-rays. - * - * CODE by Ander Biguri - * Sepideh Hatamikia (arbitrary rotation) - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - - - - - - -#include -#include -#include -#include "ray_interpolated_projection.hpp" -#include "TIGRE_common.hpp" -#include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - cudaDeviceReset();\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated",cudaGetErrorString(__err));\ - } \ -} while (0) - - - -#define MAXTREADS 1024 -#define PROJ_PER_BLOCK 9 -#define PIXEL_SIZE_BLOCK 9 - /*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * --->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ - void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, 
cudaTextureObject_t *texImage,bool allocate); -__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device -__constant__ float projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device - - -__global__ void vecAddInPlaceInterp(float *a, float *b, unsigned long n) -{ - int idx = blockIdx.x*blockDim.x+threadIdx.x; - // Make sure we do not go out of bounds - if (idx < n) - a[idx] = a[idx] + b[idx]; -} - - -template - __global__ void kernelPixelDetector( Geometry geo, - float* detector, - const int currProjSetNumber, - const int totalNoOfProjections, - cudaTextureObject_t tex){ - - unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; - unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long projNumber=threadIdx.z; - - if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) - return; - -#if IS_FOR_MATLAB_TIGRE - size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#else - size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#endif - - unsigned long indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array - - if(indAlpha>=totalNoOfProjections) - return; - - Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaU = projParamsArrayDev[4*projNumber+1]; - Point3D deltaV = projParamsArrayDev[4*projNumber+2]; - Point3D source = projParamsArrayDev[4*projNumber+3]; - - float DSO = projFloatsArrayDev[2*projNumber+0]; - float cropdist_init = projFloatsArrayDev[2*projNumber+1]; - - - - /////// Get coordinates XYZ of pixel UV - unsigned long pixelV = geo.nDetecV-v-1; - unsigned long pixelU = u; - - - float vectX,vectY,vectZ; - Point3D P; - 
P.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); - P.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); - P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); - - // Length is the ray length in normalized space - float length=__fsqrt_rd((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); - //now legth is an integer of Nsamples that are required on this line - length=ceilf(__fdividef(length,geo.accuracy));//Divide the directional vector by an integer - vectX=__fdividef(P.x -source.x,length); - vectY=__fdividef(P.y -source.y,length); - vectZ=__fdividef(P.z -source.z,length); - - -// //Integrate over the line - float tx,ty,tz; - float sum=0; - float i; - - - -// Because I have no idea how to efficiently cutoff the legth path in 3D, a very upper limit is computed (see maxdistanceCuboid) -// for the 3D case. However it would be bad to lose performance in the 3D case -// TODO: can ge really improve this? - if (sphericalrotation){ - if ((2*DSO/fminf(fminf(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)+cropdist_init)/geo.accuracy < length) - length=ceilf((2*DSO/fminf(fminf(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)+cropdist_init)/geo.accuracy); - } - else{ - if ((2*DSO/fminf(geo.dVoxelX,geo.dVoxelY)+cropdist_init)/geo.accuracy < length) - length=ceilf((2*DSO/fminf(geo.dVoxelX,geo.dVoxelY)+cropdist_init)/geo.accuracy); - } - - - //Length is not actually a length, but the amount of memreads with given accuracy ("samples per voxel") - for (i=floorf(cropdist_init/geo.accuracy); i<=length; i=i+1){ - tx=vectX*i+source.x; - ty=vectY*i+source.y; - tz=vectZ*i+source.z; - - sum += tex3D(tex, tx+0.5f, ty+0.5f, tz+0.5f); // this line is 94% of time. - } - - float deltalength=sqrtf((vectX*geo.dVoxelX)*(vectX*geo.dVoxelX)+ - (vectY*geo.dVoxelY)*(vectY*geo.dVoxelY)+ - (vectZ*geo.dVoxelZ)*(vectZ*geo.dVoxelZ) ); - - detector[idx]=sum*deltalength; -} - - - -// legnth(angles)=3 x nagnles, as we have roll, pitch, yaw. 
-int interpolation_projection(float * img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ - - - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("Ax:Interpolated_projection:GPUselect","There are no available device(s) that support CUDA\n"); - } - // - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("Ax:Interpolated_projection:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); - } - int dev; - - // Check free memory - size_t mem_GPU_global; - checkFreeMemory(gpuids,&mem_GPU_global); - - // printf("geo.nDetec (U, V) = %d, %d\n", geo.nDetecU, geo.nDetecV); - - size_t mem_image=(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); - size_t mem_proj =(unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV * sizeof(float); - - // Does everything fit in the GPUs? - const bool fits_in_memory = mem_image+2*PROJ_PER_BLOCK*mem_proj= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to - // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. 
- -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & splits>1){ - cudaHostRegister(img, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); -#endif - Point3D source, deltaU, deltaV, uvOrigin; - - Point3D* projParamsArrayHost = 0; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); - float* projFloatsArrayHost = 0; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); - cudaCheckErrors("Error allocating auxiliary constant memory"); - - // Create Streams for overlapping memcopy and compute - int nStream_device=2; - int nStreams=deviceCount*nStream_device; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t)); - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < nStream_device; ++i){ - cudaStreamCreate(&stream[i+dev*nStream_device]); - - } - } - cudaCheckErrors("Stream creation fail"); - int nangles_device=(nangles+deviceCount-1)/deviceCount; - int nangles_last_device=(nangles-(deviceCount-1)*nangles_device); - unsigned int noOfKernelCalls = (nangles_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK - unsigned int noOfKernelCallsLastDev = (nangles_last_device+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // we will use this in the memory management. 
- int projection_this_block; - - - - cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount]; - cudaArray **d_cuArrTex = new cudaArray*[deviceCount]; - for (unsigned int sp=0;sp=nangles) - break; - if ((i*PROJ_PER_BLOCK+j)>=nangles_device) - break; - geoArray[sp].alpha=angles[proj_global*3]; - geoArray[sp].theta=angles[proj_global*3+1]; - geoArray[sp].psi =angles[proj_global*3+2]; - - is_spherical+=abs(geoArray[sp].theta)+abs(geoArray[sp].psi); - - //precomute distances for faster execution - maxdist=maxdistanceCuboid(geoArray[sp],proj_global); - //Precompute per angle constant stuff for speed - computeDeltas(geoArray[sp], proj_global, &uvOrigin, &deltaU, &deltaV, &source); - //Ray tracing! - projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection - projParamsArrayHost[4*j+1]=deltaU; - projParamsArrayHost[4*j+2]=deltaV; - projParamsArrayHost[4*j+3]=source; - - projFloatsArrayHost[2*j]=geo.DSO[proj_global]; - projFloatsArrayHost[2*j+1]=floor(maxdist); - } - - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*nStream_device]); - cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[dev*nStream_device]); - cudaStreamSynchronize(stream[dev*nStream_device]); - - - //TODO: we could do this around X and Y axis too, but we would need to compute the new axis of rotation (not possible to know from jsut the angles) - if (!is_spherical){ - kernelPixelDetector<<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); - } - else{ - kernelPixelDetector <<>>(geoArray[sp],dProjection[(i%2)+dev*2],i,nangles_device,texImg[dev]); - } - } - - - // Now that the computation is happening, we need to either prepare the memory for - // combining of the projections (splits>1) and start removing previous results. 
- - - // If our image does not fit in memory then we need to make sure we accumulate previous results too. - // This is done in 2 steps: - // 1)copy previous results back into GPU - // 2)accumulate with current results - // The code to take them out is the same as when there are no splits needed - if( !fits_in_memory&&sp>0) - { - // 1) grab previous results and put them in the auxiliary variable dProjection_accum - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - - // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... - if(i+1==noOfKernelCalls) //is it the last block? - projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - else - projection_this_block=PROJ_PER_BLOCK; - cudaMemcpyAsync(dProjection_accum[(i%2)+dev*2], result[proj_global], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyHostToDevice,stream[dev*2+1]); - } - // 2) take the results from current compute call and add it to the code in execution. - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=i*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - - // Unless its the last projection set, we have PROJ_PER_BLOCK angles. Otherwise... - if(i+1==noOfKernelCalls) //is it the last block? 
- projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - else - projection_this_block=PROJ_PER_BLOCK; - cudaStreamSynchronize(stream[dev*2+1]); // wait until copy is finished - vecAddInPlaceInterp<<<(geo.nDetecU*geo.nDetecV*projection_this_block+MAXTREADS-1)/MAXTREADS,MAXTREADS,0,stream[dev*2]>>>(dProjection[(i%2)+dev*2],dProjection_accum[(i%2)+dev*2],(unsigned long)geo.nDetecU*geo.nDetecV*projection_this_block); - } - } // end accumulation case, where the image needs to be split - - // Now, lets get out the projections from the previous execution of the kernels. - if (i>0) - { - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on previous set on this GPU - proj_global=(i-1)*PROJ_PER_BLOCK+dev*nangles_device; - if (dev+1==deviceCount) { //is it the last device? - // projections assigned to this device is >=nangles_device-(deviceCount-1) and < nangles_device - if (i-1 < noOfKernelCallsLastDev) { - // The previous set(block) was not empty. - projection_this_block=min(PROJ_PER_BLOCK, nangles-proj_global); - } - else { - // The previous set was empty. - // This happens if deviceCount > PROJ_PER_BLOCK+1. - // e.g. PROJ_PER_BLOCK = 9, deviceCount = 11, nangles = 199. - // e.g. PROJ_PER_BLOCK = 1, deviceCount = 3, nangles = 7. - break; - } - } - else { - projection_this_block=PROJ_PER_BLOCK; - } - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(i%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); - } - } - // Make sure Computation on kernels has finished before we launch the next batch. - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*2]); - } - } // End noOfKernelCalls (i) loop. 
- - // We still have the last set of projections to get out of GPUs - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - //Global index of FIRST projection on this set on this GPU - proj_global=(noOfKernelCalls-1)*PROJ_PER_BLOCK+dev*nangles_device; - if(proj_global>=nangles) - break; - // How many projections are left here? - projection_this_block=min(nangles_device-(noOfKernelCalls-1)*PROJ_PER_BLOCK, //the remaining angles that this GPU had to do (almost never PROJ_PER_BLOCK) - nangles-proj_global); //or whichever amount is left to finish all (this is for the last GPU) - - cudaDeviceSynchronize(); //Not really necessary, but just in case, we los nothing. - cudaCheckErrors("Error at copying the last set of projections out (or in the previous copy)"); - cudaMemcpyAsync(result[proj_global], dProjection[(int)(!(noOfKernelCalls%2))+dev*2], projection_this_block*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[dev*2+1]); - } - // Make sure everyone has done their bussiness before the next image split: - for (dev = 0; dev < deviceCount; dev++) - { - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - } // End image split loop. 
- - cudaCheckErrors("Main loop fail"); - /////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////// - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texImg[dev]); - cudaFreeArray(d_cuArrTex[dev]); - } - delete[] texImg; texImg = 0; - delete[] d_cuArrTex; d_cuArrTex = 0; - // Freeing Stage - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection[dev*2]); - cudaFree(dProjection[dev*2+1]); - - } - free(dProjection); - - if(!fits_in_memory){ - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dProjection_accum[dev*2]); - cudaFree(dProjection_accum[dev*2+1]); - - } - free(dProjection_accum); - } - freeGeoArray(splits,geoArray); - cudaFreeHost(projParamsArrayHost); - cudaFreeHost(projFloatsArrayHost); - - - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]) ; -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & splits>1){ - cudaHostUnregister(img); - } -#endif - cudaCheckErrors("cudaFree fail"); - -// cudaDeviceReset(); - return 0; -} -void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,bool allocate) -{ - const unsigned int num_devices = gpuids.GetLength(); - //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - if(allocate){ - - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - - //cudaArray Descriptor - - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); - cudaCheckErrors("Texture memory allocation fail"); - } - - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaMemcpy3DParms copyParams = {0}; - cudaSetDevice(gpuids[dev]); - //Array 
creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[dev]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params); - //cudaCheckErrors("Texture memory data copy fail"); - //Array creation End - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - if (geo.accuracy>1){ - texDescr.filterMode = cudaFilterModePoint; - geo.accuracy=1; - } - else{ - texDescr.filterMode = cudaFilterModeLinear; - } - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); - cudaCheckErrors("Texture object creation fail"); - } -} - -/* This code generates the geometries needed to split the image properly in - * cases where the entire image does not fit in the memory of the GPU - **/ -void splitImageInterp(unsigned int splits,Geometry geo,Geometry* geoArray, unsigned int nangles){ - - unsigned long splitsize=(geo.nVoxelZ+splits-1)/splits;// ceil if not divisible - for(unsigned int sp=0;spx=Pfinalu0.x-Pfinal.x; - deltaU->y=Pfinalu0.y-Pfinal.y; - deltaU->z=Pfinalu0.z-Pfinal.z; - - deltaV->x=Pfinalv0.x-Pfinal.x; - deltaV->y=Pfinalv0.y-Pfinal.y; - deltaV->z=Pfinalv0.z-Pfinal.z; - - *source=S; -} - -float maxdistanceCuboid(Geometry geo,unsigned int i){ - /////////// - // Compute initial "t" so we access safely as less as out of bounds as possible. 
- ////////// - - - float maxCubX,maxCubY,maxCubZ; - // Forgetting Z, compute mas distance: diagonal+offset - maxCubX=(geo.nVoxelX/2+ abs(geo.offOrigX[i])/geo.dVoxelX); - maxCubY=(geo.nVoxelY/2+ abs(geo.offOrigY[i])/geo.dVoxelY); - maxCubZ=(geo.nVoxelZ/2+ abs(geo.offOrigZ[i])/geo.dVoxelZ); - - float a,b; - a=geo.DSO[i]/geo.dVoxelX; - b=geo.DSO[i]/geo.dVoxelY; - -// As the return of this value is in "voxel space", the source may have an elliptical curve. -// The distance returned is the safe distance that can be skipped for a given angle alpha, before we need to start sampling. - - if (geo.theta==0.0f & geo.psi==0.0f) // Special case, it will make the code faster - return max(a*b/sqrt(a*a*sin(geo.alpha)*sin(geo.alpha)+b*b*cos(geo.alpha)*cos(geo.alpha))- - sqrt(maxCubX*maxCubX+maxCubY*maxCubY),0.0f); - //TODO: think of more special cases? - return max(geo.DSO[i]/max(max(geo.dVoxelX,geo.dVoxelY),geo.dVoxelZ)-sqrt(maxCubX*maxCubX+maxCubY*maxCubY+maxCubZ*maxCubZ),0.0f); - -} -void rollPitchYaw(Geometry geo,unsigned int i, Point3D* point){ - Point3D auxPoint; - auxPoint.x=point->x; - auxPoint.y=point->y; - auxPoint.z=point->z; - - point->x=cos(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) - sin(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(cos(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) + sin(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->y=sin(geo.dRoll[i])*cos(geo.dPitch[i])*auxPoint.x - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*sin(geo.dYaw[i]) + cos(geo.dRoll[i])*cos(geo.dYaw[i]))*auxPoint.y - +(sin(geo.dRoll[i])*sin(geo.dPitch[i])*cos(geo.dYaw[i]) - cos(geo.dRoll[i])*sin(geo.dYaw[i]))*auxPoint.z; - - point->z=-sin(geo.dPitch[i])*auxPoint.x - +cos(geo.dPitch[i])*sin(geo.dYaw[i])*auxPoint.y - +cos(geo.dPitch[i])*cos(geo.dYaw[i])*auxPoint.z; - -} -void eulerZYZ(Geometry geo, Point3D* point){ - Point3D auxPoint; - auxPoint.x=point->x; - auxPoint.y=point->y; - auxPoint.z=point->z; - - 
point->x=(+cos(geo.alpha)*cos(geo.theta)*cos(geo.psi)-sin(geo.alpha)*sin(geo.psi))*auxPoint.x+ - (-cos(geo.alpha)*cos(geo.theta)*sin(geo.psi)-sin(geo.alpha)*cos(geo.psi))*auxPoint.y+ - cos(geo.alpha)*sin(geo.theta)*auxPoint.z; - - point->y=(+sin(geo.alpha)*cos(geo.theta)*cos(geo.psi)+cos(geo.alpha)*sin(geo.psi))*auxPoint.x+ - (-sin(geo.alpha)*cos(geo.theta)*sin(geo.psi)+cos(geo.alpha)*cos(geo.psi))*auxPoint.y+ - sin(geo.alpha)*sin(geo.theta)*auxPoint.z; - - point->z=-sin(geo.theta)*cos(geo.psi)*auxPoint.x+ - sin(geo.theta)*sin(geo.psi)*auxPoint.y+ - cos(geo.theta)*auxPoint.z; - - -} -//______________________________________________________________________________ -// -// Function: freeGeoArray -// -// Description: Frees the memory from the geometry array for multiGPU. -//______________________________________________________________________________ -void freeGeoArray(unsigned int splits,Geometry* geoArray){ - for(unsigned int sp=0;sp -#include -#include -#include "ray_interpolated_projection_parallel.hpp" -#include "TIGRE_common.hpp" -#include - -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("TIGRE:Ax:interpolated_parallel",cudaGetErrorString(__err));\ - } \ -} while (0) - - - -#define MAXTREADS 1024 -#define PROJ_PER_BLOCK 8 -#define PIXEL_SIZE_BLOCK 8 -/*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * --->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream); -__constant__ Point3D projParamsArrayDev[4*PROJ_PER_BLOCK]; // Dev means it is on device -__constant__ float 
projFloatsArrayDev[2*PROJ_PER_BLOCK]; // Dev means it is on device - - - -__global__ void kernelPixelDetector_parallel_interpolated( Geometry geo, - float* detector, - const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) -{ -// Point3D source , -// Point3D deltaU, -// Point3D deltaV, -// Point3D uvOrigin, -// float DSO, -// float maxdist){ - - unsigned long long u = blockIdx.x * blockDim.x + threadIdx.x; - unsigned long long v = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long projNumber=threadIdx.z; - - if (u>= geo.nDetecU || v>= geo.nDetecV || projNumber>=PROJ_PER_BLOCK) - return; - - int indAlpha = currProjSetNumber*PROJ_PER_BLOCK+projNumber; // This is the ABSOLUTE projection number in the projection array - - -#if IS_FOR_MATLAB_TIGRE - size_t idx = (size_t)(u * (unsigned long long)geo.nDetecV + v)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#else - size_t idx = (size_t)(v * (unsigned long long)geo.nDetecU + u)+ projNumber*(unsigned long long)geo.nDetecV *(unsigned long long)geo.nDetecU ; -#endif - - if(indAlpha>=totalNoOfProjections) - return; - - Point3D uvOrigin = projParamsArrayDev[4*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaU = projParamsArrayDev[4*projNumber+1]; - Point3D deltaV = projParamsArrayDev[4*projNumber+2]; - Point3D source = projParamsArrayDev[4*projNumber+3]; - - float DSO = projFloatsArrayDev[2*projNumber+0]; - float maxdist = projFloatsArrayDev[2*projNumber+1]; - - - /////// Get coordinates XYZ of pixel UV - unsigned long pixelV = geo.nDetecV-v-1; - unsigned long pixelU = u; - - - float vectX,vectY,vectZ; - Point3D P; - P.x=(uvOrigin.x+pixelU*deltaU.x+pixelV*deltaV.x); - P.y=(uvOrigin.y+pixelU*deltaU.y+pixelV*deltaV.y); - P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); - Point3D S; - S.x=(source.x+pixelU*deltaU.x+pixelV*deltaV.x); - S.y=(source.y+pixelU*deltaU.y+pixelV*deltaV.y); - 
S.z=(source.z+pixelU*deltaU.z+pixelV*deltaV.z); - - // Length is the ray length in normalized space - double length=sqrtf((S.x-P.x)*(S.x-P.x)+(S.y-P.y)*(S.y-P.y)+(S.z-P.z)*(S.z-P.z)); - //now legth is an integer of Nsamples that are required on this line - length=ceilf(length/geo.accuracy);//Divide the directional vector by an integer - vectX=(P.x -S.x)/(length); - vectY=(P.y -S.y)/(length); - vectZ=(P.z -S.z)/(length); - - -// //Integrate over the line - float tx,ty,tz; - float sum=0; - float i; - - - // limit the amount of mem access after the cube, but before the detector. - if ((2*DSO/geo.dVoxelX+maxdist)/geo.accuracy < length) - length=ceilf((2*DSO/geo.dVoxelX+maxdist)/geo.accuracy); - //Length is not actually a length, but the amount of memreads with given accuracy ("samples per voxel") - - for (i=floorf(maxdist/geo.accuracy); i<=length; i=i+1){ - tx=vectX*i+S.x; - ty=vectY*i+S.y; - tz=vectZ*i+S.z; - - sum += tex3D(tex, tx+0.5f, ty+0.5f, tz+0.5f); // this line is 94% of time. - - } - float deltalength=sqrtf((vectX*geo.dVoxelX)*(vectX*geo.dVoxelX)+ - (vectY*geo.dVoxelY)*(vectY*geo.dVoxelY)+ - (vectZ*geo.dVoxelZ)*(vectZ*geo.dVoxelZ) ); - detector[idx]=sum*deltalength; -} - - - -int interpolation_projection_parallel(float * img, Geometry geo, float** result,float const * const angles,int nangles, const GpuIds& gpuids){ - - - - size_t num_bytes = geo.nDetecU*geo.nDetecV *PROJ_PER_BLOCK* sizeof(float); - float** dProjection=(float **)malloc(2*sizeof(float *)); - for (int i = 0; i < 2; ++i){ - cudaMalloc((void**)&dProjection[i], num_bytes); - cudaCheckErrors("cudaMalloc projections fail"); - } - // allocate streams for memory and compute - int nStreams=2; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; - - for (int i = 0; i < 2; ++i){ - cudaStreamCreate(&stream[i]); - } - - - // Texture object variables - cudaTextureObject_t *texImg = 0; - cudaArray **d_cuArrTex = 0; - texImg 
=(cudaTextureObject_t*)malloc(1*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(1*sizeof(cudaArray*)); - - CreateTextureParallelInterp(img,geo,&d_cuArrTex[0], &texImg[0],stream); - cudaCheckErrors("Texture allocation fail"); - //Done! Image put into texture memory. - - - - Point3D source, deltaU, deltaV, uvOrigin; - - Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,4*PROJ_PER_BLOCK*sizeof(Point3D)); - float* projFloatsArrayHost; - cudaMallocHost((void**)&projFloatsArrayHost,2*PROJ_PER_BLOCK*sizeof(float)); - - // 16x16 gave the best performance empirically - // Funnily that makes it compatible with most GPUs..... - int divU,divV,divangle; - divU=PIXEL_SIZE_BLOCK; - divV=PIXEL_SIZE_BLOCK; - - dim3 numBlocks((geo.nDetecU+divU-1)/divU,(geo.nDetecV+divV-1)/divV,1); - dim3 threadsPerBlock(divU,divV,PROJ_PER_BLOCK); - unsigned int proj_global; - unsigned int noOfKernelCalls = (nangles+PROJ_PER_BLOCK-1)/PROJ_PER_BLOCK; // We'll take care of bounds checking inside the loop if nalpha is not divisible by PROJ_PER_BLOCK - unsigned int i; - - float maxdist; - for ( i=0; i=nangles) - break; - - geo.alpha=angles[proj_global*3]; - geo.theta=angles[proj_global*3+1]; - geo.psi =angles[proj_global*3+2]; - //precomute distances for faster execution - maxdist=maxdistanceCuboid(geo,proj_global); - //Precompute per angle constant stuff for speed - computeDeltas_parallel(geo,geo.alpha,proj_global, &uvOrigin, &deltaU, &deltaV, &source); - //Ray tracing! 
- projParamsArrayHost[4*j]=uvOrigin; // 6*j because we have 6 Point3D values per projection - projParamsArrayHost[4*j+1]=deltaU; - projParamsArrayHost[4*j+2]=deltaV; - projParamsArrayHost[4*j+3]=source; - - projFloatsArrayHost[2*j]=geo.DSO[proj_global]; - projFloatsArrayHost[2*j+1]=floor(maxdist); - - } - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*4*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projFloatsArrayDev, projFloatsArrayHost, sizeof(float)*2*PROJ_PER_BLOCK,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); - - kernelPixelDetector_parallel_interpolated<<>>(geo,dProjection[(int)i%2==0],i,nangles,texImg[0]); - // copy result to host - if (i>0) - cudaMemcpyAsync(result[i*PROJ_PER_BLOCK-PROJ_PER_BLOCK],dProjection[(int)i%2!=0], num_bytes, cudaMemcpyDeviceToHost,stream[1]); - } - cudaDeviceSynchronize(); - - int lastangles=nangles-(i-1)*PROJ_PER_BLOCK; - cudaMemcpyAsync(result[(i-1)*PROJ_PER_BLOCK],dProjection[(int)(i-1)%2==0], lastangles*geo.nDetecV*geo.nDetecU*sizeof(float), cudaMemcpyDeviceToHost,stream[1]); - - - cudaDestroyTextureObject(texImg[0]); - cudaFreeArray(d_cuArrTex[0]); - free(texImg); texImg = 0; - free(d_cuArrTex); d_cuArrTex = 0; - cudaCheckErrors("Unbind fail"); - cudaFree(dProjection[0]); - cudaFree(dProjection[1]); - free(dProjection); - cudaFreeHost(projParamsArrayHost); - cudaFreeHost(projFloatsArrayHost); - - cudaCheckErrors("cudaFree d_imagedata fail"); - - - for (int i = 0; i < 2; ++i){ - cudaStreamDestroy(stream[i]); - } -// cudaDeviceReset(); - - return 0; -} - - - - -/* This code precomputes The location of the source and the Delta U and delta V (in the warped space) - * to compute the locations of the x-rays. While it seems verbose and overly-optimized, - * it does saves about 30% of each of the kernel calls. Thats something! 
- **/ -void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source){ - Point3D S; - S.x=geo.DSO[i]; - S.y=geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); - S.z=geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - - //End point - Point3D P,Pu0,Pv0; - - P.x =-(geo.DSD[i]-geo.DSO[i]); P.y = geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); P.z = geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - Pu0.x=-(geo.DSD[i]-geo.DSO[i]); Pu0.y= geo.dDetecU*(1-((float)geo.nDetecU/2)+0.5); Pu0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-0); - Pv0.x=-(geo.DSD[i]-geo.DSO[i]); Pv0.y= geo.dDetecU*(0-((float)geo.nDetecU/2)+0.5); Pv0.z= geo.dDetecV*(((float)geo.nDetecV/2)-0.5-1); - // Geometric trasnformations: - P.x=0;Pu0.x=0;Pv0.x=0; - - // Roll pitch yaw - rollPitchYaw(geo,i,&P); - rollPitchYaw(geo,i,&Pu0); - rollPitchYaw(geo,i,&Pv0); - //Now lets translate the points where they should be: - P.x=P.x-(geo.DSD[i]-geo.DSO[i]); - Pu0.x=Pu0.x-(geo.DSD[i]-geo.DSO[i]); - Pv0.x=Pv0.x-(geo.DSD[i]-geo.DSO[i]); - - S.x=0; - // Roll pitch yaw - rollPitchYaw(geo,i,&S); - //Now lets translate the points where they should be: - S.x=S.x+geo.DSO[i]; - - - //1: Offset detector - - //P.x - P.y =P.y +geo.offDetecU[i]; P.z =P.z +geo.offDetecV[i]; - Pu0.y=Pu0.y+geo.offDetecU[i]; Pu0.z=Pu0.z+geo.offDetecV[i]; - Pv0.y=Pv0.y+geo.offDetecU[i]; Pv0.z=Pv0.z+geo.offDetecV[i]; - //S doesnt need to chagne - - - //3: Rotate (around z)! 
- Point3D Pfinal, Pfinalu0, Pfinalv0; - Pfinal.x =P.x; - Pfinal.y =P.y +geo.offDetecU[i]; Pfinal.z =P.z +geo.offDetecV[i]; - Pfinalu0.x=Pu0.x; - Pfinalu0.y=Pu0.y +geo.offDetecU[i]; Pfinalu0.z =Pu0.z +geo.offDetecV[i]; - Pfinalv0.x=Pv0.x; - Pfinalv0.y=Pv0.y +geo.offDetecU[i]; Pfinalv0.z =Pv0.z +geo.offDetecV[i]; - - eulerZYZ(geo,&Pfinal); - eulerZYZ(geo,&Pfinalu0); - eulerZYZ(geo,&Pfinalv0); - eulerZYZ(geo,&S); - - - - //2: Offset image (instead of offseting image, -offset everything else) - - Pfinal.x =Pfinal.x-geo.offOrigX[i]; Pfinal.y =Pfinal.y-geo.offOrigY[i]; Pfinal.z =Pfinal.z-geo.offOrigZ[i]; - Pfinalu0.x=Pfinalu0.x-geo.offOrigX[i]; Pfinalu0.y=Pfinalu0.y-geo.offOrigY[i]; Pfinalu0.z=Pfinalu0.z-geo.offOrigZ[i]; - Pfinalv0.x=Pfinalv0.x-geo.offOrigX[i]; Pfinalv0.y=Pfinalv0.y-geo.offOrigY[i]; Pfinalv0.z=Pfinalv0.z-geo.offOrigZ[i]; - S.x=S.x-geo.offOrigX[i]; S.y=S.y-geo.offOrigY[i]; S.z=S.z-geo.offOrigZ[i]; - - // As we want the (0,0,0) to be in a corner of the image, we need to translate everything (after rotation); - Pfinal.x =Pfinal.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinal.y =Pfinal.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinal.z =Pfinal.z +geo.sVoxelZ/2-geo.dVoxelZ/2; - Pfinalu0.x=Pfinalu0.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinalu0.y=Pfinalu0.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinalu0.z=Pfinalu0.z+geo.sVoxelZ/2-geo.dVoxelZ/2; - Pfinalv0.x=Pfinalv0.x+geo.sVoxelX/2-geo.dVoxelX/2; Pfinalv0.y=Pfinalv0.y+geo.sVoxelY/2-geo.dVoxelY/2; Pfinalv0.z=Pfinalv0.z+geo.sVoxelZ/2-geo.dVoxelZ/2; - S.x =S.x+geo.sVoxelX/2-geo.dVoxelX/2; S.y =S.y+geo.sVoxelY/2-geo.dVoxelY/2; S.z =S.z +geo.sVoxelZ/2-geo.dVoxelZ/2; - - //4. 
Scale everything so dVoxel==1 - Pfinal.x =Pfinal.x/geo.dVoxelX; Pfinal.y =Pfinal.y/geo.dVoxelY; Pfinal.z =Pfinal.z/geo.dVoxelZ; - Pfinalu0.x=Pfinalu0.x/geo.dVoxelX; Pfinalu0.y=Pfinalu0.y/geo.dVoxelY; Pfinalu0.z=Pfinalu0.z/geo.dVoxelZ; - Pfinalv0.x=Pfinalv0.x/geo.dVoxelX; Pfinalv0.y=Pfinalv0.y/geo.dVoxelY; Pfinalv0.z=Pfinalv0.z/geo.dVoxelZ; - S.x =S.x/geo.dVoxelX; S.y =S.y/geo.dVoxelY; S.z =S.z/geo.dVoxelZ; - - - - //5. apply COR. Wherever everything was, now its offesetd by a bit - float CORx, CORy; - CORx=-geo.COR[i]*sin(geo.alpha)/geo.dVoxelX; - CORy= geo.COR[i]*cos(geo.alpha)/geo.dVoxelY; - Pfinal.x+=CORx; Pfinal.y+=CORy; - Pfinalu0.x+=CORx; Pfinalu0.y+=CORy; - Pfinalv0.x+=CORx; Pfinalv0.y+=CORy; - S.x+=CORx; S.y+=CORy; - - // return - - *uvorigin=Pfinal; - - deltaU->x=Pfinalu0.x-Pfinal.x; - deltaU->y=Pfinalu0.y-Pfinal.y; - deltaU->z=Pfinalu0.z-Pfinal.z; - - deltaV->x=Pfinalv0.x-Pfinal.x; - deltaV->y=Pfinalv0.y-Pfinal.y; - deltaV->z=Pfinalv0.z-Pfinal.z; - - *source=S; -} -void CreateTextureParallelInterp(float* image,Geometry geo,cudaArray** d_cuArrTex, cudaTextureObject_t *texImage,cudaStream_t* stream){ //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - - - const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); - - - cudaMemcpy3DParms copyParams = {0}; - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)image, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[0]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[1]); - - - //Array creation End - - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 
0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); - -} \ No newline at end of file diff --git a/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip b/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip deleted file mode 100644 index 1280b6ed..00000000 --- a/Common/CUDA/ray_interpolated_projection_parallel.hpp.prehip +++ /dev/null @@ -1,65 +0,0 @@ -/*------------------------------------------------------------------------- - * - * Header CUDA functions for texture-memory interpolation based projection - * - * - * CODE by Ander Biguri - * Sepideh Hatamikia (arbitrary rotation) ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - -#include "ray_interpolated_projection.hpp" - -#include "types_TIGRE.hpp" -#include "GpuIds.hpp" - -#ifndef PROJECTION_PARALLEL_HPP -#define PROJECTION_PARALLEL_HPP - -int interpolation_projection_parallel(float* img, Geometry geo, float** result,float const * const alphas,int nalpha, const GpuIds& gpuids); -// float computeMaxLength(Geometry geo, float alpha); -void computeDeltas_parallel(Geometry geo, float alpha,unsigned int i, Point3D* uvorigin, Point3D* deltaU, Point3D* deltaV, Point3D* source); - -// float maxDistanceCubeXY(Geometry geo, float alpha,int i); - -// below, not used -Geometry nomralizeGeometryImage(Geometry geo); -#endif \ No newline at end of file diff --git a/Common/CUDA/tv_proximal.cu.prehip b/Common/CUDA/tv_proximal.cu.prehip deleted file mode 100644 index 32ae99c2..00000000 --- a/Common/CUDA/tv_proximal.cu.prehip +++ /dev/null @@ -1,693 +0,0 @@ -/*------------------------------------------------------------------------- - * - * MATLAB MEX functions 
for TV image denoising. Check inputs and parses - * MATLAB data to C++ data. - * - * - * CODE by Imanol Luengo - * PhD student University of Nottingham - * imaluengo@gmail.com - * 2015 - * Modified by Ander Biguri for multi-GPU - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - - - -// http://gpu4vision.icg.tugraz.at/papers/2010/knoll.pdf#pub47 -#define MAXTREADS 1024 -#define MAX_BUFFER 60 -#define BLOCK_SIZE 10 // BLOCK_SIZE^3 must be smaller than MAXTREADS - -#include "tv_proximal.hpp" -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - cudaDeviceReset();\ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising",cudaGetErrorString(__err));\ - } \ -} while (0) -void cpy_from_host(float* device_array,float* host_array, - unsigned long long bytes_device,unsigned long long offset_device,unsigned long long offset_host, - unsigned long long pixels_per_slice, unsigned int buffer_length, - cudaStream_t stream, bool is_first_chunk, bool is_last_chunk,const long* image_size); - - - __global__ void multiplyArrayScalar(float* vec,float scalar,const size_t n) - { - unsigned long long i = (blockIdx.x * blockDim.x) + threadIdx.x; - for(; i= 0 ) { - _div += (pz[idx] - pz[(z-1)*size2d + y*cols + x]) / dz; - } else { - _div += pz[idx]; - } - - if ( y - 1 >= 0 ) { - _div += (py[idx] - py[z*size2d + (y-1)*cols + x]) / dy; - } else { - _div += py[idx]; - } - - if ( x - 1 >= 0 ) { - _div += (px[idx] - 
px[z*size2d + y*cols + (x-1)]) / dx; - } else { - _div += px[idx]; - } - - return _div; - } - - __device__ __inline__ - void gradient(const float* u, float* grad, - long z, long y, long x, - long depth, long rows, long cols, - float dz, float dy, float dx) - { - long size2d = rows*cols; - long idx = z * size2d + y * cols + x; - - float uidx = u[idx]; - - if ( z + 1 < depth ) { - grad[0] = (u[(z+1)*size2d + y*cols + x] - uidx) / dz; - } - - if ( y + 1 < rows ) { - grad[1] = (u[z*size2d + (y+1)*cols + x] - uidx) / dy; - } - - if ( x + 1 < cols ) { - grad[2] = (u[z*size2d + y*cols + (x+1)] - uidx) / dx; - } - } - - - __global__ - void update_u(const float* f, const float* pz, const float* py, const float* px, float* u, - float tau, float lambda, - long depth, long rows, long cols, - float dz, float dy, float dx) - { - long x = threadIdx.x + blockIdx.x * blockDim.x; - long y = threadIdx.y + blockIdx.y * blockDim.y; - long z = threadIdx.z + blockIdx.z * blockDim.z; - long idx = z * rows * cols + y * cols + x; - - if ( x >= cols || y >= rows || z >= depth ) - return; - - float _div = divergence(pz, py, px, z, y, x, depth, rows, cols, dz, dy, dx); - - u[idx] = u[idx] * (1.0f - tau) + tau * (f[idx] + (1.0f/lambda) * _div); - } - - - __global__ - void update_p(const float* u, float* pz, float* py, float* px, - float tau, long depth, long rows, long cols, - float dz, float dy, float dx) - { - long x = threadIdx.x + blockIdx.x * blockDim.x; - long y = threadIdx.y + blockIdx.y * blockDim.y; - long z = threadIdx.z + blockIdx.z * blockDim.z; - long idx = z * rows * cols + y * cols + x; - - if ( x >= cols || y >= rows || z >= depth ) - return; - - float grad[3] = {0,0,0}, q[3]; - gradient(u, grad, z, y, x, depth, rows, cols, dz, dy, dx); - - q[0] = pz[idx] + tau * grad[0]; - q[1] = py[idx] + tau * grad[1]; - q[2] = px[idx] + tau * grad[2]; - - float norm = fmaxf(1.0f, sqrtf(q[0] * q[0] + q[1] * q[1] + q[2] * q[2])); - - pz[idx] = q[0] / norm; - py[idx] = q[1] / norm; - px[idx] = 
q[2] / norm; - } - - -// Main function - void tvdenoising(float* src, float* dst, float lambda, - const float* spacing, const long* image_size, int maxIter, const GpuIds& gpuids) { - - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("tvDenoise:tvdenoising:GPUselect","There are no available device(s) that support CUDA\n"); - } - // - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("tvDenoise:tvdenoising:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); - } - int dev; - - // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. - - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - - - // %5 of free memory should be enough, we have almost no variables in these kernels - size_t total_pixels = image_size[0] * image_size[1] * image_size[2] ; - const size_t pixels_per_slice = image_size[0] * image_size[1] ; - const size_t mem_slice_image = sizeof(float)* pixels_per_slice ; - const size_t mem_size_image = sizeof(float)* total_pixels; - - // Decide how are we handling the distribution of computation - size_t mem_img_each_GPU; - - unsigned int buffer_length=1; - //Does everything fit in the GPU? - unsigned int slices_per_split; - unsigned int splits=1; // if the number does not fit in an uint, you have more serious trouble than this. 
- if(mem_GPU_global> 5*mem_size_image+5*mem_slice_image*buffer_length*2){ - // We only need to split if we have extra GPUs - slices_per_split=(image_size[2]+deviceCount-1)/deviceCount; - mem_img_each_GPU=mem_slice_image*( (image_size[2]+deviceCount-1)/deviceCount + buffer_length*2); - }else{ - // As mem_auxiliary is not expected to be a large value (for a 2000^3 image is around 28Mbytes), lets for now assume we need it all - size_t mem_free=mem_GPU_global; - - splits=(unsigned int)(ceil(((float)(5*mem_size_image)/(float)(deviceCount))/mem_free)); - // Now, there is an overhead here, as each splits should have 2 slices more, to accoutn for overlap of images. - // lets make sure these 2 slices fit, if they do not, add 1 to splits. - slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - - // if the new stuff does not fit in the GPU, it measn we are in the edge case where adding that extra slice will overflow memory - if (mem_GPU_global< 5*mem_img_each_GPU){ - // one more split should do the job, as its an edge case. - splits++; - //recompute for later - slices_per_split=(image_size[2]+deviceCount*splits-1)/(deviceCount*splits); // amount of slices that fit on a GPU. Later we add 2 to these, as we need them for overlap - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - } - - // How many EXTRA buffer slices should be able to fit in here??!?! - mem_free=mem_GPU_global-(5*mem_img_each_GPU); - unsigned int extra_buff=(mem_free/mem_slice_image); - buffer_length=(extra_buff/2)/5; // we need double whatever this results in, rounded down. - - buffer_length=min(MAX_BUFFER,buffer_length); - - mem_img_each_GPU=(mem_slice_image*(slices_per_split+buffer_length*2)); - - // Assert - if (mem_GPU_global< 5*mem_img_each_GPU){ - mexErrMsgIdAndTxt("tvDenoise:tvdenoising:GPU","Bad assert. Logic behind splitting flawed! 
Please tell: ander.biguri@gmail.com\n"); - } - } - - - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - if (isHostRegisterSupported & splits>1){ - cudaHostRegister(src ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - cudaHostRegister(dst ,image_size[2]*image_size[1]*image_size[0]*sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); - - - - // Lets allocate auxiliary variables. - float* buffer_u, *buffer_px, *buffer_py, *buffer_pz; - float* h_px, *h_py, *h_pz, *h_u; - if(splits>1){ - - //These take A LOT of memory and A LOT of time to use. If we can avoid using them, better. - if (buffer_length1 & i>0){ - - for (dev = 0; dev < deviceCount; dev++){ - is_last_chunk=!((sp*deviceCount+dev)>>(d_pz[dev], -1, pixels_per_slice*buffer_length); - } - if (is_last_chunk){ - multiplyArrayScalar<<<60,MAXTREADS,0,stream[dev*nStream_device+4]>>>(d_pz[dev]+bytes_device[dev],-1, pixels_per_slice*buffer_length); - } - } - for (dev = 0; dev < deviceCount; dev++){ - is_last_chunk=!((sp*deviceCount+dev)>>(d_src[dev], d_pz[dev], d_py[dev], d_px[dev], d_u[dev], tau1, lambda, - (long)(curr_slices+buffer_length*2), image_size[1],image_size[0], - spacing[2], spacing[1], spacing[0]); - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - curr_slices=((sp*deviceCount+dev+1)*slices_per_split>>(d_u[dev], d_pz[dev], d_py[dev], d_px[dev], tau2, - (long)(curr_slices+buffer_length*2), image_size[1], image_size[0], - spacing[2], spacing[1], spacing[0]); - } - }// END internal iter - - // Synchronize mathematics, make sure bounding pixels are correct - for(dev=0; dev0){ - // U - cudaSetDevice(gpuids[dev-1]); - cudaMemcpyAsync(buffer_u, d_u[dev-1] 
+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(buffer_px, d_px[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(buffer_py, d_py[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(buffer_pz, d_pz[dev-1]+slices_per_split*pixels_per_slice+buffer_pixels, buffer_pixels*sizeof(float), cudaMemcpyDeviceToHost,stream[(dev-1)*nStream_device+4]); - - - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+1]); - cudaMemcpyAsync(d_u[dev] ,buffer_u , buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+1]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+2]); - cudaMemcpyAsync(d_px[dev],buffer_px, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+2]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+3]); - cudaMemcpyAsync(d_py[dev],buffer_py, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+3]); - cudaStreamSynchronize(stream[(dev-1)*nStream_device+4]); - cudaMemcpyAsync(d_pz[dev],buffer_pz, buffer_pixels*sizeof(float), cudaMemcpyHostToDevice,stream[(dev)*nStream_device+4]); - - - } - } - // This is the case when we can't solely use GPU memory, as the total size of the images+variables exceeds total amounf of memory among GPUs. - // This situation requires partial results and full memory allocation in the host. - }else{ - // Vopy all the U variable into the host. - for(dev=0; dev1 && buffer_length1){ - cudaHostUnregister(src); - cudaHostUnregister(dst); - } - for(dev=0; dev Origin is at (0,0,0). 
Image center is there +offOrig - // -> at angle 0, source + image centre (without the offset) + detector centre (without offset) - // are aligned in the Y_Z plane. - // -> detector is orthonormal to projection plane. - - //Parameters part of the image geometry - int nVoxelX, nVoxelY, nVoxelZ; - float sVoxelX, sVoxelY, sVoxelZ; - float dVoxelX, dVoxelY, dVoxelZ; - float *offOrigX,*offOrigY,*offOrigZ; - float* DSO; - // Parameters of the Detector. - int nDetecU, nDetecV; - float sDetecU, sDetecV; - float dDetecU, dDetecV; - float *offDetecU, *offDetecV; - float* DSD; - float* dRoll; - float* dPitch; - float* dYaw; - // The base unit we are working with in mm. - float unitX; - float unitY; - float unitZ; - - //rotation angle for e uler (ZYZ) - float alpha; - float theta; - float psi; - // Centre of Rotation correction. - float* COR; - //Maximum length of cube - float maxLength; - //User option - float accuracy; -}; - - struct Point3D{ - float x; - float y; - float z; -}; - -struct Point3Ddouble{ - double x; - double y; - double z; - - // cast to float member function for "copying" Point3Ddouble to Point3D - Point3D to_float() - { - Point3D castToFloat; - castToFloat.x = (float)x; - castToFloat.y = (float)y; - castToFloat.z = (float)z; - return(castToFloat); - } -}; - -#endif \ No newline at end of file diff --git a/Common/CUDA/voxel_backprojection.cu.prehip b/Common/CUDA/voxel_backprojection.cu.prehip deleted file mode 100644 index bec4d909..00000000 --- a/Common/CUDA/voxel_backprojection.cu.prehip +++ /dev/null @@ -1,920 +0,0 @@ -/*------------------------------------------------------------------------- - * - * CUDA function for backrpojection using FDK weigts for CBCT - * - * - * CODE by Ander Biguri - * Optimized and modified by RB - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization 
for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - -#define PI_2 1.57079632679489661923 -#include -#include -#include -#include "voxel_backprojection.hpp" -#include "TIGRE_common.hpp" -#include -#include "GpuIds.hpp" - -// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ - } \ -} while (0) - - -#define MAXTREADS 1024 - /*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * *--->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ - - void CreateTexture(const GpuIds& gpuids,float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, int nStreamDevice,bool allocate); - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// The optimal values of two constants obtained by RB on NVIDIA Quadro K2200 (4 GB RAM, 640 CUDA cores) for 512^3 volume and 512^3 projections (512 proj, each 512 x 512) were: -// PROJ_PER_KERNEL = 32 or 16 (very similar times) -// VOXELS_PER_THREAD = 8 -// Speedup of 
the entire FDK backprojection (not only kernel run, also memcpy etc.) was nearly 4x relative to the original (single projection, single voxel per thread) code. -// (e.g. 16.2 s vs. ~62 s). - -const int PROJ_PER_KERNEL = 32; // Number of 2D projections to be analyzed by a single thread. This can be tweaked to see what works best. 32 was the optimal value in the paper by Zinsser and Keck. -const int VOXELS_PER_THREAD = 8; // Number of voxels to be computed by s single thread. Can be tweaked to see what works best. 4 was the optimal value in the paper by Zinsser and Keck. - -// We have PROJ_PER_KERNEL projections and we need 6 parameters for each projection: -// deltaX, deltaY, deltaZ, xyzOrigin, offOrig, offDetec -// So we need to keep PROJ_PER_KERNEL*6 values in our deltas array FOR EACH CALL to our main kernel -// (they will be updated in the main loop before each kernel call). - -__constant__ Point3D projParamsArrayDev[6*PROJ_PER_KERNEL]; // Dev means it is on device - -// We also need a corresponding array on the host side to be filled before each kernel call, then copied to the device (array in constant memory above) -// Point3D projParamsArrayHost[6*PROJ_PER_KERNEL]; // Host means it is host memory - -// Now we also need to store sinAlpha and cosAlpha for each projection (two floats per projection) -__constant__ float projSinCosArrayDev[5*PROJ_PER_KERNEL]; - - - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// END RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - -//______________________________________________________________________________ -// -// Function: kernelPixelBackprojectionFDK -// -// Description: Main FDK backprojection kernel 
-//______________________________________________________________________________ - -__global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections, cudaTextureObject_t tex) -{ - - // Old kernel call signature: - // kernelPixelBackprojectionFDK<<>>(geo,dimage,i,deltaX,deltaY,deltaZ,xyzOrigin,offOrig,offDetec,sinalpha,cosalpha); - // We just read in most of the params from the constant memory instead of getting them from the param list. - // This is because we now have MANY params, since single kernel processes more than one projection! - /* __global__ void kernelPixelBackprojectionFDK(const Geometry geo, - * float* image, - * const int indAlpha, - * const Point3D deltaX , - * const Point3D deltaY, - * const Point3D deltaZ, - * const Point3D xyzOrigin, - * const Point3D xyzOffset, - * const Point3D uv0Offset, - * const float sinalpha, - * const float cosalpha){ - */ - unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long indX = blockIdx.x * blockDim.x + threadIdx.x; - // unsigned long startIndZ = blockIdx.z * blockDim.z + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle - unsigned long long startIndZ = blockIdx.z * VOXELS_PER_THREAD + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle - //Make sure we don't go out of bounds - if (indX>=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) - return; - - // We'll keep a local auxiliary array of values of a column of voxels that this thread will update - float voxelColumn[VOXELS_PER_THREAD]; - - // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then - // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes - - unsigned long colIdx; -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // 
break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. - } // END copy 3D volume voxels to local array - - // Now iterate through projections -#pragma unroll - for(unsigned long projNumber=0; projNumber=totalNoOfProjections) - break; - - Point3D deltaX = projParamsArrayDev[6*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaY = projParamsArrayDev[6*projNumber+1]; - Point3D deltaZ = projParamsArrayDev[6*projNumber+2]; - Point3D xyzOrigin = projParamsArrayDev[6*projNumber+3]; - Point3D xyzOffset = projParamsArrayDev[6*projNumber+4]; - Point3D S = projParamsArrayDev[6*projNumber+5]; - - float sinalpha = projSinCosArrayDev[5*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection - float cosalpha = projSinCosArrayDev[5*projNumber+1]; - float COR = projSinCosArrayDev[5*projNumber+2]; - float DSD = projSinCosArrayDev[5*projNumber+3]; - float DSO = projSinCosArrayDev[5*projNumber+4]; - - float auxCOR=COR/geo.dDetecU; - // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. 
- Point3D P; - P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); - P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-auxCOR; - P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); - - // This is the vector defining the line from the source to the Voxel - float vectX,vectY,vectZ; - vectX=(P.x -S.x); - vectY=(P.y -S.y); - vectZ=(P.z -S.z); - - // Get the coordinates in the detector UV where the mid point of the voxel is projected. - float t=__fdividef(DSO-DSD-S.x,vectX); - float y,z; - y=vectY*t+S.y; - z=vectZ*t+S.z; - float u,v; - u=y+(float)geo.nDetecU*0.5f; - v=z+(float)geo.nDetecV*0.5f; - - float weight; - float realx,realy; - realx=-(geo.sVoxelX-geo.dVoxelX)*0.5f +indX*geo.dVoxelX +xyzOffset.x; - realy=-(geo.sVoxelY-geo.dVoxelY)*0.5f +indY*geo.dVoxelY +xyzOffset.y+COR; - - weight=__fdividef(DSO+realy*sinalpha-realx*cosalpha,DSO); - - weight=__frcp_rd(weight*weight); - - // Get Value in the computed (U,V) and multiply by the corresponding weight. - // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) - -#if IS_FOR_MATLAB_TIGRE - voxelColumn[colIdx]+=tex3D(tex, v, u ,indAlpha+0.5f)*weight; -#else - voxelColumn[colIdx]+=tex3D(tex, u, v ,indAlpha+0.5f)*weight; -#endif - } // END iterating through column of voxels - - } // END iterating through multiple projections - - // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. 
- // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. - // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is - // better for avoiding memory congestion. - } // END copy updated voxels from local array to our 3D volume - -} // END kernelPixelBackprojectionFDK - - - - -//______________________________________________________________________________ -// -// Function: voxel_backprojection -// -// Description: Main host function for FDK backprojection (invokes the kernel) -//______________________________________________________________________________ - -int voxel_backprojection(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) -{ - // printf("voxel_backprojection(geo.nDetector = %d, %d)\n", geo.nDetecU, geo.nDetecV); - // printf("geo.nVoxel = %d, %d, %d\n", geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ); - - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","There are no available device(s) that support CUDA\n"); - } - - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. 
If the code errors you might need to change the way GPU selection is performed."); - } - - int dev; - // Split the CT problem - unsigned int split_image; - unsigned int split_projections; - splitCTbackprojection(gpuids,geo,nalpha,&split_image,&split_projections); - - - cudaCheckErrors("Error"); - //Pagelock memory for synchronous copy. - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to - // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); - } - if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); - } -#endif - cudaCheckErrors("Error pinning memory"); - - - // Create the arrays for the geometry. The main difference is that geo.offZ has been tuned for the - // image slices. The rest of the Geometry is the same - Geometry* geoArray=(Geometry*)malloc(split_image*deviceCount*sizeof(Geometry)); - createGeoArray(split_image*deviceCount,geo,geoArray,nalpha); - - // Now lest allocate all the image memory on the GPU, so we can use it later. If we have made our numbers correctly - // in the previous section this should leave enough space for the textures. 
- size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); - float** dimage=(float**)malloc(deviceCount*sizeof(float*)); - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); - } - - //If it is the first time, lets make sure our image is zeroed. - int nStreamDevice=2; - int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); - - } - } - - - - - // Kernel auxiliary variables - Point3D* projParamsArrayHost; - cudaMallocHost((void**)&projParamsArrayHost,6*PROJ_PER_KERNEL*sizeof(Point3D)); - float* projSinCosArrayHost; - cudaMallocHost((void**)&projSinCosArrayHost,5*PROJ_PER_KERNEL*sizeof(float)); - - - // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); - - // Auxiliary Host page-locked memory for fast and asycnornous memcpy. - - // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop - // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution - // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, - // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. 
- unsigned long long proj_linear_idx_start; - unsigned int proj_split_overlap_number; - unsigned int current_proj_split_size,current_proj_overlap_split_size; - size_t num_bytes_img_curr; - size_t img_linear_idx_start; - float** partial_projection; - size_t* proj_split_size; - - - - for(unsigned int img_slice=0;img_slice=proj_split_size[proj_block_split]) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. - if(currProjNumber_global>=nalpha) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. - - Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, /*offDetec,*/source; - float sinalpha,cosalpha; - - geoArray[img_slice*deviceCount+dev].alpha=-alphas[currProjNumber_global*3];//we got 3 angles now. - geoArray[img_slice*deviceCount+dev].theta=-alphas[currProjNumber_global*3+1]; - geoArray[img_slice*deviceCount+dev].psi =-alphas[currProjNumber_global*3+2]; - -// mexPrintf("%u %f \n",i,geoArray[img_slice*deviceCount+dev].alpha); -// mexPrintf("%u \n",currProjNumber_global); - - sinalpha=sin(geoArray[img_slice*deviceCount+dev].alpha); - cosalpha=cos(geoArray[img_slice*deviceCount+dev].alpha); - - projSinCosArrayHost[5*j]=sinalpha; // 2*j because we have 2 float (sin or cos angle) values per projection - projSinCosArrayHost[5*j+1]=cosalpha; - projSinCosArrayHost[5*j+2]=geo.COR[currProjNumber_global]; - projSinCosArrayHost[5*j+3]=geo.DSD[currProjNumber_global]; - projSinCosArrayHost[5*j+4]=geo.DSO[currProjNumber_global]; - - computeDeltasCube(geoArray[img_slice*deviceCount+dev],currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); - - offOrig.x=geo.offOrigX[currProjNumber_global]; - offOrig.y=geo.offOrigY[currProjNumber_global]; - offOrig.z=geoArray[img_slice*deviceCount+dev].offOrigZ[currProjNumber_global]; - - projParamsArrayHost[6*j]=deltaX; // 6*j because we have 6 Point3D values per 
projection - projParamsArrayHost[6*j+1]=deltaY; - projParamsArrayHost[6*j+2]=deltaZ; - projParamsArrayHost[6*j+3]=xyzOrigin; - projParamsArrayHost[6*j+4]=offOrig; - projParamsArrayHost[6*j+5]=source; - } // END for (preparing params for kernel call) - - // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArrayDev, projSinCosArrayHost, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaMemcpyToSymbolAsync(projParamsArrayDev, projParamsArrayHost, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); - - kernelPixelBackprojectionFDK<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); - } // END for - ////////////////////////////////////////////////////////////////////////////////////// - // END RB code, Main reconstruction loop: go through projections (rotation angles) and backproject - ////////////////////////////////////////////////////////////////////////////////////// - }// END for deviceCount - } // END sub-split of current projection chunk - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - - } // END projection splits - - - // Now we need to take the image out of the GPU - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - // We do not need to sycnronize because the array dealocators already do. 
- num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); - img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); - } - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - cudaCheckErrors("Main loop fail"); - } - - } // end image splits - - ///////// Cleaning: - - - bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; - for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) - if (!two_buffers_used && i==1) - break; - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); - } - } - cudaCheckErrors("cudadestroy textures result fail"); - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); - } - cudaFreeHost(projSinCosArrayHost); - cudaFreeHost(projParamsArrayHost); - free(partial_projection); - free(proj_split_size); - - freeGeoArray(split_image*deviceCount,geoArray); -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & (split_image>1 |deviceCount>1)){ - cudaHostUnregister(result); - } - if (isHostRegisterSupported){ - cudaHostUnregister(projections); - } -#endif - - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); - - cudaCheckErrors("cudaFree fail"); - - //cudaDeviceReset(); // For the Nvidia Visual Profiler - return 0; - -} // END voxel_backprojection -// - -void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsigned int* split_image, unsigned int * split_projections){ - - - // We don't know if the 
devices are being used. lets check that. and only use the amount of memory we need. - - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - - const int deviceCount = gpuids.GetLength(); - - // Compute how much memory each of the relevant memory pieces need - size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); - size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); - - - - - // Does everything fit in the GPU? - - if(mem_image/deviceCount+mem_proj*PROJ_PER_KERNEL*2(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); - - } - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[dev]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); - } - - //Array creation End - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); - } -} - -//______________________________________________________________________________ -// -// Function: createGeoArray -// -// Description: This code 
generates the geometries needed to split the image properly in -// cases where the entire image does not fit in the memory of the GPU -//______________________________________________________________________________ - -void createGeoArray(unsigned int image_splits, Geometry geo,Geometry* geoArray, unsigned int nangles){ - - - unsigned int splitsize=(geo.nVoxelZ+image_splits-1)/image_splits; - - for(unsigned int sp=0;spx; - auxPoint.y=point->y; - auxPoint.z=point->z; - - // calculate sin and cos of 3 angles (used multiple times) - double sin_alpha, cos_alpha, sin_theta, cos_theta, sin_psi, cos_psi; - sin_alpha = sin((double)geo.alpha); - cos_alpha = cos((double)geo.alpha); - sin_theta = sin((double)geo.theta); - cos_theta = cos((double)geo.theta); - sin_psi = sin((double)geo.psi); - cos_psi = cos((double)geo.psi); - - point->x = auxPoint.x*(cos_psi*cos_theta*cos_alpha-sin_psi*sin_alpha) - +auxPoint.y*(-cos_psi*cos_theta*sin_alpha-sin_psi*cos_alpha) - +auxPoint.z*cos_psi*sin_theta; - point->y = auxPoint.x*(sin_psi*cos_theta*cos_alpha+cos_psi*sin_alpha) - +auxPoint.y*(-sin_psi*cos_theta*sin_alpha+cos_psi*cos_alpha) - +auxPoint.z*sin_psi*sin_theta; - point->z =-auxPoint.x*sin_theta*cos_alpha - +auxPoint.y*sin_theta*sin_alpha - +auxPoint.z*cos_theta; -} - -void rollPitchYawT(Geometry geo,int i, Point3Ddouble* point){ - - Point3Ddouble auxPoint; - auxPoint.x=point->x; - auxPoint.y=point->y; - auxPoint.z=point->z; - - // calculate sin and cos of 3 angles (used multiple times) - double sin_dRoll, cos_dRoll, sin_dPitch, cos_dPitch, sin_dYaw, cos_dYaw; - sin_dRoll = sin((double)geo.dRoll[i]); - cos_dRoll = cos((double)geo.dRoll[i]); - sin_dPitch = sin((double)geo.dPitch[i]); - cos_dPitch = cos((double)geo.dPitch[i]); - sin_dYaw = sin((double)geo.dYaw[i]); - cos_dYaw = cos((double)geo.dYaw[i]); - - point->x=cos_dRoll*cos_dPitch*auxPoint.x - +sin_dRoll*cos_dPitch*auxPoint.y - -sin_dPitch*auxPoint.z; - - point->y=(cos_dRoll*sin_dPitch*sin_dYaw - sin_dRoll*cos_dYaw)*auxPoint.x - 
+(sin_dRoll*sin_dPitch*sin_dYaw + cos_dRoll*cos_dYaw)*auxPoint.y - +cos_dPitch*sin_dYaw*auxPoint.z; - - point->z=(cos_dRoll*sin_dPitch*cos_dYaw + sin_dRoll*sin_dYaw)*auxPoint.x - +(sin_dRoll*sin_dPitch*cos_dYaw - cos_dRoll*sin_dYaw)*auxPoint.y - +cos_dPitch*cos_dYaw*auxPoint.z; -} - -//______________________________________________________________________________ -// -// Function: computeDeltasCube -// -// Description: Computes relative increments for each projection (volume rotation). -// Increments get passed to the backprojection kernel. -//______________________________________________________________________________ - -void computeDeltasCube(Geometry geo,int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D* S) -{ - - // initialize points with double precision - Point3Ddouble P, Px,Py,Pz; - - // Get coords of Img(0,0,0) - P.x=-(geo.sVoxelX/2-geo.dVoxelX/2)+geo.offOrigX[i]; - P.y=-(geo.sVoxelY/2-geo.dVoxelY/2)+geo.offOrigY[i]; - P.z=-(geo.sVoxelZ/2-geo.dVoxelZ/2)+geo.offOrigZ[i]; - - // Get coords from next voxel in each direction - Px.x=P.x+geo.dVoxelX; Py.x=P.x; Pz.x=P.x; - Px.y=P.y; Py.y=P.y+geo.dVoxelY; Pz.y=P.y; - Px.z=P.z; Py.z=P.z; Pz.z=P.z+geo.dVoxelZ; - - // Rotate image around X axis (this is equivalent of rotating the source and detector) RZ RY RZ - eulerZYZT(geo,&P); - eulerZYZT(geo,&Px); - eulerZYZT(geo,&Py); - eulerZYZT(geo,&Pz); - - //detector offset - P.z =P.z-geo.offDetecV[i]; P.y =P.y-geo.offDetecU[i]; - Px.z =Px.z-geo.offDetecV[i]; Px.y =Px.y-geo.offDetecU[i]; - Py.z =Py.z-geo.offDetecV[i]; Py.y =Py.y-geo.offDetecU[i]; - Pz.z =Pz.z-geo.offDetecV[i]; Pz.y =Pz.y-geo.offDetecU[i]; - - //Detector Roll pitch Yaw - // - // first, we need to offset everything so (0,0,0) is the center of the detector - // Only X is required for that - P.x=P.x+(geo.DSD[i]-geo.DSO[i]); - Px.x=Px.x+(geo.DSD[i]-geo.DSO[i]); - Py.x=Py.x+(geo.DSD[i]-geo.DSO[i]); - Pz.x=Pz.x+(geo.DSD[i]-geo.DSO[i]); - rollPitchYawT(geo,i,&P); - 
rollPitchYawT(geo,i,&Px); - rollPitchYawT(geo,i,&Py); - rollPitchYawT(geo,i,&Pz); - - P.x=P.x-(geo.DSD[i]-geo.DSO[i]); - Px.x=Px.x-(geo.DSD[i]-geo.DSO[i]); - Py.x=Py.x-(geo.DSD[i]-geo.DSO[i]); - Pz.x=Pz.x-(geo.DSD[i]-geo.DSO[i]); - //Done for P, now source - Point3Ddouble source; - source.x=geo.DSD[i]; //already offseted for rotation - source.y=-geo.offDetecU[i]; - source.z=-geo.offDetecV[i]; - rollPitchYawT(geo,i,&source); - - source.x=source.x-(geo.DSD[i]-geo.DSO[i]);// source.y=source.y-auxOff.y; source.z=source.z-auxOff.z; - -// mexPrintf("%f,%f,%f\n",source.x,source.y,source.z); - // Scale coords so detector pixels are 1x1 - - P.z =P.z /geo.dDetecV; P.y =P.y/geo.dDetecU; - Px.z=Px.z/geo.dDetecV; Px.y=Px.y/geo.dDetecU; - Py.z=Py.z/geo.dDetecV; Py.y=Py.y/geo.dDetecU; - Pz.z=Pz.z/geo.dDetecV; Pz.y=Pz.y/geo.dDetecU; - - source.z=source.z/geo.dDetecV; source.y=source.y/geo.dDetecU; - - // get deltas of the changes in voxels - deltaX->x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; - deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; - deltaZ->x=Pz.x-P.x; deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; - - // cast the results from the double precision calculations back to float - *xyzorigin=P.to_float(); - *S=source.to_float(); -} - -void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ - size_t memfree; - size_t memtotal; - const int deviceCount = gpuids.GetLength(); - - for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); - if(dev==0) *mem_GPU_global=memfree; - if(memfree -#include -#include -#include "voxel_backprojection2.hpp" -#include "TIGRE_common.hpp" -#include -#include "GpuIds.hpp" - -// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ - } \ 
-} while (0) - - -#define MAXTREADS 1024 - /*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * *--->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ - -// this definitionmust go here. -void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate); - -__global__ void matrixConstantMultiply(const Geometry geo,float* image,float constant){ - size_t idx = threadIdx.x + blockIdx.x * blockDim.x; - for(; idx=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) - return; - - // We'll keep a local auxiliary array of values of a column of voxels that this thread will update - float voxelColumn[VOXELS_PER_THREAD]; - - // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then - // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes - - unsigned long colIdx; -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. 
- } // END copy 3D volume voxels to local array - - // Now iterate through projections -#pragma unroll - for(unsigned long projNumber=0; projNumber=totalNoOfProjections) - break; - - Point3D deltaX = projParamsArray2Dev[7*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaY = projParamsArray2Dev[7*projNumber+1]; - Point3D deltaZ = projParamsArray2Dev[7*projNumber+2]; - Point3D xyzOrigin = projParamsArray2Dev[7*projNumber+3]; - Point3D xyzOffset = projParamsArray2Dev[7*projNumber+4]; - Point3D uv0Offset = projParamsArray2Dev[7*projNumber+5]; - Point3D S = projParamsArray2Dev[7*projNumber+6]; - - float sinalpha = projSinCosArray2Dev[5*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection - float cosalpha = projSinCosArray2Dev[5*projNumber+1]; - float COR = projSinCosArray2Dev[5*projNumber+2]; - float DSD = projSinCosArray2Dev[5*projNumber+3]; - float DSO = projSinCosArray2Dev[5*projNumber+4]; - // Precomputations for the weights: - //Real coords of Source - // We already have S.x (geo.DSO), and S.y and S.z are always zero. we just need to rotate - Point3D realS; - realS.x= DSO*cosalpha; - realS.y=-DSO*sinalpha; - realS.z=0; - - - Point3D realvoxel_init; - realvoxel_init.x=-geo.sVoxelX/2+geo.dVoxelX/2+xyzOffset.x; - realvoxel_init.y=-geo.sVoxelY/2+geo.dVoxelY/2+xyzOffset.y; - realvoxel_init.z=-geo.sVoxelZ/2+geo.dVoxelZ/2+xyzOffset.z; - // Real XYZ coordinates of Detector. - Point3D realD, realDaux; - // We know the index of the detector (u,v). Start from there. - realDaux.x=-(DSD-DSO); - - // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. 
- Point3D P; - P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); - P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-COR/geo.dDetecU; - P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); - - // This is the vector defining the line from the source to the Voxel - float vectX,vectY,vectZ; - vectX=(P.x -S.x); - vectY=(P.y -S.y); - vectZ=(P.z -S.z); - - // Get the coordinates in the detector UV where the mid point of the voxel is projected. - float t=__fdividef(DSO-DSD-S.x,vectX); - float y,z; - y=vectY*t+S.y; - z=vectZ*t+S.z; - float u,v; - u=y+(float)geo.nDetecU*0.5f; - v=z+(float)geo.nDetecV*0.5f; -#if IS_FOR_MATLAB_TIGRE - float sample=tex3D(tex, v, u ,indAlpha+0.5f); -#else - float sample=tex3D(tex, u, v ,indAlpha+0.5f); -#endif - float weight=0; - // - // - // - // IMPORTANT: The weights are almost 50% of the computational time. Is there a way of speeding this up?? - // - //Real coordinates of Voxel. Instead of reverting the transformation, its less math (faster) to compute it from the indexes. - Point3D realvoxel; - - realvoxel.x=realvoxel_init.x+indX*geo.dVoxelX; - realvoxel.y=realvoxel_init.y+indY*geo.dVoxelY; - realvoxel.z=realvoxel_init.z+indZ*geo.dVoxelZ; - - - - realDaux.y=(-geo.sDetecU+geo.dDetecU)*0.5f + u*geo.dDetecU +uv0Offset.x; - realD.z =(-geo.sDetecV+geo.dDetecV)*0.5f + v*geo.dDetecV +uv0Offset.y; - //rotate the detector - realD.x= realDaux.x*cosalpha + realDaux.y*sinalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) - realD.y=-realDaux.x*sinalpha + realDaux.y*cosalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) - float L,lsq; - - L = __fsqrt_rd( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. - lsq = (realS.x-realvoxel.x)*(realS.x-realvoxel.x) - + (realS.y-realvoxel.y)*(realS.y-realvoxel.y) - + (realS.z-realvoxel.z)*(realS.z-realvoxel.z); - - weight=__fdividef(L*L*L,(DSD*lsq)); -// weight=1; - // Get Value in the computed (U,V) and multiply by the corresponding weight. 
- // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) - voxelColumn[colIdx]+=sample* weight; - } // END iterating through column of voxels - - } // END iterating through multiple projections - - // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) -#pragma unroll - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. - // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. - // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is - // better for avoiding memory congestion. 
- } // END copy updated voxels from local array to our 3D volume - -} // END kernelPixelBackprojectionFDK - - - - -//______________________________________________________________________________ -// -// Function: voxel_backprojection -// -// Description: Main host function for FDK backprojection (invokes the kernel) -//______________________________________________________________________________ - -int voxel_backprojection2(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids){ - - - - - // Prepare for MultiGPU - int deviceCount = gpuids.GetLength(); - cudaCheckErrors("Device query fail"); - if (deviceCount == 0) { - mexErrMsgIdAndTxt("Atb:Voxel_backprojection:GPUselect","There are no available device(s) that support CUDA\n"); - } - - - // CODE assumes - // 1.-All available devices are usable by this code - // 2.-All available devices are equal, they are the same machine (warning thrown) - // Check the available devices, and if they are the same - if (!gpuids.AreEqualDevices()) { - mexWarnMsgIdAndTxt("Atb:Voxel_backprojection2:GPUselect","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed."); - } - - int dev; - - - // Split the CT problem - unsigned int split_image; - unsigned int split_projections; - splitCTbackprojection(gpuids,geo,nalpha,&split_image,&split_projections); - - - // Create the arrays for the geometry. The main difference is that geo.offZ has been tuned for the - // image slices. The rest of the Geometry is the same - Geometry* geoArray=(Geometry*)malloc(split_image*deviceCount*sizeof(Geometry)); - createGeoArray(split_image*deviceCount,geo,geoArray,nalpha); - - // Now lest allocate all the image memory on the GPU, so we can use it later. 
If we have made our numbers correctly - // in the previous section this should leave enough space for the textures. - size_t num_bytes_img = (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ* sizeof(float); - float** dimage=(float**)malloc(deviceCount*sizeof(float*)); - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMalloc((void**)&dimage[dev], num_bytes_img); - cudaCheckErrors("cudaMalloc fail"); - } - - - //Pagelock memory for synchronous copy. - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. - int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - // empirical testing shows that when the image split is smaller than 1 (also implies the image is not very big), the time to - // pin the memory is greater than the lost time in Synchronously launching the memcpys. This is only worth it when the image is too big. - if (isHostRegisterSupported & split_image>1){ - cudaHostRegister(result, (size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geo.nVoxelZ*(size_t)sizeof(float),cudaHostRegisterPortable); - } - if (isHostRegisterSupported ){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); - - - - - - //If it is the first time, lets make sure our image is zeroed. 
- int nStreamDevice=2; - int nStreams=deviceCount*nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i+dev*nStreamDevice]); - - } - } - - // Kernel auxiliary variables - Point3D* projParamsArray2Host; - cudaMallocHost((void**)&projParamsArray2Host,7*PROJ_PER_KERNEL*sizeof(Point3D)); - float* projSinCosArray2Host; - cudaMallocHost((void**)&projSinCosArray2Host,5*PROJ_PER_KERNEL*sizeof(float)); - - // Texture object variables - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(deviceCount*2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(deviceCount*2*sizeof(cudaArray*)); - - - - unsigned int proj_split_overlap_number; - // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop - // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution - // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, - // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. - unsigned long long proj_linear_idx_start; - unsigned int current_proj_split_size,current_proj_overlap_split_size; - size_t num_bytes_img_curr; - size_t img_linear_idx_start; - float** partial_projection; - size_t* proj_split_size; - - for(unsigned int img_slice=0;img_slice=proj_split_size[proj_block_split]) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. - if(currProjNumber_global>=nalpha) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. 
- - Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, offDetec,source; - float sinalpha,cosalpha; - - geoArray[img_slice*deviceCount+dev].alpha=-alphas[currProjNumber_global*3];//we got 3 angles now. - geoArray[img_slice*deviceCount+dev].theta=-alphas[currProjNumber_global*3+1]; - geoArray[img_slice*deviceCount+dev].psi =-alphas[currProjNumber_global*3+2]; - - sinalpha=sin(geoArray[img_slice*deviceCount+dev].alpha); - cosalpha=cos(geoArray[img_slice*deviceCount+dev].alpha); - - projSinCosArray2Host[5*j]=sinalpha; // 2*j because we have 2 float (sin or cos angle) values per projection - projSinCosArray2Host[5*j+1]=cosalpha; - projSinCosArray2Host[5*j+2]=geo.COR[currProjNumber_global]; - projSinCosArray2Host[5*j+3]=geo.DSD[currProjNumber_global]; - projSinCosArray2Host[5*j+4]=geo.DSO[currProjNumber_global]; - - computeDeltasCube(geoArray[img_slice*deviceCount+dev],currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); - - offOrig.x=geo.offOrigX[currProjNumber_global]; - offOrig.y=geo.offOrigY[currProjNumber_global]; - offOrig.z=geoArray[img_slice*deviceCount+dev].offOrigZ[currProjNumber_global]; - - offDetec.x=geo.offDetecU[currProjNumber_global]; - offDetec.y=geo.offDetecV[currProjNumber_global]; - offDetec.z=0;//unused - - projParamsArray2Host[7*j] =deltaX; // 7*j because we have 7 Point3D values per projection - projParamsArray2Host[7*j+1]=deltaY; - projParamsArray2Host[7*j+2]=deltaZ; - projParamsArray2Host[7*j+3]=xyzOrigin; - projParamsArray2Host[7*j+4]=offOrig; - projParamsArray2Host[7*j+5]=offDetec; - projParamsArray2Host[7*j+6]=source; - - } // END for (preparing params for kernel call) - - // Copy the prepared parameter arrays to constant memory to make it available for the kernel - cudaMemcpyToSymbolAsync(projSinCosArray2Dev, projSinCosArray2Host, sizeof(float)*5*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaMemcpyToSymbolAsync(projParamsArray2Dev, projParamsArray2Host, 
sizeof(Point3D)*7*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[dev*nStreamDevice]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); - kernelPixelBackprojection<<>>(geoArray[img_slice*deviceCount+dev],dimage[dev],i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)*deviceCount+dev]); - - } // END for - ////////////////////////////////////////////////////////////////////////////////////// - // END RB code, Main reconstruction loop: go through projections (rotation angles) and backproject - ////////////////////////////////////////////////////////////////////////////////////// - } - } // END sub-split of current projection chunk - - } // END projection splits - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - matrixConstantMultiply<<<60,MAXTREADS,0,stream[dev*nStreamDevice]>>>( geoArray[img_slice*deviceCount+dev],dimage[dev],geo.dVoxelX*geo.dVoxelY*geo.dVoxelZ/(geo.dDetecU*geo.dDetecV)); - } - - // Now we need to take the image out of the GPU - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaStreamSynchronize(stream[dev*nStreamDevice]); - - num_bytes_img_curr=(size_t)geoArray[img_slice*deviceCount+dev].nVoxelX*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelY*(size_t)geoArray[img_slice*deviceCount+dev].nVoxelZ*sizeof(float); - img_linear_idx_start=(size_t)geo.nVoxelX*(size_t)geo.nVoxelY*(size_t)geoArray[0].nVoxelZ*(size_t)(img_slice*deviceCount+dev); - cudaMemcpyAsync(&result[img_linear_idx_start], dimage[dev], num_bytes_img_curr, cudaMemcpyDeviceToHost,stream[dev*nStreamDevice+1]); - } - } // end image splits - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaDeviceSynchronize(); - } - - - // Clean the GPU - bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; - for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) - if (!two_buffers_used && i==1) - break; for (dev = 0; dev < deviceCount; 
dev++){ - cudaSetDevice(gpuids[dev]); - cudaDestroyTextureObject(texProj[i*deviceCount+dev]); - cudaFreeArray(d_cuArrTex[i*deviceCount+dev]); - } - } - free(d_cuArrTex); - free(texProj); - - for (dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaFree(dimage[dev]); - } - free(dimage); - - cudaFreeHost(projSinCosArray2Host); - cudaFreeHost(projParamsArray2Host); - free(partial_projection); - free(proj_split_size); - - freeGeoArray(split_image*deviceCount,geoArray); -#ifndef NO_PINNED_MEMORY - if (isHostRegisterSupported & split_image>1){ - cudaHostUnregister(result); - } - if (isHostRegisterSupported){ - cudaHostUnregister(projections); - } -#endif - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); - - cudaCheckErrors("cudaFree fail"); - -// cudaDeviceReset(); // For the Nvidia Visual Profiler - return 0; - -} // END voxel_backprojection - - - - - -void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream,int nStreamDevice,bool allocate){ - //size_t size_image=geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ; - int num_devices = gpuids.GetLength(); -#if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); -#else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); -#endif - if (allocate){ - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - - //cudaArray Descriptor - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - cudaMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); - - } - } - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemcpy3DParms copyParams = {0}; - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[dev]; - 
copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[dev*nStreamDevice+1]); - } - - //Array creation End - for (unsigned int dev = 0; dev < num_devices; dev++){ - cudaSetDevice(gpuids[dev]); - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[dev]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[dev], &texRes, &texDescr, NULL); - } -} -#ifndef BACKPROJECTION_HPP -void splitCTbackprojection(const GpuIds& gpuids, Geometry geo,int nalpha, unsigned int* split_image, unsigned int * split_projections){ - - - // We don't know if the devices are being used. lets check that. and only use the amount of memory we need. - - size_t mem_GPU_global; - checkFreeMemory(gpuids, &mem_GPU_global); - const int deviceCount = gpuids.GetLength(); - - // Compute how much memory each of the relevant memory pieces need - size_t mem_image= (unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY*(unsigned long long)geo.nVoxelZ*sizeof(float); - size_t mem_proj= (unsigned long long)geo.nDetecU*(unsigned long long)geo.nDetecV*sizeof(float); - - - - - // Does everything fit in the GPU? 
- - if(mem_image/deviceCount+mem_proj*PROJ_PER_KERNEL*2x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; - deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; - deltaZ->x=Pz.x-P.x; deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; - - - *xyzorigin=P.to_float(); - *S=source.to_float(); -} // END computeDeltasCube - -void checkFreeMemory(const GpuIds& gpuids,size_t *mem_GPU_global){ - size_t memfree; - size_t memtotal; - const int gpuids.GetLength(); - - for (int dev = 0; dev < deviceCount; dev++){ - cudaSetDevice(gpuids[dev]); - cudaMemGetInfo(&memfree,&memtotal); - if(dev==0) *mem_GPU_global=memfree; - if(memfree -#include -#include -#include "voxel_backprojection.hpp" -#include "voxel_backprojection_parallel.hpp" - -#include "TIGRE_common.hpp" -#include - -// https://stackoverflow.com/questions/16282136/is-there-a-cuda-equivalent-of-perror -#define cudaCheckErrors(msg) \ -do { \ - cudaError_t __err = cudaGetLastError(); \ - if (__err != cudaSuccess) { \ - mexPrintf("%s \n",msg);\ - mexErrMsgIdAndTxt("CBCT:CUDA:Atb",cudaGetErrorString(__err));\ - } \ -} while (0) - - -#define MAXTREADS 1024 - /*GEOMETRY DEFINITION - * - * Detector plane, behind - * |-----------------------------| - * | | - * | | - * | | - * | | - * | +--------+ | - * | / /| | - * A Z | / / |*D | - * | | +--------+ | | - * | | | | | | - * | | | *O | + | - * *--->y | | | / | - * / | | |/ | - * V X | +--------+ | - * |-----------------------------| - * - * *S - * - * - * - * - * - **/ -void CreateTextureParallel( float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool allocate); - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// The optimal values of two constants obtained by RB on NVIDIA Quadro K2200 (4 GB RAM, 640 CUDA cores) for 512^3 volume and 512^3 projections (512 proj, each 512 x 512) were: -// PROJ_PER_KERNEL = 32 or 16 (very similar times) -// VOXELS_PER_THREAD = 8 -// Speedup of the entire FDK backprojection (not only kernel run, also memcpy etc.) was nearly 4x relative to the original (single projection, single voxel per thread) code. -// (e.g. 16.2 s vs. ~62 s). - -const int PROJ_PER_KERNEL = 32; // Number of 2D projections to be analyzed by a single thread. This can be tweaked to see what works best. 32 was the optimal value in the paper by Zinsser and Keck. -const int VOXELS_PER_THREAD = 8; // Number of voxels to be computed by s single thread. Can be tweaked to see what works best. 4 was the optimal value in the paper by Zinsser and Keck. - -// We have PROJ_PER_KERNEL projections and we need 6 parameters for each projection: -// deltaX, deltaY, deltaZ, xyzOrigin, offOrig, offDetec -// So we need to keep PROJ_PER_KERNEL*6 values in our deltas array FOR EACH CALL to our main kernel -// (they will be updated in the main loop before each kernel call). 
- -__constant__ Point3D projParamsArrayDevParallel[6*PROJ_PER_KERNEL]; // Dev means it is on device - -// We also need a corresponding array on the host side to be filled before each kernel call, then copied to the device (array in constant memory above) -// Point3D projParamsArrayHostParallel[6*PROJ_PER_KERNEL]; // Host means it is host memory - -// Now we also need to store sinAlpha and cosAlpha for each projection (two floats per projection) -__constant__ float projSinCosArrayDevParallel[3*PROJ_PER_KERNEL]; - -// float projSinCosArrayHostParallel[3*PROJ_PER_KERNEL]; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// END RB, 10/31/2016: Add constant memory arrays to store parameters for all projections to be analyzed during a single kernel call -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - - -//______________________________________________________________________________ -// -// Function: kernelPixelBackprojectionFDK -// -// Description: Main FDK backprojection kernel -//______________________________________________________________________________ - -__global__ void kernelPixelBackprojection_parallel(const Geometry geo, float* image,const int currProjSetNumber, const int totalNoOfProjections,cudaTextureObject_t tex) -{ - - // Old kernel call signature: - // kernelPixelBackprojectionFDK<<>>(geo,dimage,i,deltaX,deltaY,deltaZ,xyzOrigin,offOrig,offDetec,sinalpha,cosalpha); - // We just read in most of the params from the constant memory instead of getting them from the param list. - // This is because we now have MANY params, since single kernel processes more than one projection! 
- /* __global__ void kernelPixelBackprojectionFDK(const Geometry geo, - * float* image, - * const int indAlpha, - * const Point3D deltaX , - * const Point3D deltaY, - * const Point3D deltaZ, - * const Point3D xyzOrigin, - * const Point3D xyzOffset, - * const Point3D uv0Offset, - * const float sinalpha, - * const float cosalpha){ - */ - unsigned long long indY = blockIdx.y * blockDim.y + threadIdx.y; - unsigned long long indX = blockIdx.x * blockDim.x + threadIdx.x; - // unsigned long startIndZ = blockIdx.z * blockDim.z + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle - unsigned long long startIndZ = blockIdx.z * VOXELS_PER_THREAD + threadIdx.z; // This is only STARTING z index of the column of voxels that the thread will handle - //Make sure we don't go out of bounds - if (indX>=geo.nVoxelX || indY>=geo.nVoxelY || startIndZ>=geo.nVoxelZ) - return; - - // We'll keep a local auxiliary array of values of a column of voxels that this thread will update - float voxelColumn[VOXELS_PER_THREAD]; - - // First we need to copy the curent 3D volume values from the column to our auxiliary array so that we can then - // work on them (update them by computing values from multiple projections) locally - avoiding main memory reads/writes - - unsigned long colIdx; - - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - voxelColumn[colIdx] = image[idx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. 
- } // END copy 3D volume voxels to local array - - // Now iterate through projections - for(unsigned long projNumber=0; projNumber=totalNoOfProjections) - break; - - Point3D deltaX = projParamsArrayDevParallel[6*projNumber]; // 6*projNumber because we have 6 Point3D values per projection - Point3D deltaY = projParamsArrayDevParallel[6*projNumber+1]; - Point3D deltaZ = projParamsArrayDevParallel[6*projNumber+2]; - Point3D xyzOrigin = projParamsArrayDevParallel[6*projNumber+3]; - Point3D xyzOffset = projParamsArrayDevParallel[6*projNumber+4]; - Point3D S = projParamsArrayDevParallel[6*projNumber+5]; - - float DSD = projSinCosArrayDevParallel[3*projNumber]; // 2*projNumber because we have 2 float (sin or cos angle) values per projection - float DSO = projSinCosArrayDevParallel[3*projNumber+1]; - float COR = projSinCosArrayDevParallel[3*projNumber+2]; - - // Geometric trasnformations: - //Source, scaled XYZ coordinates - - // Now iterate through Z in our voxel column FOR A GIVEN PROJECTION - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - // "XYZ" in the scaled coordinate system of the current point. The image is rotated with the projection angles. - Point3D P; - S.x=DSO; - P.x=(xyzOrigin.x+indX*deltaX.x+indY*deltaY.x+indZ*deltaZ.x); - P.y=(xyzOrigin.y+indX*deltaX.y+indY*deltaY.y+indZ*deltaZ.y)-COR/geo.dDetecU; - P.z=(xyzOrigin.z+indX*deltaX.z+indY*deltaY.z+indZ*deltaZ.z); - S.y=P.y;S.z=P.z; - - // This is the vector defining the line from the source to the Voxel - float vectX,vectY,vectZ; - vectX=(P.x -S.x); - vectY=(P.y -S.y); - vectZ=(P.z -S.z); - - // Get the coordinates in the detector UV where the mid point of the voxel is projected. - float t=(DSO-DSD /*-DOD*/ - S.x)/vectX; - float y,z; - y=vectY*t+S.y; - z=vectZ*t+S.z; - float u,v; - u=y+geo.nDetecU/2.0f-0.5f; - v=z+geo.nDetecV/2.0f-0.5f; - - - - // Get Value in the computed (U,V) and multiply by the corresponding weight. 
- // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) -#if IS_FOR_MATLAB_TIGRE - voxelColumn[colIdx]+=tex3D(tex, v+0.5f, u+0.5f ,indAlpha+0.5f); -#else - voxelColumn[colIdx]+=tex3D(tex, u+0.5f, v+0.5f ,indAlpha+0.5f); -#endif - - } // END iterating through column of voxels - - } // END iterating through multiple projections - - // And finally copy the updated local voxelColumn array back to our 3D volume (main memory) - for(colIdx=0; colIdx=geo.nVoxelZ) - break; // break the loop. - - unsigned long long idx =indZ*(unsigned long long)geo.nVoxelX*(unsigned long long)geo.nVoxelY+indY*(unsigned long long)geo.nVoxelX + indX; - image[idx] = voxelColumn[colIdx]; // Read the current volume value that we'll update by computing values from MULTIPLE projections (not just one) - // We'll be updating the local (register) variable, avoiding reads/writes from the slow main memory. - // According to references (Papenhausen), doing = is better than +=, since += requires main memory read followed by a write. - // We did all the reads into the local array at the BEGINNING of this kernel. According to Papenhausen, this type of read-write split is - // better for avoiding memory congestion. 
- } // END copy updated voxels from local array to our 3D volume - -} // END kernelPixelBackprojectionFDK - - - - -//______________________________________________________________________________ -// -// Function: voxel_backprojection_parallel -// -// Description: Main host function for FDK backprojection (invokes the kernel) -//______________________________________________________________________________ - -int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas, int nalpha, const GpuIds& gpuids) -{ - if (gpuids.GetLength() == 0) { - cudaSetDevice(0); - } else { - cudaSetDevice(gpuids[0]); - } - - /* - * Allocate texture memory on the device - */ - // copy data to CUDA memory - //If it is the first time, lets make sure our image is zeroed. - int nStreamDevice=2; - int nStreams=nStreamDevice; - cudaStream_t* stream=(cudaStream_t*)malloc(nStreams*sizeof(cudaStream_t));; - - for (int i = 0; i < nStreamDevice; ++i){ - cudaStreamCreate(&stream[i]); - - - } - //Pagelock memory for synchronous copy. - // Lets try to make the host memory pinned: - // We laredy queried the GPU and assuemd they are the same, thus should have the same attributes. 
- int isHostRegisterSupported = 0; -#if CUDART_VERSION >= 9020 - cudaDeviceGetAttribute(&isHostRegisterSupported,cudaDevAttrHostRegisterSupported,gpuids[0]); -#endif - if (isHostRegisterSupported){ - cudaHostRegister(projections, (size_t)geo.nDetecU*(size_t)geo.nDetecV*(size_t)nalpha*(size_t)sizeof(float),cudaHostRegisterPortable); - } - cudaCheckErrors("Error pinning memory"); - - - // Allocate result image memory - size_t num_bytes = geo.nVoxelX*geo.nVoxelY*geo.nVoxelZ * sizeof(float); - float* dimage; - cudaMalloc((void**)&dimage, num_bytes); - cudaMemset(dimage,0,num_bytes); - cudaCheckErrors("cudaMalloc fail"); - - - Point3D* projParamsArrayHostParallel; - cudaMallocHost((void**)&projParamsArrayHostParallel,6*PROJ_PER_KERNEL*sizeof(Point3D)); - float* projSinCosArrayHostParallel; - cudaMallocHost((void**)&projSinCosArrayHostParallel,3*PROJ_PER_KERNEL*sizeof(float)); - - - // Texture buffer objects - cudaTextureObject_t *texProj; - cudaArray **d_cuArrTex; - texProj =(cudaTextureObject_t*)malloc(2*sizeof(cudaTextureObject_t)); - d_cuArrTex =(cudaArray**)malloc(2*sizeof(cudaArray*)); - - - - unsigned int proj_split_overlap_number; - unsigned int split_projections=1; - // Start with the main loop. The Projection data needs to be allocated and dealocated in the main loop - // as due to the nature of cudaArrays, we can not reuse them. This should not be a problem for the fast execution - // of the code, as repeated allocation and deallocation only happens when the projection data is very very big, - // and therefore allcoation time should be negligible, fluctuation of other computations should mask the time. - unsigned long long proj_linear_idx_start; - unsigned int current_proj_split_size,current_proj_overlap_split_size; - size_t num_bytes_img_curr; - size_t img_linear_idx_start; - - - current_proj_split_size=nalpha; - // We are going to split it in the same amount of kernels we need to execute. 
- proj_split_overlap_number=(current_proj_split_size+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL; - - - // Create pointer to pointers of projections and precompute their location and size. - - float ** partial_projection=(float**)malloc(current_proj_split_size*sizeof(float*)); - size_t * proj_split_size=(size_t*)malloc(current_proj_split_size*sizeof(size_t*)); - - for(unsigned int proj_block_split=0; proj_block_split=proj_split_size[proj_block_split]) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. - - if(currProjNumber_global>=nalpha) - break; // Exit the loop. Even when we leave the param arrays only partially filled, this is OK, since the kernel will check bounds anyway. - - Point3D deltaX,deltaY,deltaZ,xyzOrigin, offOrig, /*offDetec,*/source; - float sinalpha,cosalpha; - - geo.alpha=-alphas[currProjNumber_global*3]; - geo.theta=-alphas[currProjNumber_global*3+1]; - geo.psi =-alphas[currProjNumber_global*3+2]; - - //sinalpha=sin(geo.alpha); -// cosalpha=cos(geo.alpha); - - projSinCosArrayHostParallel[3*j]=geo.DSD[currProjNumber_global]; // 3*j because we have 3 float (sin or cos angle) values per projection - projSinCosArrayHostParallel[3*j+1]=geo.DSO[currProjNumber_global]; - projSinCosArrayHostParallel[3*j+2]=geo.COR[currProjNumber_global]; - - //computeDeltasCubeParallel(geo,geo.alpha,currProjNumber,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); - computeDeltasCubeParallel(geo,currProjNumber_global,&xyzOrigin,&deltaX,&deltaY,&deltaZ,&source); - - offOrig.x=geo.offOrigX[currProjNumber_global]; - offOrig.y=geo.offOrigY[currProjNumber_global]; - - - projParamsArrayHostParallel[6*j]=deltaX; // 6*j because we have 6 Point3D values per projection - projParamsArrayHostParallel[6*j+1]=deltaY; - projParamsArrayHostParallel[6*j+2]=deltaZ; - projParamsArrayHostParallel[6*j+3]=xyzOrigin; - projParamsArrayHostParallel[6*j+4]=offOrig; - projParamsArrayHostParallel[6*j+5]=source; - } // END 
for (preparing params for kernel call) - - // Copy the prepared parameter arrays to constant memory to make it available for the kernel - - cudaMemcpyToSymbolAsync(projSinCosArrayDevParallel, projSinCosArrayHostParallel, sizeof(float)*3*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaMemcpyToSymbolAsync(projParamsArrayDevParallel, projParamsArrayHostParallel, sizeof(Point3D)*6*PROJ_PER_KERNEL,0,cudaMemcpyHostToDevice,stream[0]); - cudaStreamSynchronize(stream[0]); - - kernelPixelBackprojection_parallel<<>>(geo,dimage,i,proj_split_size[proj_block_split],texProj[(proj_block_split%2)]); - } // END for - - ////////////////////////////////////////////////////////////////////////////////////// - // END Main reconstruction loop: go through projections (rotation angles) and backproject - ////////////////////////////////////////////////////////////////////////////////////// - } - cudaDeviceSynchronize(); - cudaMemcpy(result, dimage, num_bytes, cudaMemcpyDeviceToHost); - cudaCheckErrors("cudaMemcpy result fail"); - - free(partial_projection); - free(proj_split_size); - - bool two_buffers_used=((((nalpha+split_projections-1)/split_projections)+PROJ_PER_KERNEL-1)/PROJ_PER_KERNEL)>1; - for(unsigned int i=0; i<2;i++){ // 2 buffers (if needed, maybe only 1) - if (!two_buffers_used && i==1) - break; - cudaDestroyTextureObject(texProj[i]); - cudaFreeArray(d_cuArrTex[i]); - } - free(texProj); - - free(d_cuArrTex); - cudaFreeHost(projSinCosArrayHostParallel); - cudaFreeHost(projParamsArrayHostParallel); - - cudaFree(dimage); - if (isHostRegisterSupported){ - cudaHostUnregister(projections); - } - for (int i = 0; i < nStreams; ++i) - cudaStreamDestroy(stream[i]); - -// cudaDeviceReset(); - return 0; - -} // END voxel_backprojection - -void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D *S) -{ - - Point3Ddouble P, Px,Py,Pz; - // Get coords of Img(0,0,0) - 
P.x=-(geo.sVoxelX/2-geo.dVoxelX/2)+geo.offOrigX[i]; - P.y=-(geo.sVoxelY/2-geo.dVoxelY/2)+geo.offOrigY[i]; - P.z=-(geo.sVoxelZ/2-geo.dVoxelZ/2)+geo.offOrigZ[i]; - - // Get coors from next voxel in each direction - Px.x=P.x+geo.dVoxelX; Py.x=P.x; Pz.x=P.x; - Px.y=P.y; Py.y=P.y+geo.dVoxelY; Pz.y=P.y; - Px.z=P.z; Py.z=P.z; Pz.z=P.z+geo.dVoxelZ; - - - - // Rotate image around X axis (this is equivalent of rotating the source and detector) RZ RY RZ - eulerZYZT(geo,&P); - eulerZYZT(geo,&Px); - eulerZYZT(geo,&Py); - eulerZYZT(geo,&Pz); - - //detector offset - P.z =P.z-geo.offDetecV[i]; P.y =P.y-geo.offDetecU[i]; - Px.z =Px.z-geo.offDetecV[i]; Px.y =Px.y-geo.offDetecU[i]; - Py.z =Py.z-geo.offDetecV[i]; Py.y =Py.y-geo.offDetecU[i]; - Pz.z =Pz.z-geo.offDetecV[i]; Pz.y =Pz.y-geo.offDetecU[i]; - - //Detector Roll pitch Yaw - // - // - // first, we need to offset everything so (0,0,0) is the center of the detector - // Only X is required for that - P.x=P.x+(geo.DSD[i]-geo.DSO[i]); - Px.x=Px.x+(geo.DSD[i]-geo.DSO[i]); - Py.x=Py.x+(geo.DSD[i]-geo.DSO[i]); - Pz.x=Pz.x+(geo.DSD[i]-geo.DSO[i]); - - rollPitchYawT(geo,i,&P); - rollPitchYawT(geo,i,&Px); - rollPitchYawT(geo,i,&Py); - rollPitchYawT(geo,i,&Pz); - - P.x=P.x-(geo.DSD[i]-geo.DSO[i]); - Px.x=Px.x-(geo.DSD[i]-geo.DSO[i]); - Py.x=Py.x-(geo.DSD[i]-geo.DSO[i]); - Pz.x=Pz.x-(geo.DSD[i]-geo.DSO[i]); - - - Point3Ddouble source; - source.x=0; - source.y=-geo.offDetecU[i]; - source.z=-geo.offDetecV[i]; - - rollPitchYawT(geo,i,&source); - source.x=source.x-(geo.DSD[i]-geo.DSO[i]); - - P.z =P.z /geo.dDetecV; P.y =P.y/geo.dDetecU; - Px.z=Px.z/geo.dDetecV; Px.y=Px.y/geo.dDetecU; - Py.z=Py.z/geo.dDetecV; Py.y=Py.y/geo.dDetecU; - Pz.z=Pz.z/geo.dDetecV; Pz.y=Pz.y/geo.dDetecU; - - source.z=source.z/geo.dDetecV; source.y=source.y/geo.dDetecU; - - // get deltas of the changes in voxels - deltaX->x=Px.x-P.x; deltaX->y=Px.y-P.y; deltaX->z=Px.z-P.z; - deltaY->x=Py.x-P.x; deltaY->y=Py.y-P.y; deltaY->z=Py.z-P.z; - deltaZ->x=Pz.x-P.x; 
deltaZ->y=Pz.y-P.y; deltaZ->z=Pz.z-P.z; - - - // cast the results from the double precision calculations back to float - *xyzorigin=P.to_float(); - *S=source.to_float(); - - -} // END computeDeltasCube -void CreateTextureParallel(float* projectiondata,Geometry geo,cudaArray** d_cuArrTex,unsigned int nangles, cudaTextureObject_t *texImage,cudaStream_t* stream, bool alloc) -{ - //cudaArray Descriptor -#if IS_FOR_MATLAB_TIGRE - const cudaExtent extent =make_cudaExtent(geo.nDetecV, geo.nDetecU, nangles); -#else - const cudaExtent extent =make_cudaExtent(geo.nDetecU, geo.nDetecV, nangles); -#endif - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); - //cuda Array - if (alloc){ - cudaMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); - cudaCheckErrors("Texture memory allocation fail"); - } - cudaMemcpy3DParms copyParams = {0}; - - - //Array creation - copyParams.srcPtr = make_cudaPitchedPtr((void *)projectiondata, extent.width*sizeof(float), extent.width, extent.height); - copyParams.dstArray = d_cuArrTex[0]; - copyParams.extent = extent; - copyParams.kind = cudaMemcpyHostToDevice; - cudaMemcpy3DAsync(©Params,stream[0+1]); - cudaCheckErrors("Texture memory data copy fail"); - //Array creation End - - cudaResourceDesc texRes; - memset(&texRes, 0, sizeof(cudaResourceDesc)); - texRes.resType = cudaResourceTypeArray; - texRes.res.array.array = d_cuArrTex[0]; - cudaTextureDesc texDescr; - memset(&texDescr, 0, sizeof(cudaTextureDesc)); - texDescr.normalizedCoords = false; - texDescr.filterMode = cudaFilterModeLinear; - texDescr.addressMode[0] = cudaAddressModeBorder; - texDescr.addressMode[1] = cudaAddressModeBorder; - texDescr.addressMode[2] = cudaAddressModeBorder; - texDescr.readMode = cudaReadModeElementType; - cudaCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); - cudaCheckErrors("Texture object creation fail"); - -} \ No newline at end of file diff --git a/Common/CUDA/voxel_backprojection_parallel.hpp.prehip 
b/Common/CUDA/voxel_backprojection_parallel.hpp.prehip deleted file mode 100644 index 92b72023..00000000 --- a/Common/CUDA/voxel_backprojection_parallel.hpp.prehip +++ /dev/null @@ -1,57 +0,0 @@ -/*------------------------------------------------------------------------- - * - * Header CUDA function for backrpojection for parallel beam - * - * - * CODE by Ander Biguri - * Optimized and modified by RB - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ -#include "types_TIGRE.hpp" -#include "GpuIds.hpp" - - -#ifndef BACKPROJECTION_PARALLEL_HPP -#define BACKPROJECTION_PARALLEL_HPP - -int voxel_backprojection_parallel(float * projections, Geometry geo, float* result,float const * const alphas,int nalpha, const GpuIds& gpuids); -void computeDeltasCubeParallel(Geometry geo, int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ,Point3D *S); -void createGeoArrayParallel(unsigned int image_splits, Geometry geo,Geometry* geoArray, unsigned int nangles); -// void computeDeltasCube(Geometry geo, float alpha,int i, Point3D* xyzorigin, Point3D* deltaX, Point3D* deltaY, Point3D* deltaZ); -#endif \ No newline at end of file diff --git a/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip b/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip deleted file mode 100644 index e38db7d9..00000000 --- a/MATLAB/Utilities/cuda_interface/AddNoise.cpp.prehip +++ /dev/null @@ -1,126 +0,0 @@ -/*------------------------------------------------------------------------- - * - * MATLAB MEX functions for Random Number Generator. Check inputs and parses - * MATLAB data to C++ data. 
- * - * - * CODE by Tomoyuki SADAKANE - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - -#include -#include -#include -#include -#include -#include -#include -#include -/** - * MEX gateway - * AddNoise(Im, mu, sigma, "gpuids", gpuids); - * poissrnd(Im)+randn(size(Im)).*sigma + mu; - */ - -void mexFunction(int nlhs, mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ - size_t uiLen = 0; - float fGaussMu = 0; - float fGaussSigma = 0; - - GpuIds gpuids; - if (nrhs==5) { - size_t iM = mxGetM(prhs[4]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:RNG:unknown","5th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[4]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "CBCT:MEX:RNG:unknown","5th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[4]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } else { - int iGpuCount = GetGpuCount(); - int* piDev = (int*)malloc(iGpuCount * sizeof(int)); - for (int iI = 0; iI < iGpuCount; ++iI) { - piDev[iI] = iI; - } - gpuids.SetIds(iGpuCount, piDev); - free(piDev); piDev = 0; - } - if (nrhs < 3) { - mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "At least three input argumet required."); - } else if (nrhs==3 || nrhs==5){ - size_t mrows = mxGetM(prhs[1]); - size_t ncols = mxGetN(prhs[1]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "2nd parameter should be 1x1"); - } - mrows = mxGetM(prhs[2]); - ncols = mxGetN(prhs[2]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "3rd parameter should be 1x1"); - } - fGaussMu = (float)mxGetScalar(prhs[1]); - fGaussSigma = (float)mxGetScalar(prhs[2]); - } else if (nrhs>4) { - mexErrMsgIdAndTxt("CBCT:CUDA:RNG", "Too many input arguments"); - } - /////////////// First input argumet. - // First input should be an array, whose elements are lambda. 
- mxArray const * const image = prhs[0]; - float* pfLambdas = static_cast(mxGetData(image)); - mwSize const numDims = mxGetNumberOfDimensions(image); // get dim of image - const mwSize *size_img= mxGetDimensions(image); //get size of image - uiLen = size_img[0]; // calculate the total length - for (int iI = 1; iI < numDims; ++iI) { - uiLen *= size_img[iI]; - } - ////////////// - //prepare outputs - // Allocte output image - plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); - float *imgout =(float*) mxGetPr(plhs[0]); - // call CUDA rng - poisson_gaussian_1d(pfLambdas, uiLen, fGaussMu, fGaussSigma, imgout, gpuids); -} diff --git a/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip deleted file mode 100644 index da78bfce..00000000 --- a/MATLAB/Utilities/cuda_interface/Atb_mex.cpp.prehip +++ /dev/null @@ -1,367 +0,0 @@ - -/*------------------------------------------------------------------------- - * - * MATLAB MEX gateway for backprojection - * - * This file gets the data from MATLAB, checks it for errors and then - * parses it to C and calls the relevant C/CUDA functions. - * - * CODE by Ander Biguri - * - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. 
- * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - - - - -/** - * MEX gateway - * - * This function takes data from MATLAB and passes it to the MEX code. - * It checks and casts the inputs and prepares teh outputs for MATLAB. - * - * - */ - -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]){ - - //Check amount of inputs - if (nrhs != 5) { - mexErrMsgIdAndTxt("CBCT:MEX:Atb:InvalidInput", "Wrong number of inputs provided"); - } - //////////////////////////// - // 5th argument is array of GPU-IDs. 
- GpuIds gpuids; - { - size_t iM = mxGetM(prhs[4]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:Atb:unknown","5th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[4]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "CBCT:MEX:Atb:unknown","5th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[4]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } - - /* - ** 4th argument is matched or un matched. - */ - bool pseudo_matched=false; // Caled krylov, because I designed it for krylov case.... - /* copy the string data from prhs[0] into a C string input_ buf. */ - char *krylov = mxArrayToString(prhs[3]); - if (!strcmp(krylov,"matched")) // if its 0, they are the same - pseudo_matched=true; - - /* - ** Third argument: angle of projection. - */ - size_t mrows,nangles; - - mrows = mxGetM(prhs[2]); - nangles = mxGetN(prhs[2]); - - - mxArray const * const ptrangles=prhs[2]; - - - double const * const anglesM= static_cast(mxGetData(ptrangles)); - // just copy paste the data to a float array - float * angles= (float*)malloc(nangles*mrows*sizeof(float)); - for (int i=0;i1) && !(numDims==2 && nangles==1) ){ - mexErrMsgIdAndTxt("CBCT:MEX:Atb:InvalidInput", "Projection data is not the right size"); - } - if( !mxIsSingle(prhs[0])) { - mexErrMsgIdAndTxt("CBCT:MEX:Ax:InvalidInput", - "Input image must be a single noncomplex array."); - } - // Now that input is ok, parse it to C data types. - // NOTE: while Number of dimensions is the size of the matrix in Matlab, the data is 1D row-wise mayor. 
- - // We need a float image, and, unfortunately, the only way of casting it is by value -// const mwSize *size_proj= mxGetDimensions(image); //get size of image -// mrows = mxGetM(image); -// nangles = mxGetN(image); -// size_t size_proj2; -// if (nangles==1) -// size_proj2=1; -// else -// size_proj2=size_proj[2]; - - - float * projections= static_cast(mxGetData(image)); - - - - -///////////////////////////////////////////////////////////////////////////////////////////////////////////////// - /** - * Second input: Geometry structure - */ - mxArray * geometryMex=(mxArray*)prhs[1]; - - // IMPORTANT-> Make sure Matlab creates the struct in this order. - const char *fieldnames[14]; - fieldnames[0] = "nVoxel"; - fieldnames[1] = "sVoxel"; - fieldnames[2] = "dVoxel"; - fieldnames[3] = "nDetector"; - fieldnames[4] = "sDetector"; - fieldnames[5] = "dDetector"; - fieldnames[6] = "DSD"; - fieldnames[7] = "DSO"; - fieldnames[8] = "offOrigin"; - fieldnames[9] = "offDetector"; - fieldnames[10]= "accuracy"; - fieldnames[11]= "mode"; - fieldnames[12]= "COR"; - fieldnames[13]= "rotDetector"; - // Make sure input is structure - - mxArray *tmp; - - // Now we know that all the input struct is good! Parse it from mxArrays to - // C structures that MEX can understand. 
- - double * nVoxel, *nDetec; //we need to cast these to int - double * sVoxel, *dVoxel,*sDetec,*dDetec, *DSO, *DSD,*offOrig,*offDetec; - double *acc, *COR,*rotDetector; - const char* mode; - bool coneBeam=true; - Geometry geo; - int c; - geo.unitX=1;geo.unitY=1;geo.unitZ=1; - for(int ifield=0; ifield<14; ifield++) { - tmp=mxGetField(geometryMex,0,fieldnames[ifield]); - if(tmp==NULL){ - //tofix - continue; - } - switch(ifield){ - case 0: - nVoxel=(double *)mxGetData(tmp); - // copy data to MEX memory - geo.nVoxelX=(int)nVoxel[0]; - geo.nVoxelY=(int)nVoxel[1]; - geo.nVoxelZ=(int)nVoxel[2]; - break; - case 1: - sVoxel=(double *)mxGetData(tmp); - geo.sVoxelX=(float)sVoxel[0]; - geo.sVoxelY=(float)sVoxel[1]; - geo.sVoxelZ=(float)sVoxel[2]; - break; - case 2: - dVoxel=(double *)mxGetData(tmp); - geo.dVoxelX=(float)dVoxel[0]; - geo.dVoxelY=(float)dVoxel[1]; - geo.dVoxelZ=(float)dVoxel[2]; - break; - case 3: - nDetec=(double *)mxGetData(tmp); - geo.nDetecU=(int)nDetec[0]; - geo.nDetecV=(int)nDetec[1]; - break; - case 4: - sDetec=(double *)mxGetData(tmp); - geo.sDetecU=(float)sDetec[0]; - geo.sDetecV=(float)sDetec[1]; - break; - case 5: - dDetec=(double *)mxGetData(tmp); - geo.dDetecU=(float)dDetec[0]; - geo.dDetecV=(float)dDetec[1]; - break; - case 6: - geo.DSD=(float*)malloc(nangles * sizeof(float)); - DSD=(double *)mxGetData(tmp); - for (int i=0;i -#include -#include -#include -#include -#include -#include -#include -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ -///////// First check if the amount of inputs is right. 
- int maxIter; - float alpha; - GpuIds gpuids; - if (nrhs==5) { - size_t iM = mxGetM(prhs[4]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[4]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[4]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } else { - int iGpuCount = GetGpuCount(); - int* piDev = (int*)malloc(iGpuCount * sizeof(int)); - for (int iI = 0; iI < iGpuCount; ++iI) { - piDev[iI] = iI; - } - gpuids.SetIds(iGpuCount, piDev); - free(piDev); piDev = 0; - } - if (nrhs==1){ - maxIter=100; - alpha=15.0f; - } else if (nrhs==2){ - mexErrMsgIdAndTxt("err", "Only 1 POCS hyperparameter inputted"); - } else if (nrhs==4 || nrhs==5){ - size_t mrows = mxGetM(prhs[1]); - size_t ncols = mxGetN(prhs[1]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1"); - } - mrows = mxGetM(prhs[2]); - ncols = mxGetN(prhs[2]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("err", "POCS parameters should be 1x1"); - } - alpha= (float)(mxGetScalar(prhs[1])); - maxIter=(int)floor(mxGetScalar(prhs[2])+0.5); - } else { - mexErrMsgIdAndTxt("err", "Too many input arguments"); - } - float delta=(float)(mxGetScalar(prhs[3])); -////////////////////////// First input. - // First input should be x from (Ax=b), or the image. - mxArray const * const image = prhs[0]; - mwSize const numDims = mxGetNumberOfDimensions(image); - mwSize third_dim = 1; - - // Now that input is ok, parse it to C data types. 
- float * img = static_cast(mxGetData(image)); - const mwSize *size_img= mxGetDimensions(image); //get size of image - - // Image should be dim 3 - if (numDims==3){ - third_dim = size_img[2]; - } - - // Allocte output image - plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); - float *imgout =(float*) mxGetPr(plhs[0]); - // call C function with the CUDA denoising - - const long imageSize[3]={size_img[0], size_img[1], third_dim }; - - aw_pocs_tv(img,imgout, alpha, imageSize, maxIter, delta, gpuids); - - //prepareotputs -} diff --git a/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip deleted file mode 100644 index 3c6f3670..00000000 --- a/MATLAB/Utilities/cuda_interface/Ax_mex.cpp.prehip +++ /dev/null @@ -1,338 +0,0 @@ -/*------------------------------------------------------------------------- - * - * MATLAB MEX gateway for projection - * - * This file gets the data from MATLAB, checks it for errors and then - * parses it to C and calls the relevant C/CUDA functions. - * - * CODE by Ander Biguri - * - * --------------------------------------------------------------------------- - * --------------------------------------------------------------------------- - * Copyright (c) 2015, University of Bath and CERN- European Organization for - * Nuclear Research - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * --------------------------------------------------------------------------- - * - * Contact: tigre.toolbox@gmail.com - * Codes : https://github.com/CERN/TIGRE - * --------------------------------------------------------------------------- - */ - - - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * MEX gateway - */ - - - -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ -// clock_t begin, end; -// begin = clock(); - - - //Check amount of inputs - if (nrhs != 5) { - mexErrMsgIdAndTxt("CBCT:MEX:Ax:InvalidInput", "Invalid number of inputs to MEX file."); - } - //////////////////////////// - // 5th argument is array of GPU-IDs. 
- GpuIds gpuids; - { - size_t iM = mxGetM(prhs[4]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","5th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[4]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","5th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[4]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } - //////////////////////////// - // 4th argument is interpolated or ray-voxel/Siddon - bool rayvoxel=false; - if ( mxIsChar(prhs[3]) != 1) - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:InvalidInput","4rd input should be a string"); - - /* copy the string data from prhs[0] into a C string input_ buf. */ - char *krylov = mxArrayToString(prhs[3]); - if (strcmp(krylov,"interpolated") && strcmp(krylov,"Siddon") && strcmp(krylov,"ray-voxel")) - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:InvalidInput","4rd input should be either 'interpolated' or 'Siddon'"); - else - // If its not ray-voxel, its "interpolated" - if (strcmp(krylov,"Siddon") == 0 || strcmp(krylov,"ray-voxel") == 0) //strcmp returs 0 if they are equal - rayvoxel=true; - ///////////////////////// 3rd argument: angle of projection. - - size_t mrows = mxGetM(prhs[2]); - size_t nangles = mxGetN(prhs[2]); - - mxArray const * const ptrangles=prhs[2]; - - - double const * const anglesM= static_cast(mxGetData(ptrangles)); - // just copy paste the data to a float array - float * angles= (float*)malloc(nangles*mrows*sizeof(float)); - for (int i=0;i(mxGetData(image)); - // We need a float image, and, unfortunately, the only way of casting it is by value - const mwSize *size_img= mxGetDimensions(image); //get size of image - - - - ///////////////////// Second input argument, - // Geometry structure that has all the needed geometric data. - - - mxArray * geometryMex=(mxArray*)prhs[1]; - - // IMPORTANT-> Make sure Matlab creates the struct in this order. 
- const char *fieldnames[14]; - fieldnames[0] = "nVoxel"; - fieldnames[1] = "sVoxel"; - fieldnames[2] = "dVoxel"; - fieldnames[3] = "nDetector"; - fieldnames[4] = "sDetector"; - fieldnames[5] = "dDetector"; - fieldnames[6] = "DSD"; - fieldnames[7] = "DSO"; - fieldnames[8] = "offOrigin"; - fieldnames[9] = "offDetector"; - fieldnames[10]= "accuracy"; - fieldnames[11]= "mode"; - fieldnames[12]= "COR"; - fieldnames[13]= "rotDetector"; - - // Now we know that all the input struct is good! Parse it from mxArrays to - // C structures that MEX can understand. - double * nVoxel, *nDetec; //we need to cast these to int - double * sVoxel, *dVoxel,*sDetec,*dDetec, *DSO, *DSD; - double *offOrig,*offDetec,*rotDetector; - double * acc, *COR; - const char* mode; - int c; - mxArray *tmp; - Geometry geo; - geo.unitX=1;geo.unitY=1;geo.unitZ=1; - bool coneBeam=true; -// mexPrintf("%d \n",nfields); - for(int ifield=0; ifield<14; ifield++) { - tmp=mxGetField(geometryMex,0,fieldnames[ifield]); - if(tmp==NULL){ - //tofix - continue; - } - switch(ifield){ - case 0: - nVoxel=(double *)mxGetData(tmp); - // copy data to MEX memory - geo.nVoxelX=(int)nVoxel[0]; - geo.nVoxelY=(int)nVoxel[1]; - geo.nVoxelZ=(int)nVoxel[2]; - break; - case 1: - sVoxel=(double *)mxGetData(tmp); - geo.sVoxelX=(float)sVoxel[0]; - geo.sVoxelY=(float)sVoxel[1]; - geo.sVoxelZ=(float)sVoxel[2]; - break; - case 2: - dVoxel=(double *)mxGetData(tmp); - geo.dVoxelX=(float)dVoxel[0]; - geo.dVoxelY=(float)dVoxel[1]; - geo.dVoxelZ=(float)dVoxel[2]; - break; - case 3: - nDetec=(double *)mxGetData(tmp); - geo.nDetecU=(int)nDetec[0]; - geo.nDetecV=(int)nDetec[1]; - break; - case 4: - sDetec=(double *)mxGetData(tmp); - geo.sDetecU=(float)sDetec[0]; - geo.sDetecV=(float)sDetec[1]; - break; - case 5: - dDetec=(double *)mxGetData(tmp); - geo.dDetecU=(float)dDetec[0]; - geo.dDetecV=(float)dDetec[1]; - break; - case 6: - geo.DSD=(float*)malloc(nangles * sizeof(float)); - DSD=(double *)mxGetData(tmp); - for (int i=0;i -#include -#include 
-#include -#include -#include -#include -#include -// #include -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ -///////// First check if the amount of imputs is rigth. - int maxIter; - float alpha; - float ratio; - GpuIds gpuids; - if (nrhs<5) - mexErrMsgIdAndTxt("TIGRE:minPICCS", "At least 2 inputs needed: Image and prior image"); - if (nrhs>6){ - mexErrMsgIdAndTxt("TIGRE:minPICCS", "Too many imput argumets"); - } - if (nrhs==6){ - size_t mrows = mxGetM(prhs[2]); - size_t ncols = mxGetN(prhs[2]); - if (mrows!=1 || ncols !=1) - mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); - mrows = mxGetM(prhs[3]); - ncols = mxGetN(prhs[3]); - if (mrows!=1 || ncols !=1) - mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); - mrows = mxGetM(prhs[4]); - ncols = mxGetN(prhs[4]); - if (mrows!=1 || ncols !=1) - mexErrMsgIdAndTxt("TIGRE:minPICCS", "PICCS parameters shoudl be 1x1"); - alpha= (float)(mxGetScalar(prhs[2])); - maxIter=(int)floor(mxGetScalar(prhs[3])+0.5); - ratio= (float)(mxGetScalar(prhs[4])); - - size_t uiGpuCount = mxGetN(prhs[5]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "TIGRE:minPICCS","6th parameter must be a row vector"); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[5]); - gpuids.SetIds(uiGpuCount, piGpuIds); - }else{ - int iGpuCount = GetGpuCount(); - int* piDev = (int*)malloc(iGpuCount * sizeof(int)); - for (int iI = 0; iI < iGpuCount; ++iI) { - piDev[iI] = iI; - } - gpuids.SetIds(iGpuCount, piDev); - free(piDev); piDev = 0; - } - if (nrhs==2){ - maxIter=100; - alpha=15.0f; - ratio=0.5; - } - - -////////////////////////// First input. - // First input should be x from (Ax=b), or the image. 
- mxArray const * const image = prhs[0]; - mwSize const numDims = mxGetNumberOfDimensions(image); - if (numDims!=3){ - mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image is not 3D"); - } - mxArray const * const prior_mex = prhs[1]; - mwSize const numDims_prior = mxGetNumberOfDimensions(image); - if (numDims_prior!=3){ - mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image is not 3D"); - } - if(numDims_prior!=numDims) - mexErrMsgIdAndTxt("TIGRE:minPICCS", "Image and prior are not the same size"); - // Image should be dim 3 - - // Now that input is ok, parse it to C data types. - float const * const img = static_cast(mxGetData(image)); - float const * const prior = static_cast(mxGetData(prior_mex)); - const mwSize *size_img= mxGetDimensions(image); //get size of image - - - // Allocte output image - const long imageSize[3]={size_img[0] ,size_img[1],size_img[2] }; - plhs[0] = mxCreateNumericArray(3,size_img, mxSINGLE_CLASS, mxREAL); - float *imgout =(float*) mxGetPr(plhs[0]); - - - piccs_tv(img,prior,imgout, alpha,ratio, imageSize, maxIter,gpuids); - - - -} \ No newline at end of file diff --git a/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip b/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip deleted file mode 100644 index da60446c..00000000 --- a/MATLAB/Utilities/cuda_interface/minTV.cpp.prehip +++ /dev/null @@ -1,132 +0,0 @@ -/* -/*------------------------------------------------------------------------- - * - * MATLAB MEX gateway for Total variation minimization via Steepest descend - * - * This file gets the data from MATLAB, checks it for errors and then - * parses it to C and calls the relevant C/CUDA functions. - * - * CODE by Ander Biguri - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - - -#include -#include -#include -#include -#include -#include -#include -#include -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ -///////// First check if the amount of inputs is right. 
- int maxIter; - float alpha; - GpuIds gpuids; - if (nrhs==4) { - size_t iM = mxGetM(prhs[3]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[3]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "TIGRE:minTV","4th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[3]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } else { - int iGpuCount = GetGpuCount(); - int* piDev = (int*)malloc(iGpuCount * sizeof(int)); - for (int iI = 0; iI < iGpuCount; ++iI) { - piDev[iI] = iI; - } - gpuids.SetIds(iGpuCount, piDev); - free(piDev); piDev = 0; - } - if (nrhs==1){ - maxIter=100; - alpha=15.0f; - } else if (nrhs==2){ - mexErrMsgIdAndTxt("minTV:mex", "Only 1 POCS hyperparameter inputted"); - } else if (nrhs==3 || nrhs==4){ - size_t mrows = mxGetM(prhs[1]); - size_t ncols = mxGetN(prhs[1]); - if (mrows!=1 || ncols !=1) - mexErrMsgIdAndTxt("minTV:mex", "POCS parameters should be 1x1"); - mrows = mxGetM(prhs[2]); - ncols = mxGetN(prhs[2]); - if (mrows!=1 || ncols !=1) - mexErrMsgIdAndTxt("minTV:mex", "POCS parameters should be 1x1"); - alpha= (float)(mxGetScalar(prhs[1])); - maxIter=(int)floor(mxGetScalar(prhs[2])+0.5); - } else { - mexErrMsgIdAndTxt("minTV:mex", "Too many input arguments"); - } - -////////////////////////// First input. - // First input should be x from (Ax=b), or the image. - mxArray const * const image = prhs[0]; - mwSize const numDims = mxGetNumberOfDimensions(image); - mwSize third_dim = 1; - - - // Now that input is ok, parse it to C data types. 
- float * img = static_cast(mxGetData(image)); - const mwSize *size_img = mxGetDimensions(image); //get size of image - - // Image should be dim 3 - if (numDims==3){ - third_dim = size_img[2]; - } - - // Allocte output image - const long imageSize[3]={size_img[0] ,size_img[1], third_dim }; - plhs[0] = mxCreateNumericArray(numDims, size_img, mxSINGLE_CLASS, mxREAL); - float *imgout =(float*) mxGetPr(plhs[0]); - - pocs_tv(img,imgout, alpha, imageSize, maxIter, gpuids); -} diff --git a/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip b/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip deleted file mode 100644 index 1142a5f7..00000000 --- a/MATLAB/Utilities/cuda_interface/pCTCubicSpline_mex.cpp.prehip +++ /dev/null @@ -1,124 +0,0 @@ -/*-------------------------------------------------------------------------- --------------------------------------------------------------------------- - This file is part of the TIGRE Toolbox - - Copyright (c) 2015, University of Bath and - CERN-European Organization for Nuclear Research - All rights reserved. - - License: Open Source under BSD. - See the full license at - https://github.com/CERN/TIGRE/blob/master/LICENSE - - Contact: tigre.toolbox@gmail.com - Codes: https://github.com/CERN/TIGRE/ - Coded by: Stefanie Kaser, Benjamin Kirchmayer ---------------------------------------------------------------------------*/ - -#include "mex.h" -#include "CUDA/improvedForwardProjections.hpp" -#include -#include -#include - - -void mexFunction(int nlhs, mxArray *plhs[], int nrhs,const mxArray *prhs[]){ - - if (nrhs =! 
7){ - mexErrMsgIdAndTxt("CS Projections:", "Check Number of Input arguments!"); - } - - float *posIn, *posOut, *dirIn, *dirOut; - float *Wepl, *pixelSize, *detectorDistanceIn, *detectorDistanceOut, *initEnergy; - - //Load parameters - posIn = (float *)(mxGetPr(prhs[0])); - posOut = (float *)mxGetPr(prhs[1]); - dirIn = (float *)mxGetPr(prhs[2]); - dirOut = (float *)mxGetPr(prhs[3]); - Wepl = (float*) mxGetPr(prhs[4]); - initEnergy = (float*) mxGetPr(prhs[5]); - - //Get Number of Protons contained in the root files - int numOfProtons = (int) mxGetM(prhs[4]); - - mxArray * geometryMex=(mxArray*)prhs[6]; - - const char *fieldnames_geo[7]; - fieldnames_geo[0] = "dDetector"; - fieldnames_geo[1] = "DSD"; - fieldnames_geo[2] = "DSID"; - fieldnames_geo[3] = "DSO"; - fieldnames_geo[4] = "hull"; - fieldnames_geo[5] = "sDetector"; - fieldnames_geo[6] = "mode"; - - double * pix0, *dsd0, *dsid0, *hull0, *det0, *dso0; - float pix[2], dsd, dsid, dso, hull[4], det[2]; - const char* mode; - bool coneBeam = true; - mxArray *tmp; - for (int ifield=0; ifield<7; ifield++){ - tmp=mxGetField(geometryMex,0,fieldnames_geo[ifield]); - switch(ifield){ - case 0: - pix0 =(double *)mxGetData(tmp); - pix[0] = (float)pix0[0]; - pix[1] = (float)pix0[1]; - break; - case 1: - dsd0 =(double *)mxGetData(tmp); - dsd = (float)dsd0[0]; - break; - case 2: - dsid0 =(double *)mxGetData(tmp); - dsid = (float)dsid0[0]; - break; - case 3: - dso0 =(double *)mxGetData(tmp); - dso = (float)dso0[0]; - break; - case 4: - hull0 =(double *)mxGetData(tmp); - hull[0] = (float)hull0[0]; - hull[1] = (float)hull0[1]; - hull[2] = (float)hull0[2]; - hull[3] = (float)hull0[3]; - break; - case 5: - det0 =(double *)mxGetData(tmp); - det[0] = (float)det0[0]; - det[1] = (float)det0[1]; - break; - case 6: - mode=""; - mode=mxArrayToString(tmp); - if (!strcmp(mode,"parallel")) - coneBeam=false; - break; - } - } - - - if (hull[3] == 0){std::cout << "Info: Calculation of optimized proton radiographies will be performed without object 
hull!" << std::endl;} - - if (hull[2] > 6.28318530717958648){std::cout << "Info: Hull rotation angle exceeds 2 Pi. Please check the input! Continuing with calculation..." << std::endl;} - - mwSize outSize[2]; - outSize[0] = int(det[1]/pix[1]); - outSize[1] = int(det[0]/pix[0]); - plhs[0] = mxCreateNumericArray(2, outSize, mxSINGLE_CLASS, mxREAL); - float *outProjections = (float*)mxGetPr(plhs[0]); - - //For Calculation 2 historgrams are needed - // - if(coneBeam == false){ - std::cout << "Info: Parallel geometry selected..." << std::endl; - ParticleProjections(outProjections, posIn, posOut, dirIn, dirOut, Wepl, numOfProtons, int(det[0]/pix[0]), int(det[1]/pix[1]), pix, dsid-dso, dsd-dso, *initEnergy, hull); - } - else{ - std::cout << "Info: Cone beam geometry selected..." << std::endl; - ParticleProjectionsCone(outProjections, posIn, posOut, dirIn, dirOut, Wepl, numOfProtons, int(det[0]/pix[0]), int(det[1]/pix[1]), pix, dsid-dso, dsd-dso, -1*dso, *initEnergy, hull); - } - -} diff --git a/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip b/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip deleted file mode 100644 index f905bcbd..00000000 --- a/MATLAB/Utilities/cuda_interface/tvDenoise.cpp.prehip +++ /dev/null @@ -1,147 +0,0 @@ -/*------------------------------------------------------------------------- - * - * MATLAB MEX functions for TV image denoising. Check inputs and parses - * MATLAB data to C++ data. - * - * - * CODE by Ander Biguri - * ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- -Copyright (c) 2015, University of Bath and CERN- European Organization for -Nuclear Research -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. 
Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors -may be used to endorse or promote products derived from this software without -specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. 
- --------------------------------------------------------------------------- - -Contact: tigre.toolbox@gmail.com -Codes : https://github.com/CERN/TIGRE ---------------------------------------------------------------------------- - */ - - - - - - -#include -#include -#include -#include -#include -#include -#include -#include -/** - * MEX gateway - */ -void mexFunction(int nlhs , mxArray *plhs[], - int nrhs, mxArray const *prhs[]) -{ - int maxIter; - float lambda; - GpuIds gpuids; - if (nrhs==4) { - size_t iM = mxGetM(prhs[3]); - if (iM != 1) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); - return; - } - size_t uiGpuCount = mxGetN(prhs[3]); - if (uiGpuCount == 0) { - mexErrMsgIdAndTxt( "CBCT:MEX:Ax:unknown","4th parameter must be a row vector."); - return; - } - int* piGpuIds = (int*)mxGetData(prhs[3]); - gpuids.SetIds(uiGpuCount, piGpuIds); - } else { - int iGpuCount = GetGpuCount(); - int* piDev = (int*)malloc(iGpuCount * sizeof(int)); - for (int iI = 0; iI < iGpuCount; ++iI) { - piDev[iI] = iI; - } - gpuids.SetIds(iGpuCount, piDev); - free(piDev); piDev = 0; - } - if (nrhs == 0) { - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "At least one input argumet required."); - } else if (nrhs==1){ - maxIter=100; - lambda=15.0f; - } else if (nrhs==2){ - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Only 1 TV hyperparameter inputted"); - } else if (nrhs==3 || nrhs==4){ - size_t mrows = mxGetM(prhs[1]); - size_t ncols = mxGetN(prhs[1]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "TV parameters should be 1x1"); - } - mrows = mxGetM(prhs[2]); - ncols = mxGetN(prhs[2]); - if (mrows!=1 || ncols !=1) { - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "TV parameters should be 1x1"); - } - lambda= (float)(mxGetScalar(prhs[1])); - maxIter=(int)round(mxGetScalar(prhs[2])); - } else if (nrhs>4) { - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Too many input arguments"); - } - ////////////////////////// First input. 
- // First input should be x from (Ax=b), or the image. - mxArray const * const image = prhs[0]; - mwSize const numDims = mxGetNumberOfDimensions(image); - - // Image should be dim 3 - if (numDims!=3){ - mexErrMsgIdAndTxt("CBCT:CUDA:TVdenoising", "Image is not 3D"); - } - // Now that input is ok, parse it to C data types. - float * img = static_cast(mxGetData(image)); - // We need a float image, and, unfortunately, the only way of casting it is by value - const mwSize *size_img= mxGetDimensions(image); //get size of image - - ////////////// - //prepareotputs - plhs[0] = mxCreateNumericArray(3,size_img, mxSINGLE_CLASS, mxREAL); - float *imgout =(float*) mxGetPr(plhs[0]); - // Allocte output image - // call C function with the CUDA denoising - const float spacing[3]={1,1,1}; - const long imageSize[3]={size_img[0] ,size_img[1],size_img[2] }; - - tvdenoising(img,imgout, lambda, spacing, imageSize, maxIter, gpuids); - - - -// memcpy(mxImgout,imgout,size_img[0] *size_img[1] *size_img[2]*sizeof(float)); - //free memory -// free(img); -// free(imgout); - - -} From f610f8c9c6627df9e42585cf04dc95d87c01e781 Mon Sep 17 00:00:00 2001 From: purepani Date: Wed, 19 Mar 2025 19:38:13 -0500 Subject: [PATCH 3/3] Successful compilation of HIP --- Common/CUDA/GD_AwTV.cu | 4 +- Common/CUDA/GD_TV.cu | 4 +- Common/CUDA/RandomNumberGenerator.cu | 3 +- Common/CUDA/Siddon_projection.cu | 10 +- Common/CUDA/Siddon_projection_parallel.cu | 2 +- Common/CUDA/ray_interpolated_projection.cu | 4 +- .../ray_interpolated_projection_parallel.cu | 4 +- Common/CUDA/voxel_backprojection.cu | 4 +- Common/CUDA/voxel_backprojection2.cu | 4 +- Common/CUDA/voxel_backprojection_parallel.cu | 4 +- Python/setup.py | 102 +++++++++--------- 11 files changed, 75 insertions(+), 70 deletions(-) diff --git a/Common/CUDA/GD_AwTV.cu b/Common/CUDA/GD_AwTV.cu index 03956111..e899b196 100644 --- a/Common/CUDA/GD_AwTV.cu +++ b/Common/CUDA/GD_AwTV.cu @@ -542,7 +542,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const 
long* image_size, int ma size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; hipStreamSynchronize(stream[dev*nStream_device+1]); - reduceNorm2 << > >(d_norm2[dev], d_norm2aux[dev], total_pixels); + reduceNorm2 <<>>(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ @@ -553,7 +553,7 @@ void aw_pocs_tv(float* img,float* dst,float alpha,const long* image_size, int ma size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; if (dimgridRed > 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); + reduceSum <<<1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device]>>>(d_norm2aux[dev], d_norm2[dev], dimgridRed); hipStreamSynchronize(stream[dev*nStream_device]); hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } diff --git a/Common/CUDA/GD_TV.cu b/Common/CUDA/GD_TV.cu index 4086e951..9777bbc2 100644 --- a/Common/CUDA/GD_TV.cu +++ b/Common/CUDA/GD_TV.cu @@ -526,7 +526,7 @@ do { \ size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; hipStreamSynchronize(stream[dev*nStream_device+1]); - reduceNorm2 << > >(d_norm2[dev], d_norm2aux[dev], total_pixels); + reduceNorm2 <<>>(d_norm2[dev], d_norm2aux[dev], total_pixels); } for (dev = 0; dev < deviceCount; dev++){ @@ -537,7 +537,7 @@ do { \ size_t dimgridRed = (total_pixels + MAXTHREADS - 1) / MAXTHREADS; if (dimgridRed > 1) { - reduceSum << <1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device] >> >(d_norm2aux[dev], d_norm2[dev], dimgridRed); + reduceSum <<<1, dimblockRed, MAXTHREADS*sizeof(float),stream[dev*nStream_device]>>>(d_norm2aux[dev], d_norm2[dev], dimgridRed); hipStreamSynchronize(stream[dev*nStream_device]); hipMemcpyAsync(&sumnorm2[dev], d_norm2[dev], sizeof(float), hipMemcpyDeviceToHost,stream[dev*nStream_device+1]); } diff --git a/Common/CUDA/RandomNumberGenerator.cu 
b/Common/CUDA/RandomNumberGenerator.cu index 5910b407..e4e7c283 100644 --- a/Common/CUDA/RandomNumberGenerator.cu +++ b/Common/CUDA/RandomNumberGenerator.cu @@ -48,7 +48,8 @@ #include #include #include -#include +#include +#include #include "gpuUtils.hpp" #include "RandomNumberGenerator.hpp" diff --git a/Common/CUDA/Siddon_projection.cu b/Common/CUDA/Siddon_projection.cu index 8e551626..94b9eb1d 100644 --- a/Common/CUDA/Siddon_projection.cu +++ b/Common/CUDA/Siddon_projection.cu @@ -230,16 +230,16 @@ __global__ void kernelPixelDetector( Geometry geo, float ac=am; //eq (28), unit anlges float axu,ayu,azu; - axu=__frcp_rd(fabsf(ray.x)); - ayu=__frcp_rd(fabsf(ray.y)); - azu=__frcp_rd(fabsf(ray.z)); + axu=__frcp_rn(fabsf(ray.x)); + ayu=__frcp_rn(fabsf(ray.y)); + azu=__frcp_rn(fabsf(ray.z)); // eq(29), direction of update float iu,ju,ku; iu=(source.x< pixel1D.x)? 1.0f : -1.0f; ju=(source.y< pixel1D.y)? 1.0f : -1.0f; ku=(source.z< pixel1D.z)? 1.0f : -1.0f; - float maxlength=__fsqrt_rd(ray.x*ray.x*geo.dVoxelX*geo.dVoxelX+ray.y*ray.y*geo.dVoxelY*geo.dVoxelY+ray.z*ray.z*geo.dVoxelZ*geo.dVoxelZ); + float maxlength=__fsqrt_rn(ray.x*ray.x*geo.dVoxelX*geo.dVoxelX+ray.y*ray.y*geo.dVoxelY*geo.dVoxelY+ray.z*ray.z*geo.dVoxelZ*geo.dVoxelZ); float sum=0.0f; unsigned long Np=(imax-imin+1)+(jmax-jmin+1)+(kmax-kmin+1); // Number of intersections // Go iterating over the line, intersection by intersection. 
If double point, no worries, 0 will be computed @@ -601,7 +601,7 @@ void CreateTexture(const GpuIds& gpuids,const float* imagedata,Geometry geo,hipA //hipArray Descriptor hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } for (unsigned int dev = 0; dev < num_devices; dev++){ diff --git a/Common/CUDA/Siddon_projection_parallel.cu b/Common/CUDA/Siddon_projection_parallel.cu index 65e04a92..a6c50130 100644 --- a/Common/CUDA/Siddon_projection_parallel.cu +++ b/Common/CUDA/Siddon_projection_parallel.cu @@ -491,7 +491,7 @@ void CreateTextureParallel(float* image,Geometry geo,hipArray** d_cuArrTex, hipT //hipArray Descriptor hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); hipMemcpy3DParms copyParams = {0}; diff --git a/Common/CUDA/ray_interpolated_projection.cu b/Common/CUDA/ray_interpolated_projection.cu index 8ab4a7e7..cfafb99b 100644 --- a/Common/CUDA/ray_interpolated_projection.cu +++ b/Common/CUDA/ray_interpolated_projection.cu @@ -162,7 +162,7 @@ template P.z=(uvOrigin.z+pixelU*deltaU.z+pixelV*deltaV.z); // Length is the ray length in normalized space - float length=__fsqrt_rd((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); + float length=__fsqrt_rn((source.x-P.x)*(source.x-P.x)+(source.y-P.y)*(source.y-P.y)+(source.z-P.z)*(source.z-P.z)); //now legth is an integer of Nsamples that are required on this line length=ceilf(__fdividef(length,geo.accuracy));//Divide the directional vector by an integer vectX=__fdividef(P.x -source.x,length); @@ -561,7 +561,7 @@ void CreateTextureInterp(const GpuIds& gpuids,const float* imagedata,Geometry ge hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - 
hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); cudaCheckErrors("Texture memory allocation fail"); } diff --git a/Common/CUDA/ray_interpolated_projection_parallel.cu b/Common/CUDA/ray_interpolated_projection_parallel.cu index 4793821f..45cd3984 100644 --- a/Common/CUDA/ray_interpolated_projection_parallel.cu +++ b/Common/CUDA/ray_interpolated_projection_parallel.cu @@ -419,7 +419,7 @@ void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex //hipArray Descriptor hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); hipMemcpy3DParms copyParams = {0}; @@ -447,4 +447,4 @@ void CreateTextureParallelInterp(float* image,Geometry geo,hipArray** d_cuArrTex texDescr.readMode = hipReadModeElementType; hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); -} \ No newline at end of file +} diff --git a/Common/CUDA/voxel_backprojection.cu b/Common/CUDA/voxel_backprojection.cu index 8fb9df3c..b525d23c 100644 --- a/Common/CUDA/voxel_backprojection.cu +++ b/Common/CUDA/voxel_backprojection.cu @@ -247,7 +247,7 @@ __global__ void kernelPixelBackprojectionFDK(const Geometry geo, float* image,co weight=__fdividef(DSO+realy*sinalpha-realx*cosalpha,DSO); - weight=__frcp_rd(weight*weight); + weight=__frcp_rn(weight*weight); // Get Value in the computed (U,V) and multiply by the corresponding weight. // indAlpha is the ABSOLUTE number of projection in the projection array (NOT the current number of projection set!) 
@@ -680,7 +680,7 @@ void CreateTexture(const GpuIds& gpuids, float* projectiondata,Geometry geo,hipA //hipArray Descriptor hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } diff --git a/Common/CUDA/voxel_backprojection2.cu b/Common/CUDA/voxel_backprojection2.cu index 43091e78..814422a8 100644 --- a/Common/CUDA/voxel_backprojection2.cu +++ b/Common/CUDA/voxel_backprojection2.cu @@ -272,7 +272,7 @@ __global__ void kernelPixelBackprojection(const Geometry geo, float* image,const realD.y=-realDaux.x*sinalpha + realDaux.y*cosalpha; //sin(-x)=-sin(x) , cos(-x)=cos(x) float L,lsq; - L = __fsqrt_rd( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. + L = __fsqrt_rn( (realS.x-realD.x)*(realS.x-realD.x)+ (realS.y-realD.y)*(realS.y-realD.y)+ (realD.z)*(realD.z)); // Sz=0 always. lsq = (realS.x-realvoxel.x)*(realS.x-realvoxel.x) + (realS.y-realvoxel.y)*(realS.y-realvoxel.y) + (realS.z-realvoxel.z)*(realS.z-realvoxel.z); @@ -665,7 +665,7 @@ void CreateTexture2(const GpuIds& gpuids, float* projectiondata,Geometry geo,hip //hipArray Descriptor hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array - hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[dev], &channelDesc, extent, 0); } } diff --git a/Common/CUDA/voxel_backprojection_parallel.cu b/Common/CUDA/voxel_backprojection_parallel.cu index 58ab9f38..055d27d2 100644 --- a/Common/CUDA/voxel_backprojection_parallel.cu +++ b/Common/CUDA/voxel_backprojection_parallel.cu @@ -595,7 +595,7 @@ void CreateTextureParallel(float* projectiondata,Geometry geo,hipArray** d_cuArr hipChannelFormatDesc channelDesc = hipCreateChannelDesc(); //cuda Array if (alloc){ - hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent); + hipMalloc3DArray(&d_cuArrTex[0], &channelDesc, extent, 0); 
cudaCheckErrors("Texture memory allocation fail"); } hipMemcpy3DParms copyParams = {0}; @@ -625,4 +625,4 @@ void CreateTextureParallel(float* projectiondata,Geometry geo,hipArray** d_cuArr hipCreateTextureObject(&texImage[0], &texRes, &texDescr, NULL); cudaCheckErrors("Texture object creation fail"); -} \ No newline at end of file +} diff --git a/Python/setup.py b/Python/setup.py index 40bc3b3f..eee483e0 100644 --- a/Python/setup.py +++ b/Python/setup.py @@ -20,7 +20,7 @@ if "--no_pinned_memory" in sys.argv[2:] : no_pinned=True sys.argv.pop(sys.argv.index("--no_pinned_memory")) - + if no_pinned: define_macros.append(("NO_PINNED_MEMORY",None)) @@ -48,10 +48,10 @@ ] COMPUTE_CAPABILITY_ARGS = [ - "-gencode=arch=compute_70,code=compute_70", # allows forward compiling - "--ptxas-options=-v", + #"-gencode=arch=compute_70,code=compute_70", # allows forward compiling + #"--ptxas-options=-v", "-c", - "--default-stream=per-thread", + #"--default-stream=per-thread", ] @@ -65,13 +65,14 @@ def get_cuda_version(cuda_home): return version_str.split(" ")[2][:4] else: version_str = subprocess.check_output( - [os.path.join(cuda_home, "bin", "nvcc"), "--version"] + [os.path.join(cuda_home, "bin", "hipcc"), "--version"] ) version_str = str(version_str).replace("\n", "").replace("\r", "") idx = version_str.find("release") return version_str[idx + len("release ") : idx + len("release ") + 4] except: - raise RuntimeError("Cannot read cuda version file") + pass + #raise RuntimeError("Cannot read cuda version file") def locate_cuda(): @@ -81,16 +82,16 @@ def locate_cuda(): and values giving the absolute path to each directory. Starts by looking for the CUDA_HOME or CUDA_PATH env variable. If not found, everything - is based on finding 'nvcc' in the PATH. + is based on finding 'hipcc' in the PATH. 
""" # Guess #1 - cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") + cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH") or os.environ.get("HIP_PATH") if cuda_home is None: # Guess #2 try: which = "where" if IS_WINDOWS else "which" - nvcc = subprocess.check_output([which, "nvcc"]).decode().rstrip("\r\n") - cuda_home = os.path.dirname(os.path.dirname(nvcc)) + hipcc = subprocess.check_output([which, "hipcc"]).decode().rstrip("\r\n") + cuda_home = os.path.dirname(os.path.dirname(hipcc)) except subprocess.CalledProcessError: # Guess #3 if IS_WINDOWS: @@ -124,24 +125,24 @@ def _is_cuda_file(path): CUDA, CUDA_VERSION = locate_cuda() -cuda_version = 11.0 -try: - cuda_version = float(CUDA_VERSION) -except ValueError: - cuda_list = re.findall('\d+', CUDA_VERSION) - cuda_version = float( str(cuda_list[0] + '.' + cuda_list[1])) - -# Insert CUDA arguments depedning on the version -for item in CC_COMPATIBILITY_TABLE: - support_begin = item[2] - support_end = item[3] - if cuda_version < support_begin: - continue - if cuda_version >= support_end: - continue - str_arg = f"-gencode=arch=compute_{item[0]},code=sm_{item[1]}" - COMPUTE_CAPABILITY_ARGS.insert(0, str_arg) - +#cuda_version = 11.0 +#try: + #cuda_version = float(CUDA_VERSION) +#except ValueError: + #cuda_list = re.findall("\d+", CUDA_VERSION) + #cuda_version = float(str(cuda_list[0] + "." + cuda_list[1])) +# +## Insert CUDA arguments depedning on the version +#for item in CC_COMPATIBILITY_TABLE: + #support_begin = item[2] + #support_end = item[3] + #if cuda_version < support_begin: + ##continue + #if cuda_version >= support_end: + #continue + #str_arg = f"-gencode=arch=compute_{item[0]},code=sm_{item[1]}" + #COMPUTE_CAPABILITY_ARGS.insert(0, str_arg) +# # Obtain the numpy include directory. This logic works across numpy versions. 
try: NUMPY_INCLUDE = numpy.get_include() @@ -153,10 +154,10 @@ def _is_cuda_file(path): COMMON_NVCC_FLAGS = [ - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - "--expt-relaxed-constexpr", + #"-D__CUDA_NO_HALF_OPERATORS__", + #"-D__CUDA_NO_HALF_CONVERSIONS__", + #"-D__CUDA_NO_HALF2_OPERATORS__", + #"--expt-relaxed-constexpr", ] @@ -211,18 +212,20 @@ def build_extensions(self): def unix_wrap_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): # Copy before we make any modifications. cflags = copy.deepcopy(extra_postargs) + cflags.append("-D__HIP_PLATFORM_AMD__") try: original_compiler = self.compiler.compiler_so if _is_cuda_file(src): - nvcc = _join_cuda_home("bin", "nvcc") - if not isinstance(nvcc, list): - nvcc = [nvcc] - self.compiler.set_executable("compiler_so", nvcc) + hipcc = _join_cuda_home("bin", "hipcc") + if not isinstance(hipcc, list): + hipcc = [hipcc] + self.compiler.set_executable("compiler_so", hipcc) + self.compiler.set_executable("compiler", hipcc) if isinstance(cflags, dict): - cflags = cflags["nvcc"] + cflags = cflags["hipcc"] cflags = ( COMMON_NVCC_FLAGS - + ["--compiler-options", "'-fPIC'"] + + ["-fPIC"] + cflags + COMPUTE_CAPABILITY_ARGS ) @@ -237,6 +240,7 @@ def unix_wrap_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): finally: # Put the original compiler back in place. 
self.compiler.set_executable("compiler_so", original_compiler) + self.compiler.set_executable("compiler", original_compiler) def win_wrap_compile( sources, @@ -269,9 +273,9 @@ def spawn(cmd, cflags): src = src_list[0] obj = obj_list[0] if _is_cuda_file(src): - nvcc = _join_cuda_home("bin", "nvcc") + hipcc = _join_cuda_home("bin", "hipcc") if isinstance(cflags, dict): - cflags = cflags["nvcc"] + cflags = cflags["hipcc"] elif not isinstance(cflags, list): cflags = [] @@ -287,7 +291,7 @@ def spawn(cmd, cflags): elif len(macro) == 1: cflags += ["--undefine-macro", macro[0]] - cmd = [nvcc, "-c", src, "-o", obj] + include_list + cflags + cmd = [hipcc, "-c", src, "-o", obj] + include_list + cflags elif isinstance(cflags, dict): cflags = COMMON_MSVC_FLAGS # + self.cflags['cxx'] cmd += cflags @@ -372,7 +376,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -395,7 +399,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -416,7 +420,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -437,7 +441,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", 
runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -458,7 +462,7 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -475,7 +479,7 @@ def include_headers(filename_list, sdist=False): sdist=sys.argv[1] == "sdist", ), library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], @@ -496,10 +500,10 @@ def include_headers(filename_list, sdist=False): ), define_macros=define_macros, library_dirs=[CUDA["lib64"]], - libraries=["cudart"], + libraries=["amdhip64", "hiprand"], language="c++", runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None, - include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"], + include_dirs=[NUMPY_INCLUDE, CUDA["include"],"../Common/CUDA/"], )