#include <stdio.h>
#include "defs.h"

// 4x4 MVP matrix in constant memory; written from the host by
// mapMVPMatrixToDevice() and read by projectParticle().
__constant__ float mvpMatrixDevice[4][4];
// Transfer function in constant memory; written from the host by
// mapTFToDevice() and read by opacityFromTF() / colorFromTF().
__constant__ transFunc<6,4> tfDevice;


#if SI3D == 1
// Copies a host-side 4x4 float matrix (16 floats) into the mvpMatrixDevice
// constant-memory symbol.
//
// matrix: host pointer to the 4x4 matrix to upload.
//
// The function returns void, so a failed copy cannot be propagated to the
// caller; it is reported on stderr instead of being silently dropped.
extern "C" void mapMVPMatrixToDevice(float matrix[4][4]){
  const cudaError_t status = cudaMemcpyToSymbol(mvpMatrixDevice, matrix, sizeof(float)*16);
  if(status != cudaSuccess){
    fprintf(stderr, "mapMVPMatrixToDevice: cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(status));
  }
}

// Copies a host-side transfer function into the tfDevice constant-memory
// symbol.
//
// transferFunction: transfer function to upload (passed by value).
//
// The function returns void, so a failed copy cannot be propagated to the
// caller; it is reported on stderr instead of being silently dropped.
extern "C" void mapTFToDevice(transFunc<6,4> transferFunction){
  const cudaError_t status = cudaMemcpyToSymbol(tfDevice,&transferFunction,sizeof(transFunc<6,4>));
  if(status != cudaSuccess){
    fprintf(stderr, "mapTFToDevice: cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(status));
  }
}

// Returns the opacity for `scalar` by linear interpolation between the two
// neighbouring opacity control points of tfDevice.
//
// scalar: value to look up.
// Returns the interpolated opacity of the first interval that contains
// `scalar`, or 0.0f when no interval contains it.
__device__ float opacityFromTF(float scalar){
  for(int i = 0 ; i < tfDevice.numOpPoints - 1 ; i++){
    const float lowerScalar = tfDevice.opacityPoints[i].scalar;
    const float upperScalar = tfDevice.opacityPoints[i+1].scalar;
    // Bounds are inclusive: with strict comparisons a scalar lying exactly on
    // a control point matched no interval and was reported as fully transparent.
    if(scalar >= lowerScalar && scalar <= upperScalar){
      const float lowerOpacity = tfDevice.opacityPoints[i].opacity;
      const float upperOpacity = tfDevice.opacityPoints[i+1].opacity;
      const float span = upperScalar - lowerScalar;
      // Two control points on the same scalar: nothing to interpolate, and
      // dividing would yield 0/0.
      if(span <= 0.0f)
        return lowerOpacity;
      const float delta = (scalar - lowerScalar) / span;
      return lowerOpacity + (upperOpacity - lowerOpacity) * delta;
    }
  }
  return 0.0f;
}

// Builds the packed RGBA value for `scalar` from the transfer function in
// tfDevice.
//
// Byte 3 of the result is opacityFromTF(scalar) * 255; bytes 0..2 are the
// per-byte linear interpolation of the two neighbouring colour control points.
// Bytes 0..2 stay 0 when no colour interval contains `scalar`.
__device__ RGBA colorFromTF(float scalar){
  RGBA ret = 0x00000000;
  unsigned char color[4] = {0,0,0,0};
  //first, opacity:
  color[3] = static_cast<unsigned char>(opacityFromTF(scalar) * 255.0f);
  //next, color:
  for(int i = 0 ; i < tfDevice.numColPoints - 1 ; i++){
    const float lowerScalar = tfDevice.colorPoints[i].scalar;
    const float upperScalar = tfDevice.colorPoints[i+1].scalar;
    // Inclusive bounds so a scalar lying exactly on a control point gets that
    // point's colour instead of black.
    if(scalar >= lowerScalar && scalar <= upperScalar){
      const float span = upperScalar - lowerScalar;
      // Guard against two control points on the same scalar (0/0).
      const float delta = (span > 0.0f) ? (scalar - lowerScalar) / span : 0.0f;

      // Unpack both control-point colours into their four bytes.
      unsigned char tfColCharLow[4];
      memcpy(tfColCharLow,&tfDevice.colorPoints[i].color,4);
      unsigned char tfColCharHigh[4];
      memcpy(tfColCharHigh,&tfDevice.colorPoints[i+1].color,4);

      for(int c = 0 ; c < 3 ; c++){
        // The channel difference spans -255..255, so the scaled offset must be
        // held in an int: narrowing it to char is out of range (undefined) for
        // any offset beyond -128..127.
        const int low = tfColCharLow[c];
        const int offset = static_cast<int>(static_cast<float>(tfColCharHigh[c] - low) * delta);
        color[c] = static_cast<unsigned char>(low + offset);
      }
      // First matching interval wins, as in opacityFromTF.
      break;
    }
  }

  // Pack the four bytes back into the RGBA value.
  memcpy(&ret,color,4);

  return ret;
}


  // Projects one particle through mvpMatrixDevice and inserts it into the
  // per-sub-pixel depth-sorted list (ZDEPTH slots per sub-pixel).
  //
  // pos0, pos1, pos2: particle position (w is taken as 1).
  // scalar:           particle scalar, mapped to a colour via colorFromTF().
  // subPixelsD:       colour buffer, ZDEPTH entries per sub-pixel.
  // subPixelsDepthD:  depth buffer with the same layout as subPixelsD.
  // windowWidth/windowHeight: window size in pixels; the buffers are indexed
  //                   at SUBPIXELLEVEL times that resolution per axis.
  //
  // Particles with |w| < 0.0001 or outside the sub-pixel grid are dropped.
  // NOTE(review): no atomics or locks are used, so threads that hit the same
  // sub-pixel concurrently can interleave their reads and writes.
  __device__ void projectParticle(float pos0, float pos1, float pos2, float scalar, RGBA* subPixelsD, depthValue* subPixelsDepthD,unsigned int windowWidth, unsigned int windowHeight){
    float clipped[4];

    // vertex * MVPmatrix:
    for(int row = 0 ; row < 4 ; row++){
      clipped[row] = pos0 * mvpMatrixDevice[row][0] + pos1 * mvpMatrixDevice[row][1] + pos2 * mvpMatrixDevice[row][2] + 1.0f * mvpMatrixDevice[row][3];
    }

    // normalized device coordinates (skip when w is too close to zero):
    if(clipped[3] < 0.0001f && clipped[3] > -0.0001f)
      return;
    const float reciW = 1.0f/clipped[3];
    const float ndcX = clipped[0] * reciW;
    const float ndcY = clipped[1] * reciW;
    const float ndcZ = clipped[2] * reciW;

    // float literals keep the arithmetic in single precision
    const float zPos = (1.0f + ndcZ) * 0.5f;

    const unsigned int subWidth = windowWidth * SUBPIXELLEVEL;
    const unsigned int subHeight = windowHeight * SUBPIXELLEVEL;
    const float xSub = (ndcX * 0.5f + 0.5f) * static_cast<float>(subWidth);
    const float ySub = (ndcY * 0.5f + 0.5f) * static_cast<float>(subHeight);

    // Bounds are tested on the float values: converting first truncates
    // toward zero, which would map coordinates in (-1,0) onto column/row 0.
    // The negated form also rejects NaN.
    if(!(xSub >= 0.0f && xSub < static_cast<float>(subWidth)))
      return;
    if(!(ySub >= 0.0f && ySub < static_cast<float>(subHeight)))
      return;
    const unsigned int xPixel = static_cast<unsigned int>(xSub);
    const unsigned int yPixel = static_cast<unsigned int>(ySub);

    //no depth test, 3d superimposing: first slot of this sub-pixel's list
    const unsigned int arrayPos = (yPixel*subWidth + xPixel)*ZDEPTH;
    const RGBA particleColor = colorFromTF(scalar);

    if(zPos > 0.001f){
      for(unsigned int i = arrayPos ; i < arrayPos+ZDEPTH ; i++){
        //find position for insertion:
        if(zPos < subPixelsDepthD[i]){
          //first, move the rest back by one (the last entry is dropped):
          for(unsigned int j = arrayPos+ZDEPTH - 1 ; j > i ; j--){
            subPixelsDepthD[j] = subPixelsDepthD[j-1];
            subPixelsD[j] = subPixelsD[j-1];
          }
          //and now store the new value
          subPixelsD[i] = particleColor;
          subPixelsDepthD[i] = zPos;
          //and only do this once!
          break;
        }
      }
    }

    // NOTE(review): this unconditional write to the first slot overrides the
    // sorted insertion above. It is kept because removing it would change the
    // rendered output; confirm whether it is intended (e.g. to seed a depth
    // buffer that is cleared to 0) or a leftover from the single-slot version.
    subPixelsD[arrayPos] = particleColor;
    subPixelsDepthD[arrayPos] = zPos;
  }


// Generates random particles inside each tetrahedron and projects the
// accepted ones into the sub-pixel buffers.
//
// Launch layout: 1D grid of 1D blocks, one thread per tetrahedron;
// blockDim.x must not exceed BLOCKSIZE_PROJECTION (size of the shared arrays).
//
// rngStatesD:      one curandState per tetrahedron, read and written back.
// numParticles:    total particle budget; scaled by volumeRatios[x] per cell.
// volumeRatios:    per-tetrahedron fraction of numParticles.
// numTetras:       number of entries in tetrasD / rngStatesD / volumeRatios.
// subpixelLevel:   unused here (SUBPIXELLEVEL is used by projectParticle).
// windowWidth/windowHeight, subPixelsD, subPixelsDepthD: forwarded to
//                  projectParticle().
// tetrasD:         tetrahedra, four vertices each.
__global__ void kernelProjection(curandState* rngStatesD, unsigned int numParticles, float* volumeRatios, unsigned int numTetras, unsigned int subpixelLevel, unsigned int windowWidth, unsigned int windowHeight, tetraFloat* tetrasD, RGBA* subPixelsD, float* subPixelsDepthD){

  const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
  // Valid indices are 0..numTetras-1; `>` let thread numTetras run out of bounds.
  if(x >= numTetras)
    return;

  curandState localRNGState = rngStatesD[x];

  //get the model coordinates of the 4 points of the current tetra
  //(each thread only ever touches its own slot, so no barrier is needed):
  __shared__ vertexFloat v0[BLOCKSIZE_PROJECTION];
  __shared__ vertexFloat v1[BLOCKSIZE_PROJECTION];
  __shared__ vertexFloat v2[BLOCKSIZE_PROJECTION];
  __shared__ vertexFloat v3[BLOCKSIZE_PROJECTION];
  v0[threadIdx.x] = tetrasD[x].v[0];
  v1[threadIdx.x] = tetrasD[x].v[1];
  v2[threadIdx.x] = tetrasD[x].v[2];
  v3[threadIdx.x] = tetrasD[x].v[3];

  // Skip cells whose summed vertex opacity is below 0.1.
  const float opacitySum = opacityFromTF(v0[threadIdx.x].scalar) + opacityFromTF(v1[threadIdx.x].scalar) + opacityFromTF(v2[threadIdx.x].scalar) + opacityFromTF(v3[threadIdx.x].scalar);
  if(opacitySum < 0.1f)
    return;

  const int cellParticles = static_cast<int>(volumeRatios[x] * static_cast<float>(numParticles));

  //generate the particles (the inclusive bound gives cellParticles + 1 candidates):
  for(int i = 0 ; i <= cellParticles ; i++){

    //generate 3 parameters randomly in the range 0..1
    float par0 = curand_uniform(&localRNGState);
    float par1 = curand_uniform(&localRNGState);
    float par2 = curand_uniform(&localRNGState);

    // Fold the unit cube onto the tetrahedron par0+par1+par2 <= 1
    // (Rocchini & Cignoni). Replacing each parameter by 1 - parameter is not
    // enough: for sums in (1,2) the sum stays above 1, the fourth weight goes
    // negative and the particle lands outside the cell.
    if(par0 + par1 > 1.0f){
      par0 = 1.0f - par0;
      par1 = 1.0f - par1;
    }
    if(par1 + par2 > 1.0f){
      const float tmp = par2;
      par2 = 1.0f - par0 - par1;
      par1 = 1.0f - tmp;
    }else if(par0 + par1 + par2 > 1.0f){
      const float tmp = par2;
      par2 = par0 + par1 + par2 - 1.0f;
      par0 = 1.0f - par1 - tmp;
    }

    //fourth parameter is calculated here:
    const float par3 = 1.0f - par0 - par1 - par2;

    //particle opacity equals emission probability:
    const float particleScalar = v0[threadIdx.x].scalar * par0 + v1[threadIdx.x].scalar * par1 + v2[threadIdx.x].scalar * par2 + v3[threadIdx.x].scalar * par3;

    //if opacity of particle >= random variable, project it. otherwise, reject it.
    if(opacityFromTF(particleScalar) >= curand_uniform(&localRNGState)){

      //finalize particle and project it:
      //position is calculated on the fly, minimize mem access
      projectParticle(v0[threadIdx.x].pos[0] * par0 + v1[threadIdx.x].pos[0] * par1 + v2[threadIdx.x].pos[0] * par2 + v3[threadIdx.x].pos[0] * par3,
                      v0[threadIdx.x].pos[1] * par0 + v1[threadIdx.x].pos[1] * par1 + v2[threadIdx.x].pos[1] * par2 + v3[threadIdx.x].pos[1] * par3,
                      v0[threadIdx.x].pos[2] * par0 + v1[threadIdx.x].pos[2] * par1 + v2[threadIdx.x].pos[2] * par2 + v3[threadIdx.x].pos[2] * par3,
                      particleScalar,subPixelsD,subPixelsDepthD,windowWidth,windowHeight);
    }
  }

  //write back updated state:
  rngStatesD[x] = localRNGState;

}



// Launches kernelProjection with one thread per tetrahedron in 1D blocks of
// BLOCKSIZE_PROJECTION threads.
//
// All pointer arguments are passed through to the kernel unchanged; see
// kernelProjection for their meaning. Nothing is launched when numTetras is 0.
// A CUDA error pending after the launch is reported on stderr; it is only
// peeked at, so callers that query the error state themselves still see it.
extern "C" void launchProjection(curandState* rngStatesD, unsigned int numParticles, float* volumeRatios, unsigned int numTetras, unsigned int windowWidth, unsigned int windowHeight, tetraFloat* tetrasD, RGBA* subPixelsD, float* subPixelsDepthD){
  const dim3 block(BLOCKSIZE_PROJECTION,1,1);
  // Integer ceil-division: enough blocks to cover numTetras threads.
  const unsigned int numBlocks = (numTetras + BLOCKSIZE_PROJECTION - 1) / BLOCKSIZE_PROJECTION;
  const dim3 grid(numBlocks,1,1);

  #if DEBUG_CU_FILE
    printf("LaunchKernel projection");
    printf(" \n num tetras = %u\n Block Size = %d x 1 x 1 \n Grid Size = %u",numTetras,static_cast<int>(BLOCKSIZE_PROJECTION),numBlocks);
    printf(" \n window width = %u, window height = %u\n",windowWidth,windowHeight);
  #endif

  // A grid of 0 blocks is an invalid launch configuration; there is no work anyway.
  if(numBlocks == 0)
    return;

  kernelProjection<<< grid, block>>>(rngStatesD, numParticles, volumeRatios, numTetras, SUBPIXELLEVEL, windowWidth, windowHeight, tetrasD, subPixelsD, subPixelsDepthD);

  const cudaError_t launchStatus = cudaPeekAtLastError();
  if(launchStatus != cudaSuccess){
    fprintf(stderr, "launchProjection: CUDA error after kernel launch: %s\n", cudaGetErrorString(launchStatus));
  }
}


#else

// Copies a host-side 4x4 float matrix (16 floats) into the mvpMatrixDevice
// constant-memory symbol.
//
// matrix: host pointer to the 4x4 matrix to upload.
//
// The function returns void, so a failed copy cannot be propagated to the
// caller; it is reported on stderr instead of being silently dropped.
extern "C" void mapMVPMatrixToDevice(float matrix[4][4]){
  const cudaError_t status = cudaMemcpyToSymbol(mvpMatrixDevice, matrix, sizeof(float)*16);
  if(status != cudaSuccess){
    fprintf(stderr, "mapMVPMatrixToDevice: cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(status));
  }
}

// Copies a host-side transfer function into the tfDevice constant-memory
// symbol.
//
// transferFunction: transfer function to upload (passed by value).
//
// The function returns void, so a failed copy cannot be propagated to the
// caller; it is reported on stderr instead of being silently dropped.
extern "C" void mapTFToDevice(transFunc<6,4> transferFunction){
  const cudaError_t status = cudaMemcpyToSymbol(tfDevice,&transferFunction,sizeof(transFunc<6,4>));
  if(status != cudaSuccess){
    fprintf(stderr, "mapTFToDevice: cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(status));
  }
}

// Returns the opacity for `scalar` by linear interpolation between the two
// neighbouring opacity control points of tfDevice.
//
// scalar: value to look up.
// Returns the interpolated opacity of the first interval that contains
// `scalar`, or 0.0f when no interval contains it.
__device__ float opacityFromTF(float scalar){
  for(int i = 0 ; i < tfDevice.numOpPoints - 1 ; i++){
    const float lowerScalar = tfDevice.opacityPoints[i].scalar;
    const float upperScalar = tfDevice.opacityPoints[i+1].scalar;
    // Bounds are inclusive: with strict comparisons a scalar lying exactly on
    // a control point matched no interval and was reported as fully transparent.
    if(scalar >= lowerScalar && scalar <= upperScalar){
      const float lowerOpacity = tfDevice.opacityPoints[i].opacity;
      const float upperOpacity = tfDevice.opacityPoints[i+1].opacity;
      const float span = upperScalar - lowerScalar;
      // Two control points on the same scalar: nothing to interpolate, and
      // dividing would yield 0/0.
      if(span <= 0.0f)
        return lowerOpacity;
      const float delta = (scalar - lowerScalar) / span;
      return lowerOpacity + (upperOpacity - lowerOpacity) * delta;
    }
  }
  return 0.0f;
}

// Builds the packed RGBA value for `scalar` from the transfer function in
// tfDevice.
//
// Byte 3 of the result is opacityFromTF(scalar) * 255; bytes 0..2 are the
// per-byte linear interpolation of the two neighbouring colour control points.
// Bytes 0..2 stay 0 when no colour interval contains `scalar`.
__device__ RGBA colorFromTF(float scalar){
  RGBA ret = 0x00000000;
  unsigned char color[4] = {0,0,0,0};
  //first, opacity:
  color[3] = static_cast<unsigned char>(opacityFromTF(scalar) * 255.0f);
  //next, color:
  for(int i = 0 ; i < tfDevice.numColPoints - 1 ; i++){
    const float lowerScalar = tfDevice.colorPoints[i].scalar;
    const float upperScalar = tfDevice.colorPoints[i+1].scalar;
    // Inclusive bounds so a scalar lying exactly on a control point gets that
    // point's colour instead of black.
    if(scalar >= lowerScalar && scalar <= upperScalar){
      const float span = upperScalar - lowerScalar;
      // Guard against two control points on the same scalar (0/0).
      const float delta = (span > 0.0f) ? (scalar - lowerScalar) / span : 0.0f;

      // Unpack both control-point colours into their four bytes.
      unsigned char tfColCharLow[4];
      memcpy(tfColCharLow,&tfDevice.colorPoints[i].color,4);
      unsigned char tfColCharHigh[4];
      memcpy(tfColCharHigh,&tfDevice.colorPoints[i+1].color,4);

      for(int c = 0 ; c < 3 ; c++){
        // The channel difference spans -255..255, so the scaled offset must be
        // held in an int: narrowing it to char is out of range (undefined) for
        // any offset beyond -128..127.
        const int low = tfColCharLow[c];
        const int offset = static_cast<int>(static_cast<float>(tfColCharHigh[c] - low) * delta);
        color[c] = static_cast<unsigned char>(low + offset);
      }
      // First matching interval wins, as in opacityFromTF.
      break;
    }
  }

  // Pack the four bytes back into the RGBA value.
  memcpy(&ret,color,4);

  return ret;
}


  // Projects one particle through mvpMatrixDevice and writes it into the
  // sub-pixel colour/depth buffers if it passes the depth test.
  //
  // pos0, pos1, pos2: particle position (w is taken as 1).
  // scalar:           particle scalar, mapped to a colour via colorFromTF().
  // subPixelsD:       colour buffer, one entry per sub-pixel.
  // subPixelsDepthD:  depth buffer with the same layout as subPixelsD.
  // windowWidth/windowHeight: window size in pixels; the buffers are indexed
  //                   at SUBPIXELLEVEL times that resolution per axis.
  //
  // Particles with |w| < 0.0001 or outside the sub-pixel grid are dropped.
  // NOTE(review): the depth test and the two stores are not atomic, so threads
  // that hit the same sub-pixel concurrently can race.
  __device__ void projectParticle(float pos0, float pos1, float pos2, float scalar, RGBA* subPixelsD, depthValue* subPixelsDepthD,unsigned int windowWidth, unsigned int windowHeight){
    float clipped[4];

    // vertex * MVPmatrix:
    for(int row = 0 ; row < 4 ; row++){
      clipped[row] = pos0 * mvpMatrixDevice[row][0] + pos1 * mvpMatrixDevice[row][1] + pos2 * mvpMatrixDevice[row][2] + 1.0f * mvpMatrixDevice[row][3];
    }

    // normalized device coordinates (skip when w is too close to zero):
    if(clipped[3] < 0.0001f && clipped[3] > -0.0001f)
      return;
    const float reciW = 1.0f/clipped[3];
    const float ndcX = clipped[0] * reciW;
    const float ndcY = clipped[1] * reciW;
    const float ndcZ = clipped[2] * reciW;

    // float literals keep the arithmetic in single precision
    const float zPos = (1.0f + ndcZ) * 0.5f;

    const unsigned int subWidth = windowWidth * SUBPIXELLEVEL;
    const unsigned int subHeight = windowHeight * SUBPIXELLEVEL;
    const float xSub = (ndcX * 0.5f + 0.5f) * static_cast<float>(subWidth);
    const float ySub = (ndcY * 0.5f + 0.5f) * static_cast<float>(subHeight);

    // Bounds are tested on the float values: converting first truncates
    // toward zero, which would map coordinates in (-1,0) onto column/row 0.
    // The negated form also rejects NaN.
    if(!(xSub >= 0.0f && xSub < static_cast<float>(subWidth)))
      return;
    if(!(ySub >= 0.0f && ySub < static_cast<float>(subHeight)))
      return;
    const unsigned int xPixel = static_cast<unsigned int>(xSub);
    const unsigned int yPixel = static_cast<unsigned int>(ySub);

    const unsigned int arrayPos = yPixel*subWidth + xPixel;
    //depth test: a stored depth <= 0.001 is always overwritten; otherwise
    //the particle is dropped when it lies behind the stored depth.
    if(zPos > subPixelsDepthD[arrayPos] && subPixelsDepthD[arrayPos] > 0.001f)
      return;

    subPixelsD[arrayPos] = colorFromTF(scalar);
    subPixelsDepthD[arrayPos] = zPos;
  }


// Generates random particles inside each tetrahedron and projects the
// accepted ones into the sub-pixel buffers.
//
// Launch layout: 1D grid of 1D blocks, one thread per tetrahedron;
// blockDim.x must not exceed BLOCKSIZE_PROJECTION (size of the shared arrays).
//
// rngStatesD:      one curandState per tetrahedron, read and written back.
// numParticles:    total particle budget; scaled by volumeRatios[x] per cell.
// volumeRatios:    per-tetrahedron fraction of numParticles.
// numTetras:       number of entries in tetrasD / rngStatesD / volumeRatios.
// subpixelLevel:   unused here (SUBPIXELLEVEL is used by projectParticle).
// windowWidth/windowHeight, subPixelsD, subPixelsDepthD: forwarded to
//                  projectParticle().
// tetrasD:         tetrahedra, four vertices each.
__global__ void kernelProjection(curandState* rngStatesD, unsigned int numParticles, float* volumeRatios, unsigned int numTetras, unsigned int subpixelLevel, unsigned int windowWidth, unsigned int windowHeight, tetraFloat* tetrasD, RGBA* subPixelsD, float* subPixelsDepthD){

  const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
  // Valid indices are 0..numTetras-1; `>` let thread numTetras run out of bounds.
  if(x >= numTetras)
    return;

  curandState localRNGState = rngStatesD[x];

  //get the model coordinates of the 4 points of the current tetra
  //(each thread only ever touches its own slot, so no barrier is needed):
  __shared__ vertexFloat v0[BLOCKSIZE_PROJECTION];
  __shared__ vertexFloat v1[BLOCKSIZE_PROJECTION];
  __shared__ vertexFloat v2[BLOCKSIZE_PROJECTION];
  __shared__ vertexFloat v3[BLOCKSIZE_PROJECTION];
  v0[threadIdx.x] = tetrasD[x].v[0];
  v1[threadIdx.x] = tetrasD[x].v[1];
  v2[threadIdx.x] = tetrasD[x].v[2];
  v3[threadIdx.x] = tetrasD[x].v[3];

  // Skip cells whose mean vertex opacity is below 0.05.
  const float opacitySum = opacityFromTF(v0[threadIdx.x].scalar) + opacityFromTF(v1[threadIdx.x].scalar) + opacityFromTF(v2[threadIdx.x].scalar) + opacityFromTF(v3[threadIdx.x].scalar);
  if(opacitySum/4.0f < 0.05f)
    return;

  const int cellParticles = static_cast<int>(volumeRatios[x] * static_cast<float>(numParticles));

  //generate the particles (the inclusive bound gives cellParticles + 1 candidates):
  for(int i = 0 ; i <= cellParticles ; i++){

    //generate 3 parameters randomly in the range 0..1
    float par0 = curand_uniform(&localRNGState);
    float par1 = curand_uniform(&localRNGState);
    float par2 = curand_uniform(&localRNGState);

    // Fold the unit cube onto the tetrahedron par0+par1+par2 <= 1
    // (Rocchini & Cignoni). Replacing each parameter by 1 - parameter is not
    // enough: for sums in (1,2) the sum stays above 1, the fourth weight goes
    // negative and the particle lands outside the cell.
    if(par0 + par1 > 1.0f){
      par0 = 1.0f - par0;
      par1 = 1.0f - par1;
    }
    if(par1 + par2 > 1.0f){
      const float tmp = par2;
      par2 = 1.0f - par0 - par1;
      par1 = 1.0f - tmp;
    }else if(par0 + par1 + par2 > 1.0f){
      const float tmp = par2;
      par2 = par0 + par1 + par2 - 1.0f;
      par0 = 1.0f - par1 - tmp;
    }

    //fourth parameter is calculated here:
    const float par3 = 1.0f - par0 - par1 - par2;

    //particle opacity equals emission probability:
    const float particleScalar = v0[threadIdx.x].scalar * par0 + v1[threadIdx.x].scalar * par1 + v2[threadIdx.x].scalar * par2 + v3[threadIdx.x].scalar * par3;

    //if opacity of particle >= random variable, project it. otherwise, reject it.
    if(opacityFromTF(particleScalar) >= curand_uniform(&localRNGState)){

      //finalize particle and project it:
      //position is calculated on the fly, minimize mem access
      projectParticle(v0[threadIdx.x].pos[0] * par0 + v1[threadIdx.x].pos[0] * par1 + v2[threadIdx.x].pos[0] * par2 + v3[threadIdx.x].pos[0] * par3,
                      v0[threadIdx.x].pos[1] * par0 + v1[threadIdx.x].pos[1] * par1 + v2[threadIdx.x].pos[1] * par2 + v3[threadIdx.x].pos[1] * par3,
                      v0[threadIdx.x].pos[2] * par0 + v1[threadIdx.x].pos[2] * par1 + v2[threadIdx.x].pos[2] * par2 + v3[threadIdx.x].pos[2] * par3,
                      particleScalar,subPixelsD,subPixelsDepthD,windowWidth,windowHeight);
    }
  }

  //write back updated state:
  rngStatesD[x] = localRNGState;

}



// Launches kernelProjection with one thread per tetrahedron in 1D blocks of
// BLOCKSIZE_PROJECTION threads.
//
// All pointer arguments are passed through to the kernel unchanged; see
// kernelProjection for their meaning. Nothing is launched when numTetras is 0.
// A CUDA error pending after the launch is reported on stderr; it is only
// peeked at, so callers that query the error state themselves still see it.
extern "C" void launchProjection(curandState* rngStatesD, unsigned int numParticles, float* volumeRatios, unsigned int numTetras, unsigned int windowWidth, unsigned int windowHeight, tetraFloat* tetrasD, RGBA* subPixelsD, float* subPixelsDepthD){
  const dim3 block(BLOCKSIZE_PROJECTION,1,1);
  // Integer ceil-division: enough blocks to cover numTetras threads.
  const unsigned int numBlocks = (numTetras + BLOCKSIZE_PROJECTION - 1) / BLOCKSIZE_PROJECTION;
  const dim3 grid(numBlocks,1,1);

  #if DEBUG_CU_FILE
    printf("LaunchKernel projection");
    printf(" \n num tetras = %u\n Block Size = %d x 1 x 1 \n Grid Size = %u",numTetras,static_cast<int>(BLOCKSIZE_PROJECTION),numBlocks);
    printf(" \n window width = %u, window height = %u\n",windowWidth,windowHeight);
  #endif

  // A grid of 0 blocks is an invalid launch configuration; there is no work anyway.
  if(numBlocks == 0)
    return;

  kernelProjection<<< grid, block>>>(rngStatesD, numParticles, volumeRatios, numTetras, SUBPIXELLEVEL, windowWidth, windowHeight, tetrasD, subPixelsD, subPixelsDepthD);

  const cudaError_t launchStatus = cudaPeekAtLastError();
  if(launchStatus != cudaSuccess){
    fprintf(stderr, "launchProjection: CUDA error after kernel launch: %s\n", cudaGetErrorString(launchStatus));
  }
}


#endif

