
#include "CudaRenderer.h"
#include "VBOCreator.h"

#include <cstdlib> 
#include <ctime> 

#include "boost/numeric/ublas/matrix.hpp"
#include "boost/numeric/ublas/lu.hpp"
#include <boost/numeric/ublas/io.hpp>

#include "windows.h"

// Functions with C linkage that are only declared here; their definitions are
// not part of this file (presumably the CUDA translation unit — confirm).

// Receives the 4x4 matrix assembled in CudaRenderer::mapOpenGLMatrixToDevice.
extern "C" void mapMVPMatrixToDevice(float matrix[4][4]);
// Receives the transfer function assembled in CudaRenderer::initTransferFunction.
extern "C" void mapTFToDevice(transFunc<6,4> transferFunction);
// First of the two per-frame launches issued by CudaRenderer::launchKernel.
extern "C" void launchProjection(curandState* rngStatesD, unsigned int numParticles, float* volumeRatios, unsigned int numTetras, unsigned int windowWidth, unsigned int windowHeight, tetraFloat* tetrasD, RGBA* subPixelsD, depthValue* subPixelsDepthD);
// Second per-frame launch; it is handed the mapped pixel buffer as `pixels`.
extern "C" void launchSuperimposingSpatial(unsigned int windowWidth, unsigned int windowHeight, unsigned int subPixelLevel, RGBA* pixels, RGBA* subPixels, depthValue* subPixelsDepthD);
// Returns the RNG state pointer that CudaRenderer::initRNG stores in m_rngStates.
extern "C" curandState* initRNGCuda(unsigned int numTetras, unsigned int seed);



// Puts the renderer into its pre-initialisation state: no CUDA resources are
// created here (initCuda is run lazily by launchKernel), the FPS bookkeeping is
// reset, the camera rotation is zeroed and the frame timer is created.
//
// The particle budget depends on which dataset macro (TUMOR, DRAGON, HEART) is
// enabled at compile time; if several are enabled the last one listed wins.
CudaRenderer::CudaRenderer(){

  m_className = "CudaRenderer";
  m_cudaIsInit = false;

  m_fpsCount = 0;
  m_fpsLimit = 1;
  m_frameCount = 0;
  m_timer = 0;

  m_rotX = 0.0f;
  m_rotY = 0.0f;

  // Defined fallback: without it the count would be read uninitialised by
  // launchKernel whenever no dataset macro is enabled.
  m_numParticles = 0;

  const unsigned int million = 1000000;
#if TUMOR
  m_numParticles = 12 * million;
#endif

#if DRAGON
  m_numParticles = 130 * million;
#endif

#if HEART
  m_numParticles = 60 * million;
#endif

  m_rngStates = NULL;

  cutilCheckError( cutCreateTimer( &m_timer));
}

// One-time CUDA setup: allocates the device buffers, registers the OpenGL
// pixel buffer object with CUDA, creates the RNG states and finally marks the
// renderer as initialised (launchKernel checks that flag).
void CudaRenderer::initCuda(){
  // initTetraBufferD must run before initVolumeRatioBufferD: the latter sizes
  // its buffers with m_numTetras, which the former assigns.
  initTetraBufferD();
  initSubPixelsD();
  initSubPixelsDepthD();
  initVolumeRatioBufferD();

  // Register the PBO handle supplied by VBOCreator; the resulting resource is
  // what mapPixelBuffer / unmapPixelBuffer operate on.
  cutilSafeCall(cudaGraphicsGLRegisterBuffer(&m_pboPixelsCuda, VBOCreator::getInstance()->getPBOPixels(), cudaGraphicsMapFlagsNone));

  initRNG();
  
  m_cudaIsInit = true;
}


// Maps the registered pixel buffer resource (m_pboPixelsCuda) for CUDA access.
// launchKernel pairs every call with unmapPixelBuffer().
void CudaRenderer::mapPixelBuffer(){
  cutilSafeCall(cudaGraphicsMapResources(1, &m_pboPixelsCuda));
}

// Unmaps the pixel buffer resource previously mapped by mapPixelBuffer().
void CudaRenderer::unmapPixelBuffer(){
  cutilSafeCall(cudaGraphicsUnmapResources(1, &m_pboPixelsCuda));
}

// Creates the RNG states used by the projection launch and keeps the returned
// pointer in m_rngStates. The seed handed to initRNGCuda is a single rand()
// draw taken right after re-seeding the C library generator from the clock.
void CudaRenderer::initRNG(){
  srand(static_cast<unsigned int>(time(0)));
  const unsigned int deviceSeed = rand();

  // One state set is requested for the tetra count reported by VBOCreator.
  const unsigned int tetraCount = VBOCreator::getInstance()->getNumTetras();
  m_rngStates = initRNGCuda(tetraCount, deviceSeed);
}



// Advances the frame counters and, whenever m_fpsCount reaches m_fpsLimit,
// reports the frame rate derived from the average value of m_timer.
//
// After a report the window length (m_fpsLimit) is set to the measured fps,
// but never below 1, and the timer is reset. With FPS_QA enabled the particle
// budget is scaled by ifps/MIN_FPS when the rate is below MIN_FPS and by
// ifps/MAX_FPS when it is above MAX_FPS.
void CudaRenderer::computeFPS()
{
  m_frameCount++;
  m_fpsCount++;

  if (m_fpsCount == m_fpsLimit) {
    const float avgMs = cutGetAverageTimerValue(m_timer);

    // A zero, negative or NaN average cannot be turned into a frame rate:
    // the division would give a non-finite value, and casting that to int
    // (or scaling the particle count with it) is undefined. Start a fresh
    // one-frame window instead of reporting.
    if (!(avgMs > 0.f)) {
      m_fpsCount = 0;
      m_fpsLimit = 1;
      cutilCheckError(cutResetTimer(m_timer));
      return;
    }

    char fps[256];
    const float ifps = 1.f / (avgMs / 1000.f);
    sprintf(fps, "Cuda GL Interop (VBO): %3.1f fps (Max 100Hz)", ifps);  

    std::cout << fps << std::endl;
    m_fpsCount = 0; 
    m_fpsLimit = (int)MAX(ifps, 1.f);

#if FPS_QA
    if(ifps < MIN_FPS){
      // Too slow: shrink the particle budget proportionally.
      const float ratio = ifps / static_cast<float>(MIN_FPS);
      m_numParticles *= ratio;
    }
    if(ifps > MAX_FPS){
      // Faster than needed: grow the particle budget proportionally.
      const float ratio = ifps / static_cast<float>(MAX_FPS);
      m_numParticles *= ratio;
    }

    std::cout << "Num particles = " << m_numParticles << std::endl;
#endif
    cutilCheckError(cutResetTimer(m_timer));  
  }
}



// Renders one frame into the OpenGL pixel buffer.
//
// Steps: lazy CUDA initialisation, upload of the camera matrix and transfer
// function, clearing of the sub-pixel colour/depth buffers and of the mapped
// pixel buffer, then the projection launch followed by the superimposing
// launch. The whole sequence is timed with m_timer and fed to computeFPS().
//
// windowWidth / windowHeight are forwarded to both launches and determine how
// many bytes of the mapped pixel buffer are cleared. The sub-pixel buffers are
// sized from the WINDOWWIDTH / WINDOWHEIGHT macros instead.
void CudaRenderer::launchKernel(unsigned int windowWidth, unsigned int windowHeight){

  if(!m_cudaIsInit)
    initCuda();

  cutilCheckError(cutStartTimer(m_timer));  

  mapOpenGLMatrixToDevice();
  initTransferFunction();

  mapPixelBuffer();

  const unsigned int numSubPixels = WINDOWWIDTH*WINDOWHEIGHT*SUBPIXELLEVEL*SUBPIXELLEVEL*ZDEPTH;
  cutilSafeCall(cudaMemset(m_subPixelsD, 0xff, numSubPixels * sizeof(RGBA)));
  // The depth buffer is allocated as numSubPixels * sizeof(float) in
  // initSubPixelsDepthD, so it must be cleared with that same element size;
  // using sizeof(RGBA) here would overrun it if the two sizes ever differ.
  cutilSafeCall(cudaMemset(m_subPixelsDepthD, 0x00, numSubPixels * sizeof(float)));

  size_t numBytes = 0;
  RGBA* pixelsD = NULL;
  cutilSafeCall(cudaGraphicsResourceGetMappedPointer((void **)&pixelsD, &numBytes, m_pboPixelsCuda));

  // 4 bytes per pixel, but never more than the mapped buffer actually holds.
  size_t clearBytes = static_cast<size_t>(windowWidth) * windowHeight * 4;
  if(clearBytes > numBytes)
    clearBytes = numBytes;
  cutilSafeCall(cudaMemset(pixelsD, 0xff, clearBytes));

  launchProjection(m_rngStates,m_numParticles,m_volumeRatiosD,m_numTetras, windowWidth, windowHeight,m_tetrasD,m_subPixelsD, m_subPixelsDepthD);
  // Check the synchronisation result so that a failure in the preceding
  // launch is reported instead of silently producing a broken frame.
  cutilSafeCall(cudaDeviceSynchronize());
  launchSuperimposingSpatial(windowWidth,windowHeight,SUBPIXELLEVEL,pixelsD,m_subPixelsD, m_subPixelsDepthD);
  cutilSafeCall(cudaDeviceSynchronize());

  unmapPixelBuffer();

  cutilCheckError(cutStopTimer(m_timer));  
  computeFPS();
}

// Stores the camera rotation. mapOpenGLMatrixToDevice later passes rotX to
// glRotatef about the X axis and rotY about the Y axis, so both are angles in
// the unit glRotatef expects (degrees).
void CudaRenderer::updateCamRotation(float rotX, float rotY){
  m_rotX = rotX;
  m_rotY = rotY;
}

// Builds the camera matrices on the fixed-function OpenGL matrix stacks and
// passes their product to mapMVPMatrixToDevice.
//
// Modelview: identity, a dataset-specific translation along z (TUMOR 60,
// DRAGON 170, HEART 300), then rotations by m_rotX about X and m_rotY about Y.
// Projection: 45 degree perspective with the WINDOWWIDTH/WINDOWHEIGHT aspect
// ratio and near/far planes at 1 and 1000.
//
// Side effects: overwrites the current GL_MODELVIEW and GL_PROJECTION
// matrices and leaves GL_PROJECTION as the active matrix mode.
void CudaRenderer::mapOpenGLMatrixToDevice(){

  glMatrixMode(GL_MODELVIEW);
  glLoadIdentity();

#if TUMOR
  glTranslatef(0,0,60);
#endif

#if DRAGON
  glTranslatef(0,0,170);
#endif

#if HEART
  glTranslatef(0,0,300);
#endif

  glRotatef(m_rotX,1.0f,0.0f,0.0f);
  glRotatef(m_rotY,0.0f,1.0f,0.0f);

  const float zNear = 1.0f;
  const float zFar = 1000.0f;

  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  gluPerspective (45, (float)WINDOWWIDTH/(float)WINDOWHEIGHT, zNear, zFar);

  // Read each matrix back exactly once. glGetFloatv returns them in
  // column-major order, i.e. element (row r, column c) is at index 4*c + r.
  float proj[16];
  float modl[16];
  glGetFloatv( GL_PROJECTION_MATRIX, proj );
  glGetFloatv( GL_MODELVIEW_MATRIX, modl );

  // mvp[i][j] = sum_k proj(i,k) * modl(k,j): the product projection * modelview
  // written out with ordinary [row][column] indexing. The four terms are added
  // in the order k = 0..3.
  float mvp[4][4];
  for(int i = 0 ; i < 4 ; i++){
    for(int j = 0 ; j < 4 ; j++){
      mvp[i][j] = modl[4*j + 0] * proj[ 0 + i]
                + modl[4*j + 1] * proj[ 4 + i]
                + modl[4*j + 2] * proj[ 8 + i]
                + modl[4*j + 3] * proj[12 + i];
    }
  }
  mapMVPMatrixToDevice(mvp);

}

// Fills a transFunc<6,4> with the control points of the dataset selected at
// compile time (TUMOR, DRAGON or HEART) and passes it to mapTFToDevice.
// When none of the three macros is enabled the function does nothing.
//
// Each opacity row is {scalar, opacity}. Colour points are a scalar plus a
// colour value that is copied verbatim; how that value is interpreted is up to
// the consumer and is not visible in this file.
void CudaRenderer::initTransferFunction(){

#if TUMOR || DRAGON || HEART

#if TUMOR
  static const float opacityTable[6][2] = {
    { -1.0f,  0.0f },
    {  0.7f,  0.0f },
    {  0.75f, 0.5f },
    {  1.0f,  0.7f },
    {  1.15f, 0.7f },
    {  2.0f,  0.7f }
  };
  static const float colorScalars[4] = { 0.69f, 0.8f, 1.0f, 1.5f };
  static const int   colorValues[4]  = { 0x00ff0000, 0x00ff0000, 0x000000ff, 0x000000ff };
#endif

#if DRAGON
  static const float opacityTable[6][2] = {
    { -1.0f,     0.0f },
    {  0.0f,     0.0f },
    {  0.0015f,  0.0f },
    {  0.005f,   1.0f },
    {  0.02116f, 1.0f },
    {  1.0f,     1.0f }
  };
  static const float colorScalars[4] = { -2.0f, -0.00063f, 0.0211f, 2.0f };
  static const int   colorValues[4]  = { 0x00ff0000, 0x00ff0000, 0x0000ff00, 0x0000ff00 };
#endif

#if HEART
  static const float opacityTable[6][2] = {
    {   0.0f, 0.0f },
    { 180.0f, 0.3f },
    { 320.0f, 1.0f },
    { 481.0f, 1.0f },
    { 482.0f, 1.0f },
    { 483.0f, 1.0f }
  };
  static const float colorScalars[4] = { 180.0f, 330.0f, 480.0f, 481.0f };
  static const int   colorValues[4]  = { 0x00ff0000, 0x0000ff00, 0x000000ff, 0x000000ff };
#endif

  transFunc<6,4> tf;
  tf.numOpPoints = 6;
  tf.numColPoints = 4;

  // Copy the selected tables into the transfer function, point by point.
  for(int p = 0 ; p < 6 ; p++){
    tf.opacityPoints[p].scalar = opacityTable[p][0];
    tf.opacityPoints[p].opacity = opacityTable[p][1];
  }

  for(int c = 0 ; c < 4 ; c++){
    tf.colorPoints[c].scalar = colorScalars[c];
    tf.colorPoints[c].color = colorValues[c];
  }

  mapTFToDevice(tf);

#endif

}


// Fetches the tetrahedra and their count from VBOCreator, allocates a device
// buffer of matching size (m_tetrasD), copies the host data into it and logs
// the buffer size in megabytes.
void CudaRenderer::initTetraBufferD(){
  m_tetras = VBOCreator::getInstance()->getTetras();
  m_numTetras = VBOCreator::getInstance()->getNumTetras();

  // Size of the whole tetra array in bytes, shared by all three steps below.
  const size_t tetraBytes = m_numTetras * sizeof(tetraFloat);

  cutilSafeCall(cudaMalloc((void **)&m_tetrasD, tetraBytes));
  cutilSafeCall(cudaMemcpy(m_tetrasD, m_tetras, tetraBytes, cudaMemcpyHostToDevice));
  std::cout << "Allocating tetras: " << tetraBytes / (double)(1024*1024) << " mb" << std::endl;
}

// Computes, for every tetrahedron, its volume divided by the total grid volume
// and stores the resulting array on the device (m_volumeRatiosD).
//
// Relies on m_numTetras, which initTetraBufferD assigns, so it must be called
// after that function. The per-tetra volumes come from VBOCreator and the total
// volume from UGridTetrahedrizer.
void CudaRenderer::initVolumeRatioBufferD(){
  float* volumes = VBOCreator::getInstance()->getTetraVolumes();
  float totalVolume = UGridTetrahedrizer::getInstance()->getGridVolume();

  // Temporary host staging array; released again once the copy is done.
  float* volumeRatiosH = new float[m_numTetras];

  for(unsigned int i = 0 ; i < m_numTetras ; i++){
    volumeRatiosH[i] = volumes[i]/totalVolume;
  }

  cutilSafeCall(cudaMalloc((void**)&m_volumeRatiosD, m_numTetras*sizeof(float)));
  std::cout << "Allocating ratios: " << m_numTetras*sizeof(float) /(double)(1024*1024) << " mb" << std::endl;
  cutilSafeCall(cudaMemcpy(m_volumeRatiosD,volumeRatiosH,m_numTetras*sizeof(float),cudaMemcpyHostToDevice));

  // cudaMemcpy has returned, so the host copy is no longer needed; freeing it
  // fixes the leak of the staging array.
  delete[] volumeRatiosH;
}

void CudaRenderer::initSubPixelsD(){
  unsigned int numSubPixels = WINDOWWIDTH*WINDOWHEIGHT*SUBPIXELLEVEL*SUBPIXELLEVEL*ZDEPTH;
  std::cout << "Allocating subpixels: " << numSubPixels * sizeof(RGBA) /(double)(1024*1024) << " mb" << std::endl;
  cutilSafeCall(cudaMalloc((void **)&m_subPixelsD, numSubPixels * sizeof(RGBA)));  
  cutilSafeCall(cudaMemset(m_subPixelsD, 0xff, numSubPixels * sizeof(RGBA)));
}

// Allocates the sub-pixel depth buffer (m_subPixelsDepthD) with one
// float-sized entry per sub-pixel sample, logs its size in megabytes and
// zero-fills it.
// NOTE(review): the buffer pointer is passed on as depthValue*; this sizing
// assumes depthValue is float-sized — confirm against its declaration.
void CudaRenderer::initSubPixelsDepthD(){
  const unsigned int subPixelCount = WINDOWWIDTH*WINDOWHEIGHT*SUBPIXELLEVEL*SUBPIXELLEVEL*ZDEPTH;
  const size_t depthBytes = subPixelCount * sizeof(float);

  cutilSafeCall(cudaMalloc((void **)&m_subPixelsDepthD, depthBytes));
  std::cout << "Allocating depth: " << depthBytes / (double)(1024*1024) << " mb" << std::endl;
  cutilSafeCall(cudaMemset(m_subPixelsDepthD, 0x00, depthBytes));
}