/*
  Softshell: Dynamic Scheduling on GPUs.
  http://www.icg.tugraz.at/project/mvp

  Copyright (C) 2012 Institute for Computer Graphics and Vision,
                     Graz University of Technology

  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
              Bernhard Kainz - kainz ( at ) icg.tugraz.at
              Michael Kenzel - kenzel ( at ) icg.tugraz.at
              Stefan Hauswiesner - hauswiesner ( at ) icg.tugraz.at
              Bernhard Kerbl - kerbl ( at ) icg.tugraz.at
              Dieter Schmalstieg - schmalstieg ( at ) icg.tugraz.at

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
*/


/*
* file created by    Markus Steinberger / steinberger ( at ) icg.tugraz.at
*
* modifications by
*/

#ifndef SOFTSHELL_QUEUE_QUEUE_INCLUDED
#define SOFTSHELL_QUEUE_QUEUE_INCLUDED


#include "tools/types.h"
#include "queue/queueEntry.cuh"
#include "communicator/dparams.cuh"
#include "tools/common.cuh"
#include "tools/bitonicSort.cuh"

namespace Softshell
{
  class Workpackage;

  /*
  * Device-side work queue: a fixed-size ring buffer of QueueEntry slots that
  * many threads fill (enqueue) and drain (dequeue) concurrently, plus an
  * in-place priority sort of a window of queued entries (sortIteration).
  *
  * Index conventions used by the code below:
  *  - front is advanced with atomicInc(&front, QueueSize-1) and therefore
  *    always lies in [0, QueueSize).
  *  - back is advanced with a plain atomicAdd and never reduced; it is a
  *    free-running counter and has to be taken "% QueueSize" before it is
  *    compared with slot indices. QueueSize divides 2^32, so the reduced
  *    value stays consistent when the uint counter itself wraps.
  *  - sortingFence == QueueSize means "no sort in progress"; any other value
  *    is the slot index at which the window currently being sorted starts.
  *
  * The per-slot protocol (init/acquire/ready/read/free/isReady) is implemented
  * by QueueEntry in queue/queueEntry.cuh and is not visible here.
  */
  class Queue
  {
    static const int QueueSize = 2U*1024U*1024U;
    volatile QueueEntry queue[QueueSize];

  public:
    uint front, back;               // dequeue slot index / free-running enqueue counter (see above)
    int count;                      // reserved entries; may dip below 0 transiently inside dequeue()
    volatile uint sortingFence;     // start slot of the window being sorted, QueueSize if none
    volatile bool hitSortingFence;  // set by dequeue() whenever it had to wait on the fence
    uint sortingMinBorder;          // minimum distance kept between front and the sort window
    uint lastSortEnd;               // start of the previous sort window, QueueSize to restart at the back


  public:

    /*
    * Resets all slots and the bookkeeping members.
    * The slot loop is strided over threadIdx.x/blockIdx.x, so the calling
    * threads share the work. front/back/count are written by thread 0 of
    * block 0 only; the remaining members are written by every calling thread
    * (all with the same values).
    * No barrier is issued here; callers have to synchronise before using the queue.
    *
    * _sortingMinBorder: initial value for sortingMinBorder.
    */
    __device__ void init(uint _sortingMinBorder = 128)
    {
      for(int id = threadIdx.x + blockIdx.x*blockDim.x;
        id < QueueSize;
        id += blockDim.x*gridDim.x)
        queue[id].init();
      if(threadIdx.x == 0 && blockIdx.x == 0)
        front = back = count = 0;
      lastSortEnd = sortingFence = QueueSize;
      hitSortingFence = false;
      sortingMinBorder = _sortingMinBorder;

    }

    /* Overrides the minimum border kept between front and the sort window. */
    __device__ void setSortingBorder(uint val) { sortingMinBorder = val; }

    /*
    * Appends one entry per calling thread.
    * The active threads of a warp are combined: the first active lane
    * reserves space for all of them with one atomicAdd on count and one on
    * back, and publishes the base slot through shared memory.
    *
    * workpackage, proc, forThreads: stored into the slot's wp, func and
    *                                numThreads fields.
    * returns: true on success. On overflow an error is printed and trap() is
    *          called; the "return false" after it is only reached if trap()
    *          returns.
    *
    * NOTE(review): spos is indexed by threadIdx.x/32 only, i.e. this assumes a
    * one-dimensional block - confirm against the callers.
    * NOTE(review): the overflow test looks at the count *before* this warp's
    * reservation, so a warp can reserve up to ourcount-1 slots past capacity;
    * presumably acquire() then waits for the slot to be released - confirm.
    */
    __device__ bool enqueue(Workpackage* workpackage, DeviceEntryProcedureFunc proc, uint forThreads)
    {
      // one hand-over cell per warp of the block
      __shared__ int spos[32];

      //combine
      uint mask = __ballot(1);
      int ourcount = __popc(mask);
      uint mypos = __popc(lanemask_lt() & mask);
      uint warpid = threadIdx.x/32;

      if(mypos == 0)
      {
        int old = atomicAdd(&count, ourcount);
        if(old >= QueueSize)
        {
          // undo the reservation and flag the overflow for the whole warp
          atomicSub(&count,ourcount);
          spos[warpid] = -1;
        }
        else
        {
          // back is free-running: reduce it to a slot index before it goes
          // into the signed hand-over cell. Storing the raw counter would
          // alias the -1 overflow marker once back reaches 0xFFFFFFFF.
          // The final slot is unchanged because QueueSize divides 2^32.
          uint base = atomicAdd(&back, (uint)ourcount);
          spos[warpid] = (int)(base % QueueSize);
        }
      }

      //force scheduler to combine divergent threads again
      uint mask2 = __ballot(1);
      uint mypos2 = __popc(lanemask_lt() & mask2);

      if(spos[warpid] == -1)
      {
        printf("ERROR in queue: queue overflow!\nABORT\n");
        trap();
        return false;
      }

      // every lane takes the slot at its rank behind the warp's base slot
      uint pos = spos[warpid] + mypos2;
      pos = pos % QueueSize;

      //printf("enqueing for %llx %llx for threads %d @ %d\n", workpackage,proc, forThreads, pos);
      queue[pos].acquire();
      queue[pos].func = proc;
      queue[pos].wp = workpackage;
      queue[pos].numThreads = forThreads;
      // make the payload visible before the slot is marked ready
      __threadfence();
      queue[pos].ready();
      return true;
    }

    /*
    * Removes one entry for the calling thread.
    *
    * wp, func:   receive the entry's work package and entry procedure; both
    *             are set to 0 first and stay 0 if the queue is empty.
    * numThreads: receives the entry's thread count; untouched if the queue
    *             is empty.
    * returns:    true if an entry was taken, false if count was not positive.
    *
    * While a sort is in progress (sortingFence != QueueSize) the call spins
    * until the reserved slot is known to lie in front of the fence or the
    * fence is lifted, and records the stall in hitSortingFence.
    */
    __device__ bool dequeue(Workpackage* volatile * wp,
                            DeviceEntryProcedureFunc volatile * func,
                            uint volatile & numThreads)
    {
      *wp = 0;
      *func = 0;

      int c = atomicSub(&count, 1);
      if(c <= 0)
      {
        // nothing to take: give the reservation back
        atomicAdd(&count, 1);
        return false;
      }
      else
      {
        // atomicInc wraps front back to 0 after QueueSize-1
        uint pos = atomicInc(&front, QueueSize-1);

        //make sure we don't pull things which are being sorted
        //(a spin counter that traps after a fixed number of rounds can be
        // added to this loop when hunting for a fence that never lifts)
        uint currentfence;
        while((currentfence = sortingFence) != QueueSize)
        {
          // our slot lies before the fence: safe
          if(currentfence > pos) break;
          // wrapped ring: the fence sits in the wrapped-around part in front
          // of back while our slot is still in the upper part. back is a
          // free-running counter, so it has to be reduced to a slot index
          // first (same as in sortIteration), and it is re-read through a
          // volatile pointer because enqueuers advance it while we spin.
          uint cBack = *((volatile uint*)(&back)) % QueueSize;
          if(currentfence < cBack && cBack < pos) break;
          //ouch, we are blocked due to sorting!
          hitSortingFence = true;
        }

        queue[pos].read();
        *wp = queue[pos].wp;
        *func = queue[pos].func;
        numThreads = queue[pos].numThreads;
        queue[pos].free();
        return true;
      }
    }

    /*
    * Sorts one window of num queued entries in place by priority.
    * Has to be called by all `threads` participating threads together; they
    * meet at syncthreads(id, threads) barriers (helper declared elsewhere,
    * presumably a named barrier - confirm in tools/common.cuh).
    *
    * PriorityType:   key type stored in priorities.
    * PriorityCaller: provides static priority(wp, func) yielding the key.
    * ids:            scratch array, at least num elements, receives slot indices.
    * priorities:     scratch array, at least num elements, receives the keys.
    *                 (both presumably shared between the participating
    *                 threads - confirm against the caller)
    * num:            window size; must not exceed 2*threads because the copy
    *                 phase moves at most two entries per thread.
    * linId:          index of the calling thread within the group, 0..threads-1.
    * threads:        number of participating threads.
    *
    * Steps: thread 0 picks the window -> all threads wait for the entries to
    * become ready and load slot ids + keys -> thread 0 raises sortingFence and
    * re-checks the distance to front -> bitonic sort -> entries are copied
    * out and written back to the slots given by the sorted ids -> thread 0
    * lowers the fence and, if a dequeuer was stalled, enlarges
    * sortingMinBorder by 64 and reports the new value through d_params.
    */
    template<class PriorityType, class PriorityCaller>
    __forceinline__ __device__ void sortIteration(uint* ids, PriorityType* priorities,  uint num, uint linId, uint threads)
    {
      // start of the window for this iteration, -1 if the iteration is skipped
      __shared__ int sortStart;
      //return;

      if(2*threads < num)
      {
        printf("Queue Sorting Error: %d threads is are too few for sorting %d elements\n", threads, num);
        // The copy-in/write-out phase below handles only the entries at linId
        // and linId+threads. Going on would apply just a part of the sorted
        // permutation and thereby duplicate some entries and drop others.
        // The condition is identical for all participating threads and no
        // barrier has been passed yet, so everybody leaves together.
        return;
      }
      int cFront = 0;
      if(linId == 0)
      {
        cFront = *((volatile uint*)(&front));
        int cBack = *((volatile uint*)(&back))%QueueSize;

        int thisSortEnd = lastSortEnd;
        //construct not ringbuffered
        if(cFront > cBack)
        {
          cBack += QueueSize;
          if(cBack > thisSortEnd)
            thisSortEnd += QueueSize;
        }

        //compute next sorting position
        if(thisSortEnd == QueueSize || thisSortEnd < cFront)
          thisSortEnd = cBack - (int)num;          // (re)start with the newest num entries
        else
          thisSortEnd = lastSortEnd - (num/2);     // move half a window towards front

        //is there enough border?
        int maxfill = thisSortEnd - (int)sortingMinBorder;
        if(maxfill < cFront || count < (int)(512 + sortingMinBorder + num))
        {
          lastSortEnd = QueueSize;
          sortStart = -1;
        }
        else
        {
          sortStart = thisSortEnd;
        }
      }


      syncthreads(1, threads);
      if(sortStart < 0) return;

      //load in data
      for(uint i = linId; i < num; i += threads)
      {
        uint elementId = (sortStart + i) % QueueSize;
        // spin until the enqueuer has published this slot
        //(a spin counter that traps after a fixed number of rounds can be
        // added here when hunting for an entry that never gets ready)
        while(!queue[elementId].isReady());
        ids[i] = elementId;
        priorities[i] = PriorityCaller::priority(queue[elementId].wp, queue[elementId].func);
      }

      __threadfence();
      syncthreads(1, threads);

      //check if still ok and enable fence
      if(linId == 0)
      {
        hitSortingFence = false;
        sortingFence = sortStart % QueueSize;
        // publish the fence before front is sampled again
        __threadfence();
        int nFront = *((volatile uint*)(&front));
        if(nFront < cFront) nFront += QueueSize;   // front wrapped since the first sample

        int maxfill = sortStart - (int)sortingMinBorder/2;
        if(maxfill < nFront)
        {
          //ouch, not enough space left: front got too close, give up
          sortingFence = QueueSize;
          lastSortEnd = QueueSize;
          sortStart = -1;
        }
        else
          lastSortEnd = sortStart;
      }

      syncthreads(1, threads);
      if(sortStart < 0) return;

      //sort
      if(linId < num/2)
        Sort::bitonic<PriorityType, uint, false>(priorities, ids, linId, num);
      syncthreads(2, threads);

      //copy in: every thread grabs up to two entries of the window ...
      QueueEntryData one, two;
      if(linId < num) one = queue[(sortStart + linId) % QueueSize];
      if(linId + threads < num) two = queue[(sortStart + linId + threads) % QueueSize];
      // ... and all reads have to be done before the first slot is overwritten
      syncthreads(1, threads);

      //write out
      if(linId < num)  queue[ids[linId]] = one;
      if(linId + threads < num) queue[ids[linId + threads]] = two;

      __threadfence();
      syncthreads(1, threads);

      //unset fence
      if(linId == 0)
      {
        sortingFence = QueueSize;

        if(hitSortingFence)
        {
          //we need to increase the margin
          sortingMinBorder += 64;
          d_params->changedParam<uint>(P_SortingBorder, sortingMinBorder);
          hitSortingFence = false;
        }
      }

    }

    /*
    * Current number of reserved entries, clamped at 0 because dequeue()
    * decrements count before it checks for emptiness.
    */
    __device__ int fillLevel() const
    {
      return max(0,*((volatile int*)&count));
    }
  };

  extern __device__ Queue d_queue;
};


#endif
