/*
  Softshell: Dynamic Scheduling on GPUs.
  http://www.icg.tugraz.at/project/mvp

  Copyright (C) 2012 Institute for Computer Graphics and Vision,
                     Graz University of Technology

  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
              Bernhard Kainz - kainz ( at ) icg.tugraz.at
              Michael Kenzel - kenzel ( at ) icg.tugraz.at
              Stefan Hauswiesner - hauswiesner ( at ) icg.tugraz.at
              Bernhard Kerbl - kerbl ( at ) icg.tugraz.at
              Dieter Schmalstieg - schmalstieg ( at ) icg.tugraz.at

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
*/


/*
* file created by    Markus Steinberger / steinberger ( at ) icg.tugraz.at
*
* modifications by
*/

#ifndef SOFTSHELL_MEGAKERNEL_EXECSTATE_CUH_INCLUDED
#define SOFTSHELL_MEGAKERNEL_EXECSTATE_CUH_INCLUDED

#include "megakernel/launchstates.cuh"
#include "timing/timesync.cuh"
#include "tools/common.cuh"

namespace Softshell
{
  // Upper bound that BlockExecState::checkRunstate() compares the elapsed
  // launch time against (presumably in seconds, since the elapsed value is
  // scaled by TIMESYNC_seconds_per_cycle - TODO confirm).
  extern __constant__ float maxExecTime;
  // Not referenced in this header; defined and consumed elsewhere.
  extern __constant__ uint runningThreads;

  // Value stored in ___i_shared_all at index threadIdx.x/32, i.e. one entry
  // per group of 32 consecutive threads. It is used as an offset (in uint
  // elements) into ___i_shared_all by __execState().
  #define __execStateOffset() \
  (Softshell::___i_shared_all[threadIdx.x/32])
  // Pointer to the ExecState located __execStateOffset() uint elements past
  // the start of ___i_shared_all.
  #define __execState() \
  ((Softshell::ExecState*)(Softshell::___i_shared_all + __execStateOffset()))
  /*#define __threadIdOffset() \
    (*(___i_shared_all + blockDim.x/32 + threadIdx.x/32)) */
  // Convenience wrappers forwarding to the current ExecState.
  #define __sync() \
    (__execState()->syncthreads())
  #define __threadId() \
    (__execState()->threadId())
  #define __numThreads() \
    (__execState()->numThreads())
  // Builds a DEvent from the event id and event launch id stored in the
  // current ExecState.
  #define __event() \
    DEvent(__execState()->eventId,__execState()->eventLaunchId)

  // Pointer to two uint elements per group of 32 threads, located after the
  // first 2*(blockDim.x/32) uint elements of ___i_shared_all.
  #define __warpSpace() \
    (Softshell::___i_shared_all +  2*(blockDim.x/32) + 2*(threadIdx.x/32))

  // Number of uint elements at the start of ___i_shared_all that precede the
  // region addressed by __sharedMemPointer().
  #define __basicSharedRequirements()\
    (4*(blockDim.x/32))
  // Byte pointer BYTE_OFFSET bytes past the end of the
  // __basicSharedRequirements() header.
  #define __sharedMemPointer(BYTE_OFFSET) \
    (((unsigned char*) (Softshell::___i_shared_all + __basicSharedRequirements())) + (BYTE_OFFSET))

  // The BlockExecState sits at byte offset 0 of the __sharedMemPointer() region.
  #define __blockState() \
        ((Softshell::BlockExecState*)(__sharedMemPointer(0)))

  // Shared-memory helpers forwarding to the current ExecState.
  #define __shared() \
    (__execState()->shared())
  #define __sharedAlloc(BYTES) \
    (__execState()->sharedAlloc(BYTES))
  #define __sharedFree(POINTER) \
    (__execState()->sharedFree(POINTER))

  // Dynamically sized shared memory array that all macros above index into.
  extern __shared__ uint ___i_shared_all[];
  /*
  * Per-block bookkeeping: launch id, run flag, start time and a two-ended
  * bump allocator over ___i_shared_all. All offsets handled here are counted
  * in uint elements relative to ___i_shared_all. sharedOffset grows upwards,
  * sharedOffsetTop grows downwards; an allocation fails once they would cross.
  */
  struct BlockExecState
  {
    uint launchId;
    volatile int run;
    clock64_t starttime;

    volatile int sharedOffset;
    volatile int sharedOffsetTop;
    int sharedOffsetEnd;
    volatile uint queuedReqShared;

    /*
    * Reserves size bytes (rounded up to a multiple of 16 bytes) at the bottom
    * end. Returns the offset of the reserved region in uint elements, or -1
    * if the region would extend past sharedOffsetTop.
    */
    __device__ int sharedAlloc(uint size) volatile
    {
      // requested size rounded up to 16 bytes, expressed in uint elements
      const int words = (size + 15)/16*(16/sizeof(uint));

      // lock-free bump: retry until the CAS sees the value we based our update on
      int observed = sharedOffset;
      for(;;)
      {
        const int wanted = observed + words;
        if(wanted > sharedOffsetTop)
          return -1; //out of space
        const int seen = atomicCAS((int*)(&sharedOffset), observed, wanted);
        if(seen == observed)
          break;
        observed = seen;
      }

      // the top end may have moved down in the meantime: re-validate and undo
      if(observed + words > sharedOffsetTop)
      {
        atomicSub((int*)(&sharedOffset), words);
        printf("%d %d sharedAlloc failed due to too little space:  %d > %d\n",blockIdx.x, threadIdx.x,observed + words, sharedOffsetTop);
        return -1;
      }
      return observed;
    }

    /* Intentionally a no-op: nothing is released here. */
    __device__ void sharedFree(uint offset) volatile
    {
      //autoremove..
    }

    /*
    * Lowers sharedOffsetTop to newoffset if that is smaller than its current
    * value. Returns the smaller of the previous top and newoffset, or 0 if
    * newoffset lies below sharedOffset.
    */
    __device__ int sharedAllocTopMin(int newoffset) volatile
    {
      if(newoffset >= sharedOffset)
      {
        const int previousTop = atomicMin((int*)(&sharedOffsetTop),newoffset);
        if(newoffset >= sharedOffset)
          return min(previousTop,newoffset);

        // the bottom end overtook newoffset after the first check: clamp the top
        sharedOffsetTop = sharedOffset;
        printf("%d %d sharedAllocTop failed due to too little space:  %d < %d\n",blockIdx.x, threadIdx.x,newoffset, sharedOffset);
      }
      return 0;
    }

    /*
    * Reserves size bytes (rounded up to a multiple of 16 bytes) at the top
    * end. Returns the new top offset in uint elements, or -1 if the region
    * would extend below sharedOffset.
    */
    __device__ int sharedAllocTop(uint size) volatile
    {
      // requested size rounded up to 16 bytes, expressed in uint elements
      const int words = (size + 15)/16*(16/sizeof(uint));

      int observed = sharedOffsetTop;
      int wanted;
      for(;;)
      {
        wanted = observed - words;
        if(wanted < sharedOffset)
          return -1; //out of space
        const int seen = atomicCAS((int*)(&sharedOffsetTop), observed, wanted);
        if(seen == observed)
          break;
        observed = seen;
      }

      // the bottom end may have moved up in the meantime: re-validate and undo
      if(wanted < sharedOffset)
      {
        atomicAdd((int*)(&sharedOffsetTop), words);
        return -1;
      }
      return wanted;
    }

    /* Must never be called: reports the misuse and traps. */
    __device__ void sharedFreeTop(uint offset) volatile
    {
      printf("MEGAKERNEL ERROR: call of sharedFreeTop is forbidden! there is no manual management for top shared!\n");
      trap();
    }

    /*
    * When checker is set, clears run if the elapsed time since starttime has
    * reached maxExecTime (additionally ending the launch) or if the launch
    * state reports that this launch should not keep running.
    */
    __device__ void checkRunstate(bool checker) volatile
    {
      if(!checker)
        return;

      const double elapsed = (getTimeCycles() - starttime)*TIMESYNC_seconds_per_cycle;
      if(elapsed >= maxExecTime)
      {
        //printf("%d megakernel: launch %d end due to time %lld %lld %f\n",blockIdx.x,launchId, starttime,getTimeCycles(),elapsed);
        d_launchStates->endLaunch(launchId);
        run = false;
        return;
      }

      if(!d_launchStates->checkRunState(launchId))
      {
        //printf("%d megakernel: launch %d  end due to launch state\n",blockIdx.x, launchId);
        run = false;
      }
    }
  };

  /*
  * Execution state of one thread group, reached through __execState().
  * Provides the group-local thread id / thread count, a group barrier and
  * collective shared memory allocation on top of BlockExecState.
  */
  struct ExecState
  {
    uint nThreads;       // number of threads in the group (see numThreads())
    uint syncpoint;      // barrier id handed to Softshell::syncthreads()
    uint threadOffset;   // subtracted from threadIdx.x to form threadId()
    uint freeCount;      // not used in this header
    uint eventLaunchId;  // second argument of the DEvent built by __event()
    uint eventId;        // first argument of the DEvent built by __event()

    PointerEquivalent wp;    // not used in this header
    PointerEquivalent proc;  // not used in this header

    uint sharedOffset;          // offset (in uint elements) returned by shared()
    volatile uint distribute;   // slot used to broadcast the result of sharedAlloc()

    /* Returns the BlockExecState stored at the start of the shared memory payload. */
    __device__ BlockExecState* blockExecState()
    {
      return (BlockExecState*)(__sharedMemPointer(0));
    }

    /*
    * Barrier over the group: waits on syncpoint for nThreads rounded up to
    * the next multiple of 32.
    */
    __device__ void syncthreads() const
    {
      uint nt = ((nThreads + 31)/32)*32;
      //if(Softshell::laneid() == 0)
      //  printf(" %d %d sync %d %d\n", blockIdx.x, threadIdx.x, syncpoint, nt);
      Softshell::syncthreads(syncpoint, nt);
    }

    /* Number of threads in the group. */
    __device__ uint numThreads() const
    {
      return nThreads;
    }

    /* Thread index relative to the group: threadIdx.x - threadOffset. */
    __device__ uint threadId() const
    {
      return threadIdx.x - threadOffset;
    }

    /* Byte pointer to ___i_shared_all + sharedOffset (offset in uint elements). */
    __device__ unsigned char* shared() const
    {
      return (unsigned char*)(___i_shared_all + sharedOffset);
    }

    /*
    * Collective allocation of size bytes from the block's shared memory.
    * Contains barriers, so every thread taking part in syncthreads() has to
    * call it. Thread 0 of the group performs the allocation and publishes
    * the resulting offset through distribute.
    * Returns the same pointer on all threads, or 0 if the allocation failed.
    */
    __device__ unsigned char* sharedAlloc(uint size)
    {
      if(threadId() == 0)
        distribute = __blockState()->sharedAlloc(size);
      syncthreads();
      int res = distribute;
      // Second barrier: without it thread 0 could enter the next sharedAlloc()
      // and overwrite distribute before slower threads have read this result.
      syncthreads();
      if(res != -1)
        return (unsigned char*)(___i_shared_all + res);
      else
        return 0;
    }

    /*
    * Hands the offset of pointer (in uint elements relative to
    * ___i_shared_all) to BlockExecState::sharedFree(); only thread 0 of the
    * group makes the call.
    */
    __device__ void sharedFree(void* pointer)
    {
      uint offset = (unsigned int*)(pointer) - ___i_shared_all;
      //TODO: figure out if everyone is taking part or not..?
      if(threadId() == 0)
        __blockState()->sharedFree(offset);
    }
  };


  ////dynamic group start
  //struct ExecState
  //{
  //  uint nThreads;
  //  uint syncpoint;
  //  int threadOffset;
  //  union{
  //  uint freeCount;
  //  uint share;
  //  };
  //  union
  //  {
  //  PointerEquivalent wp;
  //  uint eventId;
  //  };
  //  union
  //  {
  //  PointerEquivalent proc;
  //  uint eventLaunchId;
  //  };

  //  __device__ BlockExecState* blockExecState()
  //  {
  //    return (BlockExecState*)(__sharedMemPointer(0));
  //  }
  //  __device__ void syncthreads() const
  //  {
  //    uint nt = ((nThreads + 31)/32)*32;
  //    Softshell::syncthreads(syncpoint&0xFF, nt);
  //  }
  //  __device__ uint numThreads() const
  //  {
  //    return nThreads;
  //  }
  //  __device__ uint threadId() const
  //  {
  //    return threadOffset + (int)threadIdx.x;
  //  }
  //};



  /*
  * Warp vote: returns true if every active thread passes the same 32 bit
  * value. The first active lane publishes its value in the warp's scratch
  * slot and all active lanes compare against it.
  */
  inline __device__ bool __isMatching(uint data)
  {
    uint* const slot = __warpSpace();
    // rank among the active lanes; rank 0 is the lowest active lane
    const uint rank = __popc(lanemask_lt() & __ballot(1));
    if(rank == 0)
      slot[0] = data;
    return __all(slot[0] == data);
  }
  /*
  * Warp vote: returns true if every active thread passes the same 64 bit
  * value. The first active lane publishes both 32 bit halves in the warp's
  * scratch slots and all active lanes compare against them.
  */
  inline __device__ bool __isMatching(unsigned long long data)
  {
    uint* const slot = __warpSpace();
    const uint lowHalf = (uint)(data);
    const uint highHalf = (uint)(data >> 32);
    // rank among the active lanes; rank 0 is the lowest active lane
    const uint rank = __popc(lanemask_lt() & __ballot(1));
    if(rank == 0)
    {
      slot[0] = lowHalf;
      slot[1] = highHalf;
    }
    return __all(slot[0] == lowHalf && slot[1] == highHalf);
  }
  /*
  * Warp broadcast of an int: threads with share set write data to the warp's
  * scratch slot; if at least one active thread shared, every thread returns
  * the slot content, otherwise 0.
  */
  inline __device__ int __share(int data, bool share)
  {
    int* const slot = (int*)(__warpSpace());
    if(share)
      *slot = data;
    return __any(share) ? *slot : 0;
  }
  /*
  * Warp broadcast of a uint: threads with share set write data to the warp's
  * scratch slot; if at least one active thread shared, every thread returns
  * the slot content, otherwise 0.
  */
  inline __device__ uint __share(uint data, bool share)
  {
    uint* const slot = __warpSpace();
    if(share)
      slot[0] = data;
    return __any(share) ? slot[0] : 0;
  }

  /*
  * Warp broadcast of a 64 bit value: threads with share set write the low
  * and high 32 bit halves to the warp's two scratch slots; if at least one
  * active thread shared, every thread returns the recombined value,
  * otherwise 0.
  */
  inline __device__ unsigned long long  __share(unsigned long long data, bool share)
  {
    uint* const slot = __warpSpace();
    if(share)
    {
      slot[0] = data & 0xFFFFFFFF;
      slot[1] = data >> 32;
    }
    if(!__any(share))
      return 0;

    const unsigned long long lowHalf = slot[0];
    const unsigned long long highHalf = slot[1];
    return lowHalf | (highHalf << 32);
  }

}
#endif
