/*
  Softshell: Dynamic Scheduling on GPUs.
  http://www.icg.tugraz.at/project/mvp

  Copyright (C) 2012 Institute for Computer Graphics and Vision,
                     Graz University of Technology

  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
              Bernhard Kainz - kainz ( at ) icg.tugraz.at
              Michael Kenzel - kenzel ( at ) icg.tugraz.at
              Stefan Hauswiesner - hauswiesner ( at ) icg.tugraz.at
              Bernhard Kerbl - kerbl ( at ) icg.tugraz.at
              Dieter Schmalstieg - schmalstieg ( at ) icg.tugraz.at

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
*/


/*
* file created by    Markus Steinberger / steinberger ( at ) icg.tugraz.at
*
* modifications by
*/

#ifndef SOFTSHELL_MEGAKERNEL_MEGAKERNEL_CUH_INCLUDED
#define SOFTSHELL_MEGAKERNEL_MEGAKERNEL_CUH_INCLUDED

#include "tools/utils.h"
#include "tools/types.h"
#include "tools/common.cuh"
#include "api/workitem.h"
#include "megakernel/controller.h"
#include "megakernel/launchstates.cuh"
#include "megakernel/execstate.cuh"
#include "timing/timesync.cuh"
#include "queue/dscheduler.cuh"
#include "queue/queue.cuh"
#include "communicator/hparams.h"
#include "distributor/proceduremanager.cuh"
#include "aggregation/localaggregation.cuh"
#include "megakernel/localprepull.cuh"
#include "memory/roundrobinalloc.cuh"
#include "megakernel/controller.cuh"
#include "priorities/eventPriorityEval.h"
#include <stdio.h>


namespace Softshell
{
  // Device-side configuration of the megakernel. The host writes these symbols
  // with cudaMemcpyToSymbol (see initMegakernalConstants / MegaKernelParamCallback).

  // Overwritten by initMegakernalConstants (currently always with 1.0f); not
  // read by the code visible in this header.
  __constant__ float maxExecTime = 1.0f;
  // Not referenced by the code visible in this header.
  __constant__ uint runningThreads = 256;
  // Overwritten by initMegakernalConstants with the value of the P_SleepTime
  // parameter; not read by the code visible in this header.
  __device__ uint sleepTime = 2000;
  // Shared memory requirement d_worker assumes for a procedure for which
  // d_procedureManager->get() returns 0 ("DefaultSharedMemory" config entry).
  __constant__ uint defaultSharedMemory = 0;
  // d_worker ANDs this with its didWork flag to decide whether to end the
  // launch once no work is left ("StopOnNoWork" config entry).
  __constant__ int stopOnNoWork = 0;

  // Accessors for the bookkeeping structures d_worker keeps back to back in the
  // area addressed by __sharedMemPointer():
  //   offset 0                                          -> BlockExecState
  //   offset sizeof(BlockExecState)                     -> LocalPrePull
  //   offset sizeof(BlockExecState)+sizeof(LocalPrePull) -> ExecState array
  // NOTE(review): these are plain object-like macros that stay defined for the
  // rest of the translation unit, so any later identifier called `state`,
  // `localPrePull` or `ExecStates` is replaced as well.

  // Per-block execution state.
  #define state \
  ((volatile BlockExecState*)(__sharedMemPointer(0)))

  // Per-block pre-pull bookkeeping (front/back/count/lastPull/inc as used by d_worker).
  #define localPrePull \
  ((volatile LocalPrePull*)__sharedMemPointer(sizeof(BlockExecState)))

  // First element of the ExecState array (d_worker indexes it with values below NumGroups).
  #define ExecStates \
  ((volatile ExecState*)__sharedMemPointer(sizeof(BlockExecState)+sizeof(LocalPrePull)))



  /**
   * Worker loop executed by a thread block of the megakernel.
   *
   * Initialises the block's BlockExecState, LocalPrePull and the first
   * NumGroups ExecState entries, fills the free part of ___i_shared_all with
   * 0xDEADBEEF and then repeats until state->run is false:
   *   1. lets LocalAggregation maintain the local buffer and dequeues new work
   *      packages from d_queue (lane 0 of the pulling warps, one each),
   *   2. walks the buffered work packages, assigns warps and a shared memory
   *      offset to each one that still fits,
   *   3. calls the procedure of the work package a thread was assigned to,
   *   4. reports the finished work packages to d_eventManager and resets the
   *      used shared memory region to 0xDEADBEEF.
   * When state->run is found false, buffered work packages whose pointer is not
   * reported as shared memory by isshared() are enqueued into d_queue again
   * before the function returns.
   *
   * All barriers use syncthreads(0); every thread of the block has to call this
   * function and stay in the loop together.
   *
   * @param launchId               stored in the block state and passed to
   *                               d_launchStates->endLaunch() when the block
   *                               stops because no work is left
   * @param additionalSharedMemory treated as a byte count; divided by
   *                               sizeof(uint) to obtain the end/top offsets of
   *                               the usable shared memory region
   * @param NumGroups              number of ExecState slots; the local buffer
   *                               is used as a ring of this size
   * @param GroupSize              not used by this function
   */
  __device__ void d_worker(uint launchId,  uint additionalSharedMemory, uint NumGroups, uint GroupSize)
  {


    // added to every work package's shared memory requirement (bytes)
    const uint minDynSharedMem = 16;
    //top is reserved
    BlockExecState* bes = (BlockExecState*)(__sharedMemPointer(0));
    bes->launchId = launchId;
    bes->run = true;
    bes->starttime = getTimeCycles();
    // offsets are counted in uints: first free word behind the bookkeeping structures
    bes->sharedOffset = __basicSharedRequirements()+(sizeof(BlockExecState)+sizeof(LocalPrePull)+sizeof(ExecState)*NumGroups)/sizeof(uint);
    bes->sharedOffsetEnd = additionalSharedMemory/sizeof(uint);
    bes->sharedOffsetTop = (additionalSharedMemory-16)/sizeof(uint);

    // construct the pre-pull bookkeeping in place, directly behind the block state
    new((LocalPrePull*)__sharedMemPointer(sizeof(BlockExecState))) LocalPrePull(NumGroups);

    if(threadIdx.x < NumGroups)
        ExecStates[threadIdx.x].syncpoint = 2+threadIdx.x;

    //make sure that the shared memory is DEADBEEF out in the free region!
    for(int i = threadIdx.x + bes->sharedOffset; i < bes->sharedOffsetTop; i += blockDim.x)
      ___i_shared_all[i] = 0xDEADBEEF;


    LocalAggregation::init();

    // set to 1 as soon as any warp of the block dequeued a work package
    __shared__ int didWork;
    didWork = 0;
    state->checkRunstate(threadIdx.x == 0);

    while(true)
    //for(uint mycoco = 0; mycoco < 1000; ++mycoco)
    {
      syncthreads(0);
      //if(mycoco == 5)
      //  trap();


      //avoid register pressure during proc call
      {
        //pull in new workpackages
        int myWarp = threadIdx.x / 32;
        uint linId = laneid();
        uint front = localPrePull->front;
        int count = localPrePull->count;
        volatile ExecState* execStates = ExecStates;
        syncthreads(0);
        if(!state->run)
        {
          // shutting down: one thread per buffered work package hands it back
          // to the queue, unless isshared() reports its pointer as shared memory
          if((int)(threadIdx.x) < count)
          {
            if(!isshared((void*)(execStates[(front + threadIdx.x)%NumGroups].wp)))
              d_queue.enqueue((Workpackage*)execStates[(front + threadIdx.x)%NumGroups].wp,
                              (DeviceEntryProcedureFunc)execStates[(front + threadIdx.x)%NumGroups].proc,
                              execStates[(front + threadIdx.x)%NumGroups].nThreads);
            //printf("%d pushing elements out: %d < %d %llx %llx %d!\n",blockIdx.x,threadIdx.x,count,execStates[(front + threadIdx.x)%NumGroups].wp,execStates[(front + threadIdx.x)%NumGroups].proc,execStates[(front + threadIdx.x)%NumGroups].nThreads);
          }
          //take out all local elements
          LocalAggregation::maintainFirst(localPrePull, 0, 0, false,true);
          ////debug
          //if(threadIdx.x == 0)
          //  printf("%d executed %d wps\n",blockIdx.x, executedWps);
          return;
        }



        //put in local wps first
        //LocalAggregation::maintainFirst(localPrePull, (localPrePull->lastPull - count+1)/2);
        LocalAggregation::maintainFirst(localPrePull, localPrePull->lastPull - count);
        syncthreads(0);

        front = localPrePull->front;
        count = localPrePull->count;
        // warps with an index below (lastPull - count) try to dequeue one
        // work package each; only lane 0 of such a warp touches the queue
        if(myWarp < localPrePull->lastPull - count)
        {
          if(linId == 0)
          {
            //printf("%d %d going to pull in\n", blockIdx.x, threadIdx.x);
            Workpackage* wp = 0;
            DeviceEntryProcedureFunc proc = 0;
            uint numThreads = 0;
            if(d_queue.dequeue(&wp, &proc, numThreads))
            {
              //printf("%d %d got %llx %llx to execute\n",blockIdx.x, threadIdx.x, proc, wp);
              // reserve the next ring slot; atomicInc wraps back to 0 after NumGroups-1
              uint mySpot = atomicInc((uint*)&localPrePull->back, NumGroups-1);
              atomicAdd((int*)&localPrePull->count,1);
              execStates[mySpot].wp = (PointerEquivalent)wp;
              execStates[mySpot].proc = (PointerEquivalent)proc;
              execStates[mySpot].nThreads = numThreads;
              ProcedureInfo* info = d_procedureManager->get(proc);
              uint reqShared = 0;
              if(info != 0)
                reqShared = (info->sharedMem + numThreads*info->sharedMemPerThread+15)/16*16; // rounded up to a multiple of 16 bytes
              else
                reqShared = defaultSharedMemory;
              uint maxSpace = (state->sharedOffsetEnd-state->sharedOffset)*sizeof(uint);
              if(reqShared+minDynSharedMem > maxSpace)
              {
                // info is 0 for procedures the procedure manager does not know;
                // the custom id may only be read when info is available
                if(info != 0)
                  printf("%4d ERROR: Megakernel has not got enough shared memory (%d < %d) to execute procedure with custom id %d (%llx)\nIncrease \"MegakernelSharedMemory\"\n",
                          blockIdx.x,maxSpace, reqShared, info->customId,proc);
                else
                  printf("%4d ERROR: Megakernel has not got enough shared memory (%d < %d) to execute unregistered procedure (%llx)\nIncrease \"MegakernelSharedMemory\"\n",
                          blockIdx.x,maxSpace, reqShared, proc);
                trap();
              }
              // until a warp is assigned, sharedOffset holds the required size in bytes
              execStates[mySpot].sharedOffset = reqShared;
              //printf("%d %d got workpackage %llx %llx for event %d %d to be execute by %d threads with %d bytes of smem and stored it at %d\n", blockIdx.x, threadIdx.x, proc, wp, wp->getEvent().getEventId(), wp->getEvent().getEventLaunchId(), numThreads,execStates[mySpot].sharedOffset, mySpot);
              didWork = 1;
            }
          }
        }
        syncthreads(0);


        int afterPullCount = localPrePull->count;

        if(afterPullCount == 0)
        {
          // nothing buffered: give the local aggregation another chance to provide work
          syncthreads(0);
          LocalAggregation::maintainFirst(localPrePull, localPrePull->lastPull, 1000, true);
          syncthreads(0);
          afterPullCount = localPrePull->count;
        }
        if(afterPullCount > 0)
        {
          syncthreads(0);
          // sum up the shared memory (bytes) requested by all buffered work packages
          uint reqShared = 0;
          for(int i = front, c = 0; c < afterPullCount; ++c,i=(i+1)%NumGroups)
            reqShared += execStates[i].sharedOffset;
          localPrePull->inc = -2*LocalAggregation::maintainAfter(localPrePull, (reqShared+minDynSharedMem*afterPullCount)/sizeof(uint));
          syncthreads(0);
          afterPullCount = localPrePull->count;
        }

        if(afterPullCount == 0)
        {
          // NOTE(review): bitwise AND with didWork (0 or 1), so this only
          // triggers when bit 0 of stopOnNoWork is set - confirm that is intended
          if(threadIdx.x == 0 && (didWork & stopOnNoWork))
          {
            //printf("%d %d end due to no work\n", launchId, blockIdx.x);
            d_launchStates->endLaunch(launchId);
            state->run = false;
          }
        }

        syncthreads(0);

        //find my group
        front = localPrePull->front;
        int back = localPrePull->back;
        int myGroup = -1;       // ring slot this warp executes, -1 if none
        bool manage = false;    // true for the first warp of a work package
        int wCount = 0;         // warps assigned so far
        uint sOffset = state->sharedOffset;
        uint sOffsetTop = state->sharedOffsetTop;
        uint mySOffset = 0;

        // every thread walks the ring identically; c ends up as the number of
        // work packages that fit into this round
        int c = 0;
        for(int i = front; c < afterPullCount; ++c,i=(i+1)%NumGroups)
        {
          manage |= wCount == myWarp;
          wCount += (execStates[i].nThreads+31)/32;
          uint nsoffset = sOffset + ((execStates[i].sharedOffset+15)/16*16)/sizeof(uint);
          // stop as soon as the block runs out of warps or shared memory
          if(wCount > blockDim.x / 32 || nsoffset >= sOffsetTop)
          {
            localPrePull->inc = -1;
            break;
          }
          //if(threadIdx.x == 0)
          //  printf("%2d wp %llx proc %llx exec by %d threads needs %d bytes of shared memory\n",blockIdx.x,execStates[i].wp, execStates[i].proc, execStates[i].nThreads, execStates[i].sharedOffset);
          if(myGroup == -1 && myWarp < wCount)
          {
            myGroup = i;
            mySOffset = sOffset;
          }
          sOffset = nsoffset;
        }
        syncthreads(0);
        if(threadIdx.x == 0)
        {
          //if(c != 0)
          //  printf("%2d changed state: count: %d->%d (wCount: %d) front: %d->%d sharedOffset: %d->%d (top: %d)\n",blockIdx.x,localPrePull->count, localPrePull->count-c, wCount, localPrePull->front,(localPrePull->front + c)%NumGroups,state->sharedOffset,sOffset, state->sharedOffsetTop);
          // remove the c selected work packages from the ring
          localPrePull->count -= c;
          localPrePull->front = (localPrePull->front + c)%NumGroups;
          state->sharedOffset = sOffset;
        }
        if(linId == 0)
        {
          //printf("%d %d %d %d %d\n",blockIdx.x, threadIdx.x, myGroup, manage,mySOffset);
          if(myGroup != -1)
          {
            // offset (in uints) of this warp's ExecState entry; 0 means "nothing to execute"
            __execStateOffset() = __basicSharedRequirements() + (sizeof(BlockExecState)+sizeof(LocalPrePull)+myGroup*sizeof(ExecState))/sizeof(uint);
            if(manage)
            {
              // from here on sharedOffset holds the assigned offset instead of the size
              execStates[myGroup].sharedOffset = mySOffset;
              //atomicAdd((uint*)&state->sharedOffset,execStates[myGroup].sharedOffset/sizeof(uint));
              //printf("%d %d shared mem offset for %d: %d\n", blockIdx.x, threadIdx.x,myGroup, execStates[myGroup].sharedOffset);
              execStates[myGroup].threadOffset = myWarp*32;
            }
          }
          else
            __execStateOffset() = 0;
        }
        syncthreads(0);

        ////debug
        //executedWps += afterPullCount - localPrePull->count;

        //next round pullcount
        int lastPull = localPrePull->lastPull;
        if(threadIdx.x == 0 && afterPullCount ==  lastPull)
        {
          // grow by one when the buffer was drained completely, apply the
          // (non-positive) inc set above, and keep the result in [1, NumGroups]
          int empty = localPrePull->count == 0;
          int inc = localPrePull->inc + empty;
          localPrePull->lastPull = max(1,min(lastPull + inc, NumGroups));
          ///printf("%3d changing pullcount: empty: %d localPrePull->inc: %d  -> inc: %d -> lastpull = %d + %d = %d\n",blockIdx.x, empty, localPrePull->inc, inc, lastPull, inc, localPrePull->lastPull);

          localPrePull->inc = 0;
          //printf("%d %d changed pullcount: %d->%d\n", blockIdx.x, threadIdx.x,lastPull, localPrePull->lastPull);
        }

      }


      //execute
      volatile ExecState* myExecstate = 0;
      if(__execStateOffset() != 0)
      {
        myExecstate = __execState();
        // the thread at threadOffset records what has to be reported once the
        // procedure has finished
        if(myExecstate->threadOffset == threadIdx.x)
        {
          myExecstate->freeCount = 1;
          volatile Workpackage* wp = (volatile Workpackage*) myExecstate->wp;
          if(wp->isCombo())
          {
            volatile CombWorkpackageBase* cwp = static_cast<volatile CombWorkpackageBase*>(wp);
            myExecstate->freeCount = cwp->numWorkPackages();
          }
          //else
          //  printf("wp is not combo!\n");
          myExecstate->eventId = wp->getEvent().getEventId();
          myExecstate->eventLaunchId = wp->getEvent().getEventLaunchId();
          //printf("%d %d event: %d,%d  : %d %d\n", blockIdx.x, threadIdx.x,myExecstate->eventId,myExecstate->eventLaunchId, myExecstate->freeCount, myExecstate->nThreads);

        }
      }
      syncthreads(0);

      if(__execStateOffset() != 0)
      {
        //if(threadIdx.x%32 == 0)
        //  printf("%d %d going to execute %llx %llx with %d threads, my tid: %d\n", blockIdx.x, threadIdx.x, myExecstate->proc, myExecstate->wp, __execState()->nThreads, threadIdx.x -__execState()->threadOffset);
        // only the first nThreads threads starting at threadOffset call the procedure
        if(threadIdx.x - myExecstate->threadOffset <  myExecstate->nThreads)
        {
          ((DeviceEntryProcedureFunc) myExecstate->proc)((Workpackage*) myExecstate->wp);
        }
        //else if(laneid() == 0)
        //  printf("%d %d not participating: %d < %d\n", blockIdx.x, threadIdx.x,threadIdx.x - myExecstate->threadOffset,  myExecstate->nThreads);

      }


      syncthreads(0);
      if(__execStateOffset() != 0)
      {
        volatile ExecState* myExecstate = __execState();
        if(myExecstate->threadOffset == threadIdx.x)
        {
          //printf("%d %d event: %d,%d  : %d %d\n", blockIdx.x, threadIdx.x,myExecstate->eventId,myExecstate->eventLaunchId, myExecstate->freeCount, myExecstate->nThreads);
          //if(myExecstate->eventId != 1 || myExecstate->eventLaunchId != 1)
          //  printf("%d %d event: %d,%d  : %d %d\n", blockIdx.x, threadIdx.x,myExecstate->eventId,myExecstate->eventLaunchId, myExecstate->freeCount, myExecstate->nThreads);
          // report the finished work package(s) using the values saved before execution
          DEvent event(myExecstate->eventId, myExecstate->eventLaunchId);
          d_eventManager.finishedWorkpackagesForEvent(event,myExecstate->freeCount);
        }
      }


      //make sure that the shared memory is deadbeef in the free region!
      int oldOffset = state->sharedOffset;
      int newOffset = __basicSharedRequirements()+(sizeof(BlockExecState)+sizeof(LocalPrePull)+sizeof(ExecState)*NumGroups)/sizeof(uint);
      // all threads must have read the old offset before it is reset below
      syncthreads(0);
      for(int i = newOffset + threadIdx.x; i < oldOffset; i += blockDim.x)
        ___i_shared_all[i] = 0xDEADBEEF;
      state->sharedOffset = newOffset;

      __execStateOffset() = 0;
      state->checkRunstate(threadIdx.x == 0);
    }
  }

  /**
   * Megakernel entry point with a caller-supplied controller function.
   *
   * Every thread first queries d_launchStates->checkRunState(launchId); the
   * result is exchanged through ___i_shared_all[0] and the block exits when it
   * is zero. The block with blockIdx (0,0,0) then runs controllerfunc, every
   * other block runs d_worker.
   *
   * @param launchId               launch to check and to store in the block state
   * @param controllerfunc         called by the controller block with its block
   *                               state and sortingThreads
   * @param sortingThreads         forwarded to controllerfunc
   * @param additionalSharedMemory used to place the controller's BlockExecState
   *                               (sizeof(BlockExecState) bytes before this
   *                               offset) and forwarded to d_worker
   * @param NumGroups              forwarded to d_worker
   * @param GroupSize              forwarded to d_worker
   */
  __global__ void d_megakernel(uint launchId, ControllerFunc controllerfunc, uint sortingThreads, uint additionalSharedMemory, uint NumGroups, uint GroupSize)
  {
    // initial check: leave right away if the launch is not running
    ___i_shared_all[0] = d_launchStates->checkRunState(launchId);
    syncthreads();
    if(!___i_shared_all[0])
      return;

    // all blocks except the very first one are workers
    if(blockIdx.x + blockIdx.y + blockIdx.z != 0)
    {
      d_worker(launchId,  additionalSharedMemory, NumGroups, GroupSize);
      return;
    }

    // controller block: its state lives in the last sizeof(BlockExecState)
    // bytes below additionalSharedMemory
    volatile BlockExecState* controllerState = (BlockExecState*)(___i_shared_all + (additionalSharedMemory-sizeof(BlockExecState))/sizeof(uint));
    controllerState->launchId = launchId;
    controllerState->run = true;
    controllerState->starttime = getTimeCycles();
    controllerfunc(controllerState, sortingThreads);
  }

  /**
   * Kernel in which every block is a worker (no controller block).
   *
   * The run state of the launch is queried through
   * d_launchStates->checkRunState(launchId) and exchanged through
   * ___i_shared_all[0]; d_worker is entered only when it is non-zero.
   *
   * @param launchId               launch to check; forwarded to d_worker
   * @param additionalSharedMemory forwarded to d_worker
   * @param NumGroups              forwarded to d_worker
   * @param GroupSize              forwarded to d_worker
   */
  __global__ void d_separate_worker(uint launchId, uint additionalSharedMemory, uint NumGroups, uint GroupSize)
  {
    // initial check
    ___i_shared_all[0] = d_launchStates->checkRunState(launchId);
    syncthreads();

    // only start working when the launch is (still) running
    if(___i_shared_all[0])
      d_worker(launchId,  additionalSharedMemory, NumGroups, GroupSize);
  }

   /**
    * Megakernel entry point that uses the built-in Controller.
    *
    * Works like d_megakernel, except that the block with blockIdx (0,0,0) runs
    * Controller<PriorityEval, SortQueue, AggregateWorkItems> instead of a
    * function passed in by the caller. All other blocks run d_worker.
    *
    * @tparam PriorityEval       forwarded to Controller
    * @tparam SortQueue          forwarded to Controller
    * @tparam AggregateWorkItems forwarded to Controller
    * @param launchId               launch to check and to store in the block state
    * @param sortingThreads         forwarded to Controller
    * @param additionalSharedMemory used to place the controller's BlockExecState
    *                               and forwarded to d_worker
    * @param NumGroups              forwarded to d_worker
    * @param GroupSize              forwarded to d_worker
    */
   template<class PriorityEval, bool SortQueue, bool AggregateWorkItems>
  __global__ void d_megakernel_def_controller(uint launchId, uint sortingThreads, uint additionalSharedMemory, uint NumGroups, uint GroupSize)
  {
    // initial check: leave right away if the launch is not running
    ___i_shared_all[0] = d_launchStates->checkRunState(launchId);
    syncthreads();
    if(!___i_shared_all[0])
      return;

    // all blocks except the very first one are workers
    if(blockIdx.x + blockIdx.y + blockIdx.z != 0)
    {
      d_worker(launchId, additionalSharedMemory, NumGroups, GroupSize);
      return;
    }

    // controller block: its state lives in the last sizeof(BlockExecState)
    // bytes below additionalSharedMemory
    volatile BlockExecState* controllerState = (BlockExecState*)(___i_shared_all + (additionalSharedMemory-sizeof(BlockExecState))/sizeof(uint));
    controllerState->launchId = launchId;
    controllerState->run = true;
    controllerState->starttime = getTimeCycles();
    Controller<PriorityEval, SortQueue, AggregateWorkItems>(controllerState, sortingThreads);
  }

  // Explicit instantiations of the default-controller kernel for
  // EventPriorityEval, one for every combination of SortQueue and
  // AggregateWorkItems.
  template __global__ void d_megakernel_def_controller<EventPriorityEval, true, true>(uint launchId, uint sortingThreads, uint additionalSharedMemory, uint NumGroups, uint GroupSize);
  template __global__ void d_megakernel_def_controller<EventPriorityEval, true, false>(uint launchId, uint sortingThreads, uint additionalSharedMemory, uint NumGroups, uint GroupSize);
  template __global__ void d_megakernel_def_controller<EventPriorityEval, false, true>(uint launchId, uint sortingThreads, uint additionalSharedMemory, uint NumGroups, uint GroupSize);
  template __global__ void d_megakernel_def_controller<EventPriorityEval, false, false>(uint launchId, uint sortingThreads, uint additionalSharedMemory, uint NumGroups, uint GroupSize);

  //  //dynamic block adjustment:
  //  const uint SyncMagicSleeping    = 0x00010000;
  //  const uint SyncMagicExec        = 0x00020000;
  //  const uint SyncMagicController  = 0x00040000;
  //
  //  __global__ void d_megakernel(uint launchId, ControllerFunc controllerfunc, uint sortingThreads, uint additionalSharedMemory, uint NumGroups, uint GroupSize)
  //  {
  //    if(!d_launchStates->checkRunState(launchId))
  //      return;
  //
  //
  //
  //    if(blockIdx.x + blockIdx.y + blockIdx.z == 0)
  //    {
  //      volatile BlockExecState* bstate = (BlockExecState*)(___i_shared_all + (additionalSharedMemory-sizeof(BlockExecState))/sizeof(uint));
  //      bstate->launchId = launchId;
  //      bstate->run = true;
  //      bstate->starttime = getTime();
  //      controllerfunc(bstate, sortingThreads);
  //    }
  //    else
  //    {
  //      #define state \
  //        ((volatile BlockExecState*)(__sharedMemPointer(0)))
  //      state->launchId = launchId;
  //      state->run = true;
  //      state->starttime = getTime();
  //
  //
  //
  //      struct DynMegakernelDistribution
  //      {
  //        uint controller;
  //        uint dummy[3];
  //        __device__ void init(uint NumGroups) volatile
  //        {
  //          controller = 0;
  //        }
  //      };
  //  #define dyn \
  //      ((volatile DynMegakernelDistribution*)__sharedMemPointer(sizeof(BlockExecState)))
  //      dyn->init(NumGroups);
  //
  //
  //      //init execstates
  //#define es \
  //      ((volatile ExecState*)__sharedMemPointer(sizeof(BlockExecState)+sizeof(DynMegakernelDistribution)))
  //      if(threadIdx.x < NumGroups)
  //      {
  //        es[threadIdx.x].wp = 0;
  //        es[threadIdx.x].proc = 0;
  //        es[threadIdx.x].syncpoint = 0;
  //      }
  //
  //
  //
  //      uint myGroup = threadIdx.x / GroupSize;
  //      volatile ExecState* myExecState = es + myGroup;
  //      syncthreads(0);
  //
  //
  //
  //      uint it = 0;
  //      while(true)
  //      {
  //        syncthreads(myGroup, GroupSize);
  //        //just woke up for a new cycle
  //        //if(threadIdx.x == myGroup*GroupSize)
  //        //  printf("%d %d %x woke up for a cycle\n",blockIdx.x,myGroup, myExecState->syncpoint);
  //
  //
  //        //state: exec controller or 0
  //
  //        //check if i have got something to execute
  //        if(myExecState->syncpoint & SyncMagicExec)
  //        {
  //          int myThreadId = threadIdx.x + myExecState->threadOffset;
  //          //exec
  //          Workpackage* wp = reinterpret_cast<Workpackage*>(myExecState->wp);
  //          DeviceEntryProcedureFunc proc = reinterpret_cast<DeviceEntryProcedureFunc>(myExecState->proc);
  //          syncthreads(myGroup, GroupSize);
  //          if(myThreadId == 0)
  //          {
  //            if(wp->isCombo())
  //            {
  //              CombWorkpackageBase* cwp = static_cast<CombWorkpackageBase*>(wp);
  //              myExecState->freeCount = cwp->numWorkPackages();
  //            }
  //            myExecState->eventId = wp->getEvent().getEventId();
  //            myExecState->eventLaunchId = wp->getEvent().getEventLaunchId();
  //          }
  //          //sync on group
  //          syncthreads(myExecState->syncpoint&0xFF, (myExecState->nThreads+GroupSize-1)/GroupSize*GroupSize);
  //
  //          //if(myThreadId < myExecState->nThreads)
  //          // proc(wp);
  //          if(myThreadId == 0)
  //          //if(threadIdx.x == myGroup*GroupSize)
  //            printf("%d %d going to execute %x %d %llx %llx .... %d -> %d\n",blockIdx.x,myGroup,myExecState->syncpoint,myExecState->nThreads,myExecState->proc,myExecState->wp, threadIdx.x, __threadId());
  //
  //          //sync on my barrier
  //          uint barrier = myExecState->syncpoint&0xFF;
  //          syncthreads(myGroup, GroupSize);
  //          //unset exec and barrier
  //          if(threadIdx.x == myGroup*GroupSize)
  //            myExecState->syncpoint = 0;
  //          //sync all, so noone is using my barrier anymore
  //          syncthreads(barrier, (myExecState->nThreads+GroupSize-1)/GroupSize*GroupSize);
  //
  //          if(threadIdx.x + myExecState->threadOffset == 0)
  //          {
  //            //printf("%d %d would end launch for %d %d : %d\n",blockIdx.x,myGroup,myExecState->eventId, myExecState->eventLaunchId,myExecState->freeCount);
  //            d_eventManager.finishedWorkpackagesForEvent(DEvent(myExecState->eventId, myExecState->eventLaunchId),myExecState->freeCount);
  //          }
  //        }
  //
  //        syncthreads(myGroup, GroupSize);
  //        //check run
  //        if(threadIdx.x == myGroup*GroupSize)
  //          myExecState->share = state->run;
  //        syncthreads(myGroup, GroupSize);
  //        if(!myExecState->share)
  //          return;
  //        //check if there is a controller
  //        if(threadIdx.x == myGroup*GroupSize)
  //        {
  //          if(myExecState->syncpoint != SyncMagicController)
  //          {
  //            //if(threadIdx.x == myGroup*GroupSize)
  //            //  printf("%d %d tries to be the new controller!\n",blockIdx.x,myGroup);
  //            //set sleep
  //            myExecState->syncpoint = SyncMagicSleeping;
  //            __threadfence_block();
  //            uint hasController = atomicOr((uint*)&dyn->controller,0x1);
  //            __threadfence_block();
  //
  //            myExecState->share = hasController&0x1;
  //            if(!(myExecState->syncpoint&SyncMagicSleeping))
  //            {
  //              //someone requested us in the meanwhile, i can't be the controller
  //              myExecState->share = 1;
  //              if((hasController&0x1) == 0)
  //                dyn->controller = hasController;
  //            }
  //          }
  //          else
  //            myExecState->share = 0;
  //        }
  //
  //        syncthreads(NumGroups + myGroup, GroupSize);
  //
  //        if(myExecState->share == 0)
  //        {
  //          //if(threadIdx.x == myGroup*GroupSize)
  //          //  printf("%d %d is the new controller!\n",blockIdx.x,myGroup);
  //          //we are the new controller
  //          syncthreads(myGroup, GroupSize);
  //          //check if there are leftovers
  //          if(threadIdx.x == myGroup*GroupSize)
  //          {
  //            uint left = dyn->controller >> 1;
  //            if(left != 0)
  //            {
  //              myExecState->proc = es[left-1].proc;
  //              myExecState->wp = es[left-1].wp;
  //              myExecState->nThreads = es[left-1].nThreads;
  //              myExecState->threadOffset = 0 - myGroup*GroupSize;
  //              myExecState->syncpoint = SyncMagicExec | NumGroups + myGroup;
  //
  //              //if(threadIdx.x == myGroup*GroupSize)
  //              //  printf("%d %d there is a stalled wp: %llx %llx!\n",blockIdx.x,myGroup,myExecState->proc,myExecState->wp);
  //            }
  //            myExecState->share = left;
  //          }
  //          syncthreads(myGroup, GroupSize);
  //
  //
  //          //if we dont have anything yet
  //          if(!myExecState->share)
  //          {
  //            //check run state
  //            state->checkRunstate(threadIdx.x == myGroup*GroupSize);
  //            syncthreads(myGroup, GroupSize);
  //
  //
  //            if(!state->run)
  //            {
  //              //if(threadIdx.x == myGroup*GroupSize)
  //              //  printf("%d %d launch end!\n",blockIdx.x,myGroup);
  //
  //              //mark as uncontrolled
  //              if(threadIdx.x == myGroup*GroupSize)
  //              {
  //                dyn->controller = 0;
  //                for(uint j = 0; j < NumGroups; ++j)
  //                {
  //                  uint currentState = atomicAnd((uint*)&(es[j].syncpoint), ~SyncMagicSleeping);
  //                  //wake other if it is sleeping
  //                  if(currentState&SyncMagicSleeping)
  //                  {
  //                    //printf("%d %d waking %d/%d!\n",blockIdx.x,myGroup,j,NumGroups);
  //                    arrive(j, GroupSize + 32);
  //                  }
  //                }
  //              }
  //              return;
  //            }
  //
  //            //get work
  //            if(threadIdx.x == myGroup*GroupSize)
  //            {
  //              //printf("%d %d trying to get work \n",blockIdx.x,myGroup);
  //              Workpackage* wp = 0;
  //              DeviceEntryProcedureFunc func = 0;
  //              uint numThreads = 0;
  //              uint gotWp = d_queue.dequeue(wp, func, numThreads);
  //              if(gotWp)
  //              {
  //                //printf("%d %d got wp %llx\n",blockIdx.x, myGroup, wp);
  //                myExecState->proc = (PointerEquivalent) func;
  //                myExecState->wp = (PointerEquivalent) wp;
  //                myExecState->nThreads = numThreads;
  //                myExecState->threadOffset = 0 - myGroup*GroupSize;
  //                myExecState->syncpoint = SyncMagicExec | (NumGroups + myGroup);
  //              }
  //              //debug
  //              myExecState->share = gotWp;
  //            }
  //          }
  //          syncthreads(myGroup, GroupSize);
  //          if(myExecState->share)
  //          {
  //            //there is something to execute, so find enough to execute
  //            //if(threadIdx.x == myGroup*GroupSize)
  //            //    printf("%d %d has something to execute: %llx %llx -> searching for available groups\n",blockIdx.x,myGroup,myExecState->proc,myExecState->wp);
  //
  //            syncthreads(myGroup, GroupSize);
  //            //check if there are enough
  //            myExecState->share = GroupSize;
  //            syncthreads(myGroup, GroupSize);
  //            uint needed = myExecState->nThreads;
  //            int myoffset = -1;
  //            uint lid = threadIdx.x - myGroup*GroupSize;
  //            if(lid < NumGroups)
  //              if(es[lid].syncpoint & SyncMagicSleeping)
  //                myoffset = atomicAdd((uint*)&myExecState->share, GroupSize);
  //            if(myoffset >= needed)
  //              myoffset = -1;
  //            syncthreads(myGroup, GroupSize);
  //            if(myExecState->share >= needed)
  //            {
  //               //if(threadIdx.x == myGroup*GroupSize)
  //               // printf("%d %d found enough threads to join execution on %llx %llx: %d >= %d\n",blockIdx.x,myGroup,myExecState->proc,myExecState->wp, myExecState->share, needed);
  //
  //              //it is possible to execute this one, so set state
  //              if(lid < NumGroups && myoffset != -1)
  //              {
  //                //printf("%d %d preparing %d for execution\n",blockIdx.x,myGroup,lid);
  //                es[lid].syncpoint = SyncMagicExec | (NumGroups + myGroup);
  //                es[lid].wp = myExecState->wp;
  //                es[lid].proc = myExecState->proc;
  //                es[lid].nThreads = myExecState->nThreads;
  //                es[lid].threadOffset = myoffset - (int)(lid*GroupSize);
  //              }
  //              syncthreads(myGroup, GroupSize);
  //
  //              //wake needs serialize
  //              if(lid == 0)
  //                for(uint i = 0; i < NumGroups; ++i)
  //                  if(es[i].syncpoint == (SyncMagicExec | (NumGroups + myGroup)) && i != myGroup)
  //                  {
  //                    //printf("%d %d waking %d\n",blockIdx.x,myGroup,i);
  //                     arrive(i, GroupSize + 32);
  //                  }
  //
  //              syncthreads(myGroup, GroupSize);
  //              //if(threadIdx.x == myGroup*GroupSize)
  //              //  printf("%d %d starting another controller\n",blockIdx.x,myGroup);
  //
  //              if(lid == 0)
  //              {
  //                 //unset me as controller
  //                dyn->controller = 0;
  //                __threadfence_block();
  //                //wake a random other to become the controller
  //                for(uint i = 0; i < NumGroups; ++i)
  //                {
  //                  if(atomicAnd((uint*)&es[i].syncpoint,~SyncMagicSleeping) & SyncMagicSleeping )
  //                  {
  //                    arrive(i, GroupSize + 32);
  //                    break;
  //                  }
  //                }
  //                myExecState->share = 1;
  //              }
  //              syncthreads(myGroup, GroupSize);
  //              //if(threadIdx.x == myGroup*GroupSize)
  //              //  printf("%d %d joining the execution of %x %llx %llx\n",blockIdx.x,myGroup,myExecState->syncpoint,myExecState->proc,myExecState->wp);
  //              //go and execute!
  //            }
  //            else
  //            {
  //              syncthreads(myGroup, GroupSize);
  //              //not enough threads
  //              if(needed > blockDim.x)
  //              {
  //                printf("ERROR: should execute %d threads, but only have %d in block!!\n",blockIdx.x,myGroup,needed,blockDim.x);
  //                __threadfence();
  //                trap();
  //              }
  //              //if(threadIdx.x == myGroup*GroupSize)
  //              //  printf("%d %d not enough threads to execute %llx %llx ( %d < needed %d ) marking for later execution\n",blockIdx.x,myGroup,myExecState->proc,myExecState->wp,myExecState->share, needed);
  //              __threadfence();
  //              //unset controller with my id shifted one to the right
  //              myExecState->share = (myGroup+1) << 1;
  //              syncthreads(myGroup, GroupSize);
  //            }
  //          }
  //          if((myExecState->share&0x1)==0)
  //          {
  //            //if(threadIdx.x == myGroup*GroupSize)
  //            //    printf("%d %d there is nothing to execute ... going to sleep?\n",blockIdx.x,myGroup);
  //            //there is nothing to execute
  //            syncthreads(myGroup, GroupSize);
  //            //if someone else is executing, this group can go to sleep
  //            if(threadIdx.x == myGroup*GroupSize)
  //            {
  //              myExecState->syncpoint = SyncMagicSleeping;
  //              __threadfence_block();
  //              dyn->controller = myExecState->share;
  //              __threadfence_block();
  //              myExecState->share = 1;
  //            }
  //            //search for others that are executing
  //            syncthreads(NumGroups + myGroup, GroupSize);
  //            uint lid = threadIdx.x - myGroup*GroupSize;
  //            if(lid < NumGroups)
  //            {
  //              uint otherstate = es[lid].syncpoint;
  //              if(!(otherstate&SyncMagicSleeping))
  //                myExecState->share = 0;
  //            }
  //
  //            syncthreads(NumGroups + myGroup, GroupSize);
  //            //if there is no other, I am going to be the controller again
  //            uint old;
  //            if(threadIdx.x == myGroup*GroupSize && myExecState->share == 1)
  //              if((old = atomicOr((uint*)&dyn->controller,0x1) & 0x1) == 0)
  //              {
  //                if(myExecState->syncpoint & SyncMagicSleeping)
  //                  myExecState->syncpoint = SyncMagicController;
  //                else
  //                  dyn->controller = old;
  //              }
  //            syncthreads(NumGroups + myGroup, GroupSize);
  //            //if(threadIdx.x == myGroup*GroupSize)
  //            //  printf("%d %d going to sleep: %d\n",blockIdx.x,myGroup,myExecState->share==0 && myExecState->syncpoint != SyncMagicController);
  //
  //            if(myExecState->share == 0 && myExecState->syncpoint != SyncMagicController)
  //              syncthreads(myGroup, GroupSize + 32);
  //          }
  //        }
  //        else
  //        {
  //          //i have done my work, i am not the controller -> lets go to sleep
  //          //if(threadIdx.x == myGroup*GroupSize)
  //          //  printf("%d %d going to sleep\n",blockIdx.x,myGroup);
  //          syncthreads(myGroup, GroupSize + 32);
  //        }
  //      }
  //    }
  //  }

  //__global__ void tellMegaKernelValues()
  //{
  //  printf("%f %d\n",maxExecTime,sleepTime);
  //}

  /**
   * Config callback that pushes a changed megakernel parameter to the device.
   *
   * @param name  name of the config entry; "DefaultSharedMemory" and
   *              "StopOnNoWork" are handled, every other name is ignored
   * @param value new value, copied into the matching __constant__ symbol
   * @param pd    not a real pointer: it carries the CUDA device ordinal in its
   *              value (see the registerConfig calls in initMegakernalConstants)
   *              and is never dereferenced
   */
  void MegaKernelParamCallback(const std::string& name, uint& value, int* pd)
  {
    // Recover the device ordinal from the pointer value itself. Converting via
    // an integer avoids reading the pointer object through an int lvalue.
    const int dev = static_cast<int>(reinterpret_cast<unsigned long long>(pd));
    // a failing device switch must not go unnoticed, otherwise the copies
    // below would silently target whatever device is current
    CUDA_CHECKED_CALL(cudaSetDevice(dev));
    if(name.compare("DefaultSharedMemory") == 0)
    {
      CUDA_CHECKED_CALL(cudaMemcpyToSymbol(defaultSharedMemory,&value,sizeof(uint)));
    }
    else if(name.compare("StopOnNoWork") == 0)
    {
      CUDA_CHECKED_CALL(cudaMemcpyToSymbol(stopOnNoWork,&value,sizeof(int)));
    }
  }

  /**
   * Uploads the megakernel's device-side constants for the current device and
   * registers the run-time configurable ones.
   *
   * - maxExecTime is set to 1.0f.
   * - sleepTime is set to the value params->addParam(P_SleepTime, 2000) returns.
   * - "DefaultSharedMemory" and "StopOnNoWork" are registered with the config
   *   (default 0) with MegaKernelParamCallback as callback; the values returned
   *   by the registration are uploaded right away.
   *
   * @param params parameter set providing addParam() and getConfig()
   */
  void initMegakernalConstants(HParams* params)
  {
    //TODO: check timing!!!
    float t = 1.0f;
    uint sTime = 2000;
    sTime = params->addParam(P_SleepTime, sTime);

    CUDA_CHECKED_CALL(cudaDeviceSynchronize());
    CUDA_CHECKED_CALL(cudaMemcpyToSymbol(maxExecTime, &t, sizeof(float)));
    CUDA_CHECKED_CALL(cudaMemcpyToSymbol(sleepTime, &sTime, sizeof(uint)));

    Config* config = params->getConfig();
    // The current device ordinal is handed to the callbacks packed into the
    // pointer argument. Check the query so that no indeterminate ordinal is
    // ever registered.
    int dev = 0;
    CUDA_CHECKED_CALL(cudaGetDevice(&dev));

    uint defSharedMemory = config->registerConfig<uint>("DefaultSharedMemory",
                                                        0U,
                                                        MegaKernelParamCallback,
                                                        reinterpret_cast<int*>(dev));
    CUDA_CHECKED_CALL(cudaMemcpyToSymbol(defaultSharedMemory,&defSharedMemory,sizeof(uint)));

    uint stopONW = config->registerConfig<uint>("StopOnNoWork",
                                              0U,
                                              MegaKernelParamCallback,
                                              reinterpret_cast<int*>(dev));
    CUDA_CHECKED_CALL(cudaMemcpyToSymbol(stopOnNoWork,&stopONW,sizeof(int)));



    //tellMegaKernelValues<<<1,1>>>();
    //CUDA_CHECKED_CALL(cudaDeviceSynchronize());
  }
}


#endif //SOFTSHELL_MEGAKERNEL_MEGAKERNEL_CUH_INCLUDED
