/*
  Softshell: Dynamic Scheduling on GPUs.
  http://www.icg.tugraz.at/project/mvp

  Copyright (C) 2012 Institute for Computer Graphics and Vision,
                     Graz University of Technology

  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
              Bernhard Kainz - kainz ( at ) icg.tugraz.at
              Michael Kenzel - kenzel ( at ) icg.tugraz.at
              Stefan Hauswiesner - hauswiesner ( at ) icg.tugraz.at
              Bernhard Kerbl - kerbl ( at ) icg.tugraz.at
              Dieter Schmalstieg - schmalstieg ( at ) icg.tugraz.at

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
*/

#ifndef SOFTSHELL_AGGREGATION_LOCALAGGREGATION_H_INCLUDED
#define SOFTSHELL_AGGREGATION_LOCALAGGREGATION_H_INCLUDED

#include "distributor/deviceentries.h"
#include "api/workpackage.h"
#include "api/workitem.h"
#include "megakernel/execstate.cuh"
#include "tools/common.cuh"
#include "timing/timesync.cuh"

/*
* file created by    Markus Steinberger / steinberger ( at ) icg.tugraz.at
*
* modifications by
*/

#include "megakernel/localprepull.cuh"
#include "aggregation/aggregation.cuh"

#ifndef USE_CUDA_HEAP
#include "memory/UseScatterAlloc.cuh"
#endif

namespace Softshell
{
  extern __constant__ uint LocalCombWpDataSize;
  extern __constant__ uint LocalCombMinLocalElements;
  extern __constant__ uint defaultSharedMemory;
  class LocalAggregation
  {

    // Allocates one chunk via RoundRobinAlloc::wpCheckedAlloc and constructs a
    // CombWorkpackageBase at its start. The chunk is laid out as
    //   [CombWorkpackageBase][paramSize uints][count * workitemSize bytes]
    //
    //  event, entryPoint, timeout - forwarded to the CombWorkpackageBase constructor
    //                               (timeout is added to getTimeCycles())
    //  proc         - not used by this function
    //  workitemSize - size of a single work item in bytes
    //  paramSize    - number of parameter uints (not bytes)
    //  pParam       - parameter values to copy in; may be 0, in which case the
    //                 parameter area is reserved but left unwritten
    //  count        - number of work items to reserve room for
    //
    // Prints an error and calls trap() if the allocation returned 0.
    inline __device__ static CombWorkpackageBase* createNewCWB(const DEvent& event, DeviceEntryProcedureFunc proc, const uint workitemSize, uint paramSize, const uint* pParam, uint count, uint entryPoint, uint timeout)
    {
      const size_t paramBytes = paramSize*sizeof(uint);
      uint req_size = sizeof(CombWorkpackageBase) + paramBytes + count*workitemSize;
      void* alloced = RoundRobinAlloc::wpCheckedAlloc(req_size);

      // The allocation has to be validated BEFORE anything is constructed in it;
      // checking the result of the placement-new afterwards is too late.
      if(alloced == 0)
      {
        printf("Error in Megakernel: could not create new CombWorkpackageBase, OUT OF MEMORY! \nABORT!\n");
        trap();
      }

      // parameters start right behind the header, the work items right behind the parameters
      unsigned char* paramArea = ((unsigned char*)alloced) + sizeof(CombWorkpackageBase);
      CombWorkpackageBase* nCWB = new(alloced) CombWorkpackageBase(event, count,
        paramArea + paramBytes,
        workitemSize,
        0,
        paramBytes,
        getTimeCycles() + timeout,
        count,
        entryPoint==1?1:2
        );

      if(paramSize)
      {
        nCWB->_pparams = paramArea;
        if(pParam != 0)
        {
          uint* paramOut = (uint*)paramArea;
          for(uint i = 0; i < paramSize; ++i)
            paramOut[i] = pParam[i];
        }
      }
      return nCWB;
    }
    // Typed convenience overload: takes the per-item byte size from WorkItem
    // and forwards everything else unchanged to the size-based createNewCWB.
    template<class WorkItem>
    inline __device__ static CombWorkpackageBase* createNewCWB(const DEvent& event, DeviceEntryProcedureFunc proc, uint paramSize, const uint* pParam, uint count, uint entryPoint, uint timeout)
    {
      const uint itemBytes = sizeof(WorkItem);
      CombWorkpackageBase* created = createNewCWB(event, proc, itemBytes, paramSize, pParam, count, entryPoint, timeout);
      return created;
    }

    // Claim-or-compare on a 32 bit slot.
    // A slot holding 0xDEADBEEF counts as unset: the caller tries to install m
    // with an atomicCAS and succeeds if it was the one replacing the marker.
    // In every other case the result is whether the value found in the slot
    // (by the plain read or by the failed CAS) equals m.
    inline __device__ static bool match(volatile uint* p, uint m)
    {
      uint seen = *p;
      if(seen == 0xDEADBEEF)
      {
        // slot looks unset, try to take it
        seen = atomicCAS((uint*)(p),0xDEADBEEF,m);
        if(seen == 0xDEADBEEF)
          return true;
      }
      // somebody else owns the slot (or owned it first): only equal values match
      return seen == m;
    }
    // 64 bit variant of the claim-or-compare: always issues the atomicCAS that
    // replaces the unset marker 0xDEADBEEFDEADBEEF by m. Returns true if this
    // call installed m, otherwise whether the value already stored equals m.
    inline __device__ static bool match(volatile unsigned long long* p, volatile unsigned long long m)
    {
      volatile unsigned long long previous = atomicCAS((unsigned long long*)(p),0xDEADBEEFDEADBEEFULL,m);
      if(previous == 0xDEADBEEFDEADBEEFULL)
      {
        // the slot was unset and now carries our value
        return true;
      }
      return previous == m;
    }
  public:
    // Adds the work item of every calling thread to a combined workpackage that
    // matches (event, proc, parameters, execthreads, entryPoint).
    //
    // The threads that reach the vote instructions together are ranked (myid) and
    // counted (count). The thread with rank 0 walks the chain of
    // LocalCombWorkpackage info structs stored in ___i_shared_all (from high
    // indices towards lower ones), claims fields that still hold the unset marker
    // 0xDEADBEEF via match()/atomicCAS, and reserves `count` element slots with a
    // single atomicAdd. The reserved offset is then handed to the other threads
    // through __share and every thread stores its own item.
    //
    //  paramSize   - size of the parameter block; it is rounded up to a number of
    //                uints below (presumably given in bytes by the caller - confirm)
    //  pParam      - parameter values, read as paramSize uints
    //  execthreads - threads needed per work item; clamped to at least 1
    //  timeout     - added to getTimeCycles() when a package gets its first element
    //  start       - index of an info struct (as returned by an earlier call) to
    //                try first without re-matching; -1 to search from the beginning
    //
    // Returns the index (in uints, relative to ___i_shared_all) of the info struct
    // the item was stored in, or -1 if the item was written into a
    // CombWorkpackageBase that was handed to d_aggregation->insert.
    template<class WorkItem>
    __device__ static int addWorkItem(const DEvent& event, DeviceEntryProcedureFunc proc, const WorkItem& workitem, uint paramSize, const uint* pParam, uint execthreads, uint entryPoint, uint timeout, int start = -1)
    {
#define DEBUG_ADDWORKITEM 0

      // execthreads is used as a divisor below, so it must not be 0
      execthreads = max(execthreads,1);

      //uint nparamsize = (paramSize + sizeof(uint)-1)/sizeof(uint);
      //printf("%d %d local addWorkItem called: event %d proc %llx, paramsize %d->%d, execthreads %d, entrypoint %d \n", blockIdx.x, threadIdx.x, event.getEventLaunchId(), proc, paramSize, nparamsize, execthreads, entryPoint);
      // from here on paramSize counts uints (rounded up)
      paramSize = (paramSize + sizeof(uint)-1)/sizeof(uint);
      BlockExecState* bes = __blockState();
      //check existing ones for match
      // size of one info struct measured in uints
      const int infostructsize = sizeof(LocalCombWorkpackage)/sizeof(uint);

      // if any participating thread supplied a start index, use it for all of them
      // (presumably __share broadcasts the value of the thread whose predicate is true - confirm)
      if(__any(start != -1))
        start = __share(start, start != -1);

      int current = start;
      if(current == -1)
      {
        // first info struct sits directly below the last 16 bytes of the shared region;
        // the uint at sharedOffsetEnd-1 is read below as the current top of the chain
        current = bes->sharedOffsetEnd-16/sizeof(uint);
        current = current - infostructsize;
      }
      // global workpackage allocated by rank 0 that has not been handed over yet
      CombWorkpackageBase* fullwp = 0;

      // uints needed below an info struct for the parameters plus the data of
      // LocalCombWpDataSize/execthreads work items
      int fullSpace = paramSize + (sizeof(WorkItem)*LocalCombWpDataSize/execthreads + sizeof(uint)-1)/sizeof(uint);
      int wantedSpace = fullSpace;
      // not enough room left: only ask for the parameters, the data then lives in global memory
      if( bes->sharedOffset > bes->sharedOffsetTop - wantedSpace)
        wantedSpace = paramSize;

      //for(uint mycoco = 0; mycoco < 1000; ++mycoco)
      while(true)
      {
        // rank of this thread among the threads voting here and their total number
        int myid = __popc(lanemask_lt() & __ballot(1));
        int count = __popc(__ballot(1));
        volatile LocalCombWorkpackage* wpinfo = (volatile LocalCombWorkpackage*)(___i_shared_all + current);

        // -1: nothing reserved in this info struct, 0xFFFFFF: go to global, otherwise first reserved slot
        int offset = -1;
        if(myid == 0)
        {
          if(current == start)
            //quick add
            // low 16 bits of entryPoint_elements are the element count
            offset = atomicAdd((uint*)&wpinfo->entryPoint_elements, count) & 0xFFFF;
          else
          {

            //check if we need a new one
            int top = ((volatile int*)___i_shared_all)[bes->sharedOffsetEnd-1];

            if(current - wantedSpace < top)
            {
              //we need to add more space
  #if DEBUG_ADDWORKITEM
              printf("%d %d we need to add more space: %d->%d\n",blockIdx.x, threadIdx.x,top,current - wantedSpace);
  #endif
              int allocoffset = 0;
  #if !defined(NO_LOCAL_AGGREGATION)

              allocoffset = bes->sharedAllocTopMin(current - wantedSpace);
              // full package did not fit: retry with room for the parameters only
              if(allocoffset == 0 && wantedSpace > paramSize)
              {
                wantedSpace =  paramSize;
    #if DEBUG_ADDWORKITEM
                printf("%d %d trying to alloc infostruct + params: %d->%d\n",blockIdx.x, threadIdx.x,top, current-wantedSpace);
    #endif
                allocoffset = bes->sharedAllocTopMin(current - wantedSpace);
              }
  #endif

  #if DEBUG_ADDWORKITEM
              printf("%d %d alloc result: %d\n",blockIdx.x, threadIdx.x,allocoffset);
  #endif
              if(allocoffset == 0)
              {
  #if DEBUG_ADDWORKITEM
                printf("%d %d could not even alloc info struct, so we need to go to global right away\n",blockIdx.x, threadIdx.x);
  #endif
                //could not even alloc info struct, so we need to go to global right away
                int alloccount = min(count, runningThreads/execthreads);
                // a workpackage left over from an earlier iteration is reused only if it is large enough
                if(fullwp != 0 && LocalCombWpDataSize/execthreads < alloccount)
                {
                  free((void*)fullwp);
                  fullwp = 0;
                }
                if(fullwp == 0)
                  fullwp = createNewCWB<WorkItem>(event, proc, paramSize, pParam, alloccount, entryPoint, timeout);
                else
                  fullwp->_elements = alloccount;
                // sentinel: the items of this round go into fullwp
                offset = 0xFFFFFF;
              }
              else
              {
                top = allocoffset;
  #if DEBUG_ADDWORKITEM
                printf("%d %d updating top to %d\n",blockIdx.x, threadIdx.x,top);
  #endif
                atomicMin(___i_shared_all+bes->sharedOffsetEnd-1, top);
              }
            }


            if(offset == -1)
            {
              //check for offset
              uint currentoffset = wpinfo->offset;
  #if DEBUG_ADDWORKITEM
              printf("%d %d %d current offset is %d\n",blockIdx.x, threadIdx.x, current, currentoffset);
  #endif
              // offset still unset: try to claim this info struct with our space requirement
              if(currentoffset == 0xDEADBEEFU)
              {
                currentoffset = atomicCAS((int*)&(wpinfo->offset),0xDEADBEEF,wantedSpace);
                if(currentoffset == 0xDEADBEEFU)
                {
                  currentoffset = wantedSpace;
  #if DEBUG_ADDWORKITEM
                  printf("%d %d set offset for %d to %d\n",blockIdx.x, threadIdx.x, current, wantedSpace);
  #endif
                }
              }

              //check for match or set
  #if DEBUG_ADDWORKITEM
              printf("%d %d going to check match %d @ %llx (%llx), %d == %d || %d == %d && %x & 0x80000000 (%x)\n",blockIdx.x, threadIdx.x,current, wpinfo, proc,currentoffset,fullSpace,currentoffset,paramSize,wpinfo->locked_execthreads,wpinfo->locked_execthreads &  0x80000000);
  #endif

              // only info structs whose size fits one of our two layouts and whose
              // bit 31 of locked_execthreads is set are candidates
              if(currentoffset == fullSpace || currentoffset == paramSize)
              if(wpinfo->locked_execthreads &  0x80000000)
              {
                //check entryPoint match and set elements
                uint currentEntryPointElements = wpinfo->entryPoint_elements;
                if( currentEntryPointElements == 0xDEADBEEFU )
                {
                  // unset: install our entry point in the high 16 bits with zero elements
                  currentEntryPointElements = atomicCAS((int*)(&wpinfo->entryPoint_elements),0xDEADBEEF,entryPoint << 16);
                  if(currentEntryPointElements == 0xDEADBEEFU)
                    currentEntryPointElements = entryPoint << 16;
                }
                // same entry point, still free slots, and every key field either matches or could be claimed
                if( (currentEntryPointElements & 0xFFFF0000) == (entryPoint << 16) && (currentEntryPointElements & 0xFFFF) < LocalCombWpDataSize/execthreads)
                if(match(&wpinfo->eventLaunchId,event.getEventLaunchId()))
                if(match(&wpinfo->eventId,event.getEventId()))
                if(match(&wpinfo->func, (PointerEquivalent)proc))
                if(match(&wpinfo->paramSize,paramSize+1))
                if(match(&wpinfo->locked_execthreads,(entryPoint>1?0xC0000000:0x80000000)|execthreads))
                {
                  //if((wpinfo->locked_execthreads & 0x0000FFFF) != 1)
                  //{
                  //  printf("nonono %x mine: %x\n",wpinfo->locked_execthreads,(entryPoint>1?0xC0000000:0x80000000)|execthreads);
                  //  trap();
                  //}
  #if DEBUG_ADDWORKITEM
                  printf("%d %d for %d everything matches so far, going to check params\n",blockIdx.x, threadIdx.x, current);
  #endif
                  bool doesmatch = true;
                  //check for param match
                  // the parameters are stored currentoffset uints below the info struct
                  volatile uint* p = ((uint*)wpinfo)-currentoffset;
                  //compare param
                  for(uint i = 0; i < paramSize && doesmatch; ++i, ++p)
                    doesmatch = match(p, pParam[i]);

                  if(doesmatch)
                  {
  #if DEBUG_ADDWORKITEM
                    printf("%d %d for %d params do match, currentoffset %d\n",blockIdx.x, threadIdx.x, current, currentoffset);
  #endif
                    //params do match, so check datapointer
                    if((PointerEquivalent)wpinfo->data ==  0xDEADBEEFDEADBEEFULL)
                    {
                      if(currentoffset == paramSize)
                      {
                        //need global data pointer
                        if(fullwp == 0)
                          fullwp = createNewCWB<WorkItem>(event, proc, paramSize, 0, LocalCombWpDataSize/execthreads, entryPoint, timeout);
                        // data area of the global workpackage: behind its header and parameters
                        unsigned char* pdata = (unsigned char*)((uint*)(fullwp + 1) + paramSize);
                        if(atomicCAS((PointerEquivalent*)&wpinfo->data, 0xDEADBEEFDEADBEEFULL,(PointerEquivalent)pdata) == 0xDEADBEEFDEADBEEFULL)
                        {
  #if DEBUG_ADDWORKITEM
                          printf("%d %d I set global pointer for %d to %llx\n",blockIdx.x, threadIdx.x, current,pdata);
  #endif
                          // the info struct references the allocation now, so it must not be freed or reused here
                          fullwp = 0;
                        }
                      }
                      else
                      {
                        //just set data pointer to shared
                        wpinfo->data = (void*)(((uint*)wpinfo) - currentoffset + paramSize);
  #if DEBUG_ADDWORKITEM
                        printf("%d %d I set shared pointer for %d to %llx\n",blockIdx.x, threadIdx.x, current,p + paramSize);
  #endif
                      }
                    }

                    //try to add elements
                    offset = atomicAdd((uint*)&wpinfo->entryPoint_elements, count) & 0xFFFF;
                    if(offset == 0)
                    {
  #if DEBUG_ADDWORKITEM
                      printf("%d %d I am the first to set elements for the new one %d\n",blockIdx.x, threadIdx.x, current);
  #endif
                      //i am the first, i set the time
                      wpinfo->timeout = getTimeCycles() + timeout;
                      d_eventManager.newWorkpackagesForEvent(event);
                    }
  #if DEBUG_ADDWORKITEM
                    printf("%d %d  %d offset is %d for %d spots: %d\n",blockIdx.x, threadIdx.x,current,offset,count,LocalCombWpDataSize);
  #endif
                  }
                }
              }
              //printf("%d %d my new offset is %d\n",blockIdx.x, threadIdx.x,offset);
            }
          }
        }
        // hand the result of rank 0 to the other participating threads
        offset = __share(offset, myid == 0);
#if DEBUG_ADDWORKITEM
        //printf("%d %d my new offset is %d\n",blockIdx.x, threadIdx.x,offset);
#endif
        if(offset == 0xFFFFFF)
        {
          // only as many threads as the global workpackage was sized for store their
          // item here; the remaining ones continue with the next loop iteration
          if(myid < runningThreads/execthreads)
          {
            fullwp = (CombWorkpackageBase*)__share((PointerEquivalent)(fullwp), myid == 0);
            WorkItem* wip = (WorkItem*)(((unsigned char*)fullwp) + sizeof(CombWorkpackageBase) + paramSize*sizeof(uint));

            wip[myid] = workitem;
            //volatile uint* outwpdata = (volatile uint*)(wip + myid);
            //uint* cwpdata = (uint*)(&workitem);
            //for(int i = 0; i < sizeof(WorkItem)/sizeof(uint); ++i)
            //  outwpdata[i] = cwpdata[i];
            __threadfence();

            // re-rank among the threads that took this branch; the first one publishes the workpackage
            myid = __popc(lanemask_lt() & __ballot(1));
            if(myid == 0)
            {

  #if DEBUG_ADDWORKITEM
              printf("%d %d pushing wp %llx to global aggregation\n",blockIdx.x, threadIdx.x,fullwp);
  #endif
              //printf("%2d addworkitem  %llx is sent to global for %d %d %llx %d\n",blockIdx.x,fullwp,event.getEventLaunchId(), entryPoint,proc, execthreads);
              d_aggregation->insert(event.getEventLaunchId(), entryPoint, proc, execthreads, fullwp);
              d_eventManager.newWorkpackagesForEvent(event);
            }
            return -1;
          }
        }
        else if(offset >= 0)
        {
          //put it in
          // threads whose slot lies beyond the capacity fall through and try the next info struct
          if(offset + myid < LocalCombWpDataSize/execthreads)
          {
#if DEBUG_ADDWORKITEM
            if(myid == 0)
            printf("%d %d putting element in %d @ %d->%d (<%d)\n",blockIdx.x, threadIdx.x,current,offset + myid,min(offset + count,LocalCombWpDataSize/execthreads-1), LocalCombWpDataSize/execthreads);
#endif
            WorkItem* wip = (WorkItem*)wpinfo->data;
            wip[offset+myid] = workitem;
            //volatile uint* outwpdata = (volatile uint*)(wip + offset+myid);
            //uint* cwpdata = (uint*)(&workitem);
            //for(int i = 0; i < sizeof(WorkItem)/sizeof(uint); ++i)
            //  outwpdata[i] = cwpdata[i];
            __threadfence_block();
            //i allocated something, but nobody needs it now :(
            if(fullwp != 0)
              free((void*)fullwp);
            return current;
          }
        }
        // step to the next info struct, which lies below this one and its payload
        current = current - infostructsize - wpinfo->offset;
        //if(myid == 0)
         // printf("%d %d going further to %d\n",blockIdx.x, threadIdx.x,current);
      }
      //printf("LOCALAGGRATION ENDLESS LOOP?\n");
    }
  private:
    // Moves one local workpackage (its info struct plus the wpinfo->offset uints
    // stored below it) inside ___i_shared_all so that the info struct ends up at
    // index dest instead of index current.
    //
    // Has to be called by all threads that take part in syncthreads(0)
    // (presumably the whole block - confirm against the syncthreads helper).
    // If the payload pointer refers to shared memory it is redirected to the new
    // location before the words are moved. The words are transferred from the
    // highest index downwards, blockDim.x words per round, and every round first
    // reads all its words and only then writes them.
    //
    // Returns dest reduced by the number of moved uints, i.e. the index at which
    // the next info struct below this package would be placed.
    inline __device__ static int copyToDest(int current, int dest)
    {
      const int headerWords = sizeof(LocalCombWorkpackage)/sizeof(uint);
      LocalCombWorkpackage* package = (LocalCombWorkpackage*)(___i_shared_all + current);
      const int totalWords = headerWords + package->offset;
      const int nextDest = dest - totalWords;

      syncthreads(0);
      if(dest != current)
      {
        // only a single thread patches the payload pointer, and only if it points into shared memory
        if(threadIdx.x == 0 && isshared(package->data))
          package->data =  (void*)(___i_shared_all + dest - package->offset + package->paramSize-1);

        volatile uint* words = (volatile uint*)___i_shared_all;
        const int srcTop = current + headerWords - 1;
        const int dstTop = dest + headerWords - 1;
        for(int round = 0; round < totalWords; round += blockDim.x)
        {
          // distance of this thread's word from the top of the package
          const unsigned int k = round + threadIdx.x;
          const bool inRange = k < totalWords;

          syncthreads(0);
          uint value = 0;
          if(inRange)
            value = words[srcTop - k];
          syncthreads(0);
          if(inRange)
            words[dstTop - k] = value;
        }
        syncthreads(0);
      }
      return nextDest;
    }

  public:
    // Resets the top marker of the local aggregation chain: the uint at index
    // sharedOffsetEnd-1 of ___i_shared_all is set to the block's sharedOffsetTop.
    __device__ static void init()
    {
      BlockExecState* state = __blockState();
      ___i_shared_all[state->sharedOffsetEnd-1] = state->sharedOffsetTop;
    }
    __device__ static int maintainFirst(volatile LocalPrePull* lPrePull, int maxfillin, int maxkeep = 1000, const bool forcePutIn = false, const bool putOutAll = false, int needFreeMem = 0)
    {
      //TODO: use priority instead of the maxfillin to determine what should go to global and what can stay

////debug
//		putOutAll = true;
//		//TODO: make putOutAll const again!
//		maxfillin = 0;
//		maxkeep = 0;
//		//

      //

#define DEBUG_MAINTAIN 0


#if DEBUG_MAINTAIN
      if(threadIdx.x == 0)
          printf("%2d maintainFirst called: %d %d %d %d %d \n",blockIdx.x,maxfillin,maxkeep,forcePutIn,putOutAll,needFreeMem);
#endif

      syncthreads(0);
      BlockExecState* bes = __blockState();
      ExecState* es = (ExecState*)__sharedMemPointer(sizeof(BlockExecState)+sizeof(LocalPrePull));
      int current = bes->sharedOffsetEnd-16/sizeof(uint);
      int top = ___i_shared_all[__blockState()->sharedOffsetEnd-1];
      volatile uint& comm(___i_shared_all[__blockState()->sharedOffsetEnd-2]);
      volatile PointerEquivalent& comm2(*((volatile PointerEquivalent*)(___i_shared_all+__blockState()->sharedOffsetEnd-4)));
      const int infostructsize = sizeof(LocalCombWorkpackage)/sizeof(uint);
      int nextcurrent = current - infostructsize;
      int destination = nextcurrent;


      const uint CommNothing = 0;
      const uint CommRemove_DontCopy = 1;
      const uint CommIssue = 2;
      const uint CommExecuting = 3;


      uint front = lPrePull->front;
      uint back = lPrePull->back;
      clock64_t ctime = getTimeCycles();
      //int mycoco;
      //for(mycoco = 0; mycoco < 1000; ++mycoco)

      syncthreads(0);
      while(true)
      {
        current = nextcurrent;
        if(current < top)
          break;
        LocalCombWorkpackage* wpinfo = (LocalCombWorkpackage*)(___i_shared_all + current);
        if(wpinfo->offset == 0xDEADBEEF)
          break;
        nextcurrent = current - infostructsize - wpinfo->offset;

        uint spots = wpinfo->spots();
        if(threadIdx.x == 0)
        {

          ////debug
          //if(wpinfo->locked_execthreads == 0xDEADBEEF ||
          //wpinfo->entryPoint_elements == 0xDEADBEEF ||
          //wpinfo->eventLaunchId == 0xDEADBEEF ||
          //wpinfo->eventId == 0xDEADBEEF ||
          //wpinfo->func == 0xDEADBEEF ||
          //wpinfo->paramSize == 0xDEADBEEF ||
          //(wpinfo->locked_execthreads& 0xFFFF) > 1)
          //{
          //  printf("%2d DEAD ELEMENT IN MAINTAIN %d (top: %d/%d):\n"
          //    "\tlocked_execthreads: %x\n"
          //    "\tentryPoint_elements: %x\n"
          //    "\teventLaunchId: %x\n"
          //    "\teventId: %x\n"
          //    "\tfunc: %llx\n"
          //    "\tparamSize: %x\n"
          //    "\toffset: %d\n"
          //    "\tdata: %llx\n"
          //    ,blockIdx.x, current, top, bes->sharedOffsetTop,
          //    wpinfo->locked_execthreads, wpinfo->entryPoint_elements,
          //    wpinfo->eventLaunchId,wpinfo->eventId,wpinfo->func,wpinfo->paramSize,
          //    wpinfo->offset, wpinfo->data);
          //  trap();
          //}
          ////

#if DEBUG_MAINTAIN
          printf("%d maintaining element: %d \n",blockIdx.x,current);
#endif
          comm = CommNothing;
          if((wpinfo->locked_execthreads & 0x80000000) == 0)
          {
            comm = CommExecuting;
            uint f = (wpinfo->locked_execthreads >> 16)&0xFF;
#if DEBUG_MAINTAIN
            printf("%2d - %d is issued: front %d back %d f: %d \n",blockIdx.x,current,front,back,f);
#endif
            //check if executed
            if(lPrePull->count == 0 ||
              (front < back && (f < front || f >= back)) ||
              (front > back && f < front && f >= back))
            {
              //has already been executed
#if DEBUG_MAINTAIN
              printf("%2d - %d has already been executed, reducing needFreeMem: %d -> %d\n",blockIdx.x,current,needFreeMem,needFreeMem-(current - nextcurrent));
#endif
              needFreeMem -= current - nextcurrent;
              comm = CommRemove_DontCopy;
            }
            else if(--maxkeep < 0 || needFreeMem > 0)
            {
#if DEBUG_MAINTAIN
              printf("%2d - %d removing the issued one:  reducing needFreeMem: %d -> %d\n",blockIdx.x,current,needFreeMem,needFreeMem-es[f].sharedOffset);
#endif
              needFreeMem -= es[f].sharedOffset;
              comm = CommIssue;

              //remove it if possible
              if(f == lPrePull->front)
              {
#if DEBUG_MAINTAIN
                printf("%2d - %d issued is front, can remove it\n",blockIdx.x,current);
#endif
                --lPrePull->count;
                lPrePull->front = (lPrePull->front+1)%lPrePull->size;
              }
              else if(f == lPrePull->back-1)
              {
#if DEBUG_MAINTAIN
                printf("%2d - %d issued is back, can remove it\n",blockIdx.x,current);
#endif
                --lPrePull->count;
                lPrePull->back = (lPrePull->back+lPrePull->size-1)%lPrePull->size;
              }
              else if(!putOutAll)
              {
#if DEBUG_MAINTAIN
        printf("%2d - %d issued is neither front nor back, so we just unset it\n",blockIdx.x,current);
#endif
                es[f].nThreads = 0;
                es[f].sharedOffset = 0;
              }


            }
            else
            {
#if DEBUG_MAINTAIN
              printf("%2d - %d issued will be copied from %d to %d\n",blockIdx.x,current,current,destination);
#endif
              //we will copy this one..
              es[f].wp = (PointerEquivalent)(___i_shared_all + destination);
            }
          }
          else if(wpinfo->entryPoint_elements != 0xDEADBEEF && (wpinfo->entryPoint_elements&0xFFFF) > 0)
          {

            if(wpinfo->timeout < ctime || (wpinfo->entryPoint_elements & 0xFFFF) >= spots || (forcePutIn && maxfillin > 0) || putOutAll)
            {
#if DEBUG_MAINTAIN
              printf("%2d - element %d should be issued(%llx): %llx < %llx  || %d >= %d || (%d && %d > 0)\n",blockIdx.x,current,wpinfo->func,wpinfo->timeout,ctime,wpinfo->entryPoint_elements& 0xFFFF,spots, forcePutIn, maxfillin);
#endif
              comm = CommIssue;
            }
            else if(--maxkeep < 0 || needFreeMem > 0)
              comm = CommIssue;
          }
          else
          {
#if DEBUG_MAINTAIN
            printf("%2d - element %d is empty\n",blockIdx.x,current);
#endif
            if((PointerEquivalent)wpinfo->data != 0xDEADBEEFDEADBEEFULL &&  !isshared(wpinfo->data))
            {
              free(((unsigned char*)wpinfo->data) - sizeof(CombWorkpackageBase) - (wpinfo->paramSize-1)*sizeof(uint));
            }
            comm = CommRemove_DontCopy;
          }
        }
        syncthreads(0);
        if(comm == CommIssue)
        {
          if((wpinfo->entryPoint_elements&0xFFFF)*(wpinfo->locked_execthreads&0xFFFF) >= LocalCombMinLocalElements && maxfillin > 0 && needFreeMem <= 0)
          {
            --maxfillin;
            int newLoc = destination;
            destination = copyToDest(current, destination);
            comm = CommRemove_DontCopy;

            //insert at the front of the queue
            if(threadIdx.x == 0)
            {
#if DEBUG_MAINTAIN
              printf("%2d - issuing %d\n",blockIdx.x,current);
#endif

              lPrePull->count++;
              int f = (lPrePull->front-1 + lPrePull->size) % lPrePull->size;
              lPrePull->front = f;
              LocalCombWorkpackage* wpinsert = (LocalCombWorkpackage*)(___i_shared_all + newLoc);
              es[f].wp = (PointerEquivalent)wpinsert;
              es[f].proc = wpinsert->func;
              uint nthreads = min(spots,wpinsert->entryPoint_elements & 0xFFFF)*(wpinsert->locked_execthreads&0xFFFF);
              es[f].nThreads = nthreads;
              uint reqShared = 0;
              ProcedureInfo* info = d_procedureManager->get((DeviceEntryProcedureFunc)wpinsert->func);
              if(info != 0)
                reqShared = (info->sharedMem + nthreads*info->sharedMemPerThread+15)/16*16;
              else
                reqShared = defaultSharedMemory;
              es[f].sharedOffset = reqShared;

#if DEBUG_MAINTAIN
              printf("%2d local aggregation issuing workpackage %llx %llx for event %d %d to be executed by %d * %d (%d,%d) threads with %d bytes of smem and stored it at %d data: %llx paramsize: %d offset: %d\n",
                blockIdx.x, wpinsert->func, (PointerEquivalent)wpinsert, ((CombWorkpackageBase*)wpinsert)->getEvent().getEventId(), ((CombWorkpackageBase*)wpinsert)->getEvent().getEventLaunchId(), wpinfo->locked_execthreads&0xFFFF, min(spots,wpinsert->entryPoint_elements & 0xFFFF), spots, wpinsert->entryPoint_elements & 0xFFFF,reqShared, f, wpinsert->data, wpinsert->paramSize, wpinsert->offset);
#endif
              uint maxSpace = (bes->sharedOffsetEnd-bes->sharedOffset)*sizeof(uint);
              if(reqShared > maxSpace)
              {
                printf("%2d ERROR: Megakernel has not got enough shared memory (%d < %d:%d*%d*%d+%d) to execute procedure with custom id %d (%llx)\n  Increase \"MegakernelSharedMemory\"\n",
                  blockIdx.x,maxSpace, reqShared,  min(spots,wpinsert->entryPoint_elements & 0xFFFF), (wpinfo->locked_execthreads&0xFFFF),info->sharedMemPerThread,info->sharedMem, info->customId,wpinsert->func);
                trap();
              }
              wpinsert->locked_execthreads = (wpinsert->locked_execthreads & 0x40000000) | (f  << 16) | (wpinsert->locked_execthreads & 0xFFFF);
            }
          }
          else
          {
            //this one needs to go to the global thread aggregation
            uint elements = min(spots,wpinfo->entryPoint_elements & 0xFFFF);
            if(isshared(wpinfo->data))
            {
              //we need to alloc a new one and make a full copy
              uint elementsize = sizeof(uint)*((wpinfo->offset-(wpinfo->paramSize-1))/spots);
              if(threadIdx.x == 0)
              {
#if DEBUG_MAINTAIN
                printf("%2d - creating a full copy of %d and sending it to global (%d elements of size %d with params of size %dfor entry point %d)\n",blockIdx.x,current,elements,elementsize,wpinfo->paramSize-1, wpinfo->entryPoint_elements >> 16);
#endif
                comm2 = (PointerEquivalent) createNewCWB(DEvent(wpinfo->eventId, wpinfo->eventLaunchId), (DeviceEntryProcedureFunc)wpinfo->func, elementsize, wpinfo->paramSize-1, 0, elements, wpinfo->entryPoint_elements >> 16, 0);
                __threadfence_block();
              }
              syncthreads(0);
              volatile CombWorkpackageBase* cwb = (volatile CombWorkpackageBase*)(comm2);
              uint* data = ((uint*)wpinfo->data);
              volatile uint* poutdata = ((volatile uint*)(cwb + 1)) + (wpinfo->paramSize-1);
              for(uint i = threadIdx.x; i < elementsize*elements/sizeof(uint); i+= blockDim.x)
                poutdata[i] = data[i];
              __threadfence();
            }
            else if(threadIdx.x == 0)
            {

#if DEBUG_MAINTAIN
              printf("%2d - element %d is partially in global memory, copying the data and sending it off (%d elements)  !(%d >= %d && %d > 0)\n",blockIdx.x,current,elements,wpinfo->entryPoint_elements&0xFFFF, LocalCombMinLocalElements, maxfillin);
#endif
              volatile CombWorkpackageBase* cwb = (volatile CombWorkpackageBase*)(((unsigned char*)wpinfo->data) - sizeof(CombWorkpackageBase) - (wpinfo->paramSize-1)*sizeof(uint));

              ((WorkpackageEquivalent*)cwb)->comboWorkpackage = (wpinfo->entryPoint_elements >> 16)==1?0x1:0x40000001;
              cwb->_elements = elements;
              cwb->_timeout = wpinfo->timeout;
              comm2 = (PointerEquivalent)(cwb);
            }
            syncthreads(0);

            //cpy params
            volatile CombWorkpackageBase* cwb = (volatile CombWorkpackageBase*)(comm2);
            volatile uint* pWpParams = ((volatile uint*)wpinfo) - wpinfo->offset;
            volatile uint* pOutParams = (volatile uint*)(cwb + 1);
            for(uint i = threadIdx.x; i < wpinfo->paramSize-1; i+= blockDim.x)
            {
              pOutParams[i] = pWpParams[i];
            }
            __threadfence();
            syncthreads(0);
            if(threadIdx.x == 0)
            {
              //printf("%d - %d the copy: \n",blockIdx.x,current);
              //printf("\teventId: %d\n\teventLaunchId: %d\n\tcomboWorkpackage: %d\n\t_capacity: %d\n\t_elements: %d \n",  cwb->getEvent().getEventId(), cwb->getEvent().getEventLaunchId(), cwb->isCombo(), cwb->_capacity, cwb->_elements);
              //printf("\t_elementSize: %d\n\t_next: %llx\n\t_paramsSize: %d\n\t_pdata: %llx\n\t_pparams: %llx\n\t_timeout: %llx\n",cwb->_elementSize, cwb->_next, cwb->_paramsSize, cwb->_pdata, cwb->_pparams, cwb->_timeout);
              //


              uint nthreads = wpinfo->locked_execthreads & 0xFFFF;
              #if DEBUG_MAINTAIN
              printf("%2d element %d -> %llx is sent to global for %dx%d threads, reduced needFreeMem: %d -> %d  \n",
                blockIdx.x,current,cwb,min(spots,wpinfo->entryPoint_elements & 0xFFFF),nthreads,needFreeMem,needFreeMem-(current - nextcurrent));
              #endif
              needFreeMem -= current - nextcurrent;
              //printf("%2d maintain - elemenet %d -> %llx is sent to global for %d %d %llx %d\n",blockIdx.x,current,cwb,wpinfo->eventLaunchId, wpinfo->entryPoint_elements >> 16,wpinfo->func, nthreads);
              d_aggregation->insert(wpinfo->eventLaunchId, wpinfo->entryPoint_elements >> 16, (DeviceEntryProcedureFunc)wpinfo->func, nthreads, (CombWorkpackageBase*)cwb);
            }

            comm = CommRemove_DontCopy;
          }
        }
        syncthreads(0);
        if(comm != CommRemove_DontCopy)
        {
#if DEBUG_MAINTAIN
          if(threadIdx.x == 0)
            printf("%2d copy: %d -> %d\n",blockIdx.x,current, destination);
#endif
          destination = copyToDest(current, destination);
        }
#if DEBUG_MAINTAIN
        else if(threadIdx.x == 0)
          printf("%2d - no need to copy %d\n",blockIdx.x,current);
#endif
        syncthreads(0);
      }
      syncthreads(0);
      //if(mycoco >= 1000)
      //{
      // printf("endless loop in maintain\n");
      // trap();
      //}
      //set rest to DEADBEEF
      int newtop = destination + infostructsize;
      int oldtop = ___i_shared_all[__blockState()->sharedOffsetEnd-1];

#if DEBUG_MAINTAIN
      if(threadIdx.x == 0 && newtop != oldtop)
        printf("%2d - creating deadbeef: %d -> %d\n",blockIdx.x,oldtop,newtop);
#endif
      for(int i = threadIdx.x; i < newtop-oldtop; i += blockDim.x)
      {
        ___i_shared_all[oldtop + i] = 0xDEADBEEF;
      }
      syncthreads(0);
      if(threadIdx.x == 0)
      {
        ___i_shared_all[__blockState()->sharedOffsetEnd-1] = newtop;
        bes->sharedOffsetTop = newtop;
        comm = needFreeMem;
      }

      syncthreads(0);
      return comm;

    }
    // Makes sure the shared-memory region starting at the block's sharedOffset
    // can hold requiredShared more units (same unit as sharedOffset /
    // sharedOffsetTop -- presumably uint words; confirm against BlockExecState).
    //
    // lPrePull        forwarded unchanged to maintainFirst
    // requiredShared  amount that has to fit between sharedOffset and sharedOffsetTop
    //
    // Returns 0 when sharedOffsetTop already lies at or above the required
    // threshold (nothing is done), and 1 when maintainFirst had to be invoked.
    //
    // NOTE(review): the value returned by maintainFirst is ignored, so this
    // function reports 1 even if not enough shared memory could be freed.
    // NOTE(review): maintainFirst presumably synchronizes the block, so all
    // threads of the block are expected to take the same path here -- confirm.
    __device__ static int maintainAfter(volatile LocalPrePull* lPrePull, uint requiredShared)
    {
      BlockExecState* blockState = __blockState();
      const int needed = blockState->sharedOffset + requiredShared;

      // enough room already: no maintenance pass necessary
      if(blockState->sharedOffsetTop >= needed)
        return 0;

      // ask the maintenance pass to free the missing amount
      maintainFirst(lPrePull, 0, 1000, false, false, needed - blockState->sharedOffsetTop);
      return 1;
    }
    // Adds a work item with an attached parameter block for PROCEDURE, grouping the
    // calling threads of a warp by parameter value: in every round the threads whose
    // parameter words equal those of the round's leader (local_id == 0) forward to
    // the non-template addWorkItem overload and return; all other threads retry.
    //
    // event       event the work item is issued for (forwarded)
    // workitem    the work item to add (forwarded)
    // param       parameter block; compared word-wise (uint) against the leader's block
    // entryPoint  forwarded unchanged
    // timeout     0xFFFFFFFF selects DefaultWorkItemTimeout
    // start       forwarded unchanged
    // numThreads  0 selects PROCEDURE::ExecThreads
    //
    // Returns the result of the forwarded addWorkItem call.
    //
    // NOTE(review): __ballot is the legacy mask-less warp vote; it is not available
    // on Volta and newer architectures, where __ballot_sync is required.
    // NOTE(review): __share and lanemask_lt are defined outside this section; the
    // comments below assume __share(value, cond) hands every participating thread
    // the value of the thread for which cond is true -- confirm.
    template<class PROCEDURE, class WorkItem, class Param>
    __device__ static int addWorkItemParams(const DEvent& event, const WorkItem& workitem, const Param& param, uint entryPoint = 1, uint timeout = 0xFFFFFFFF, int start = -1, int numThreads = 0)
    {
      // replace the "use default" sentinels by the actual defaults
      if(timeout == 0xFFFFFFFF)
        timeout = DefaultWorkItemTimeout;
      if(numThreads == 0)
        numThreads = PROCEDURE::ExecThreads;
      // loop until this thread has been part of a group with matching params
      while(true)
      {
        //check for equal params in warp
        // number of voting threads with a lower lane id (assuming lanemask_lt()
        // yields the mask of lower lanes); 0 marks the leader of this round
        uint local_id = __popc(lanemask_lt() & __ballot(1));

        //check for params match
        // parameter size of the leader
        uint s = __share(static_cast<uint>(sizeof(Param)), local_id == 0);

        if(s == sizeof(Param))
        {
          bool participate = true;
          // word-wise comparison with the leader's parameter block
          // NOTE(review): the word count is rounded up, so for a sizeof(Param) that
          // is not a multiple of sizeof(uint) the last read extends past param
          for(uint i = 0; i < (sizeof(Param)+sizeof(uint)-1)/sizeof(uint); ++i)
          {
            uint part = __share(((uint*)(&param))[i], local_id == 0);
            participate = participate && (part == ((uint*)(&param))[i]);
          }
          // identical params: add the work item together with the leader's group
          if(participate)
            return addWorkItem(event, EntryProcedure(PROCEDURE), workitem, sizeof(Param), (const uint*)(&param), numThreads, entryPoint, timeout, start);
        }
      }
    }

    // Adds a work item with an attached parameter block for PROCEDURE without the
    // per-warp parameter comparison done by addWorkItemParams; the call is forwarded
    // directly to the non-template addWorkItem overload.
    // NOTE(review): presumably the caller guarantees that all threads adding
    // together pass identical params -- confirm against the callers.
    //
    // event       event the work item is issued for (forwarded)
    // workitem    the work item to add (forwarded)
    // param       parameter block, passed on as sizeof(Param) bytes of uint data
    // entryPoint  forwarded unchanged; defaults to 1 like in addWorkItemParams and
    //             addWorkItem so that the three signatures are consistent
    // timeout     0xFFFFFFFF selects DefaultWorkItemTimeout
    // start       forwarded unchanged
    // numThreads  0 selects PROCEDURE::ExecThreads
    //
    // Returns the result of the forwarded addWorkItem call.
    template<class PROCEDURE, class WorkItem, class Param>
    __device__ static int addWorkItemEqualParams(const DEvent& event, const WorkItem& workitem, const Param& param, uint entryPoint = 1, uint timeout = 0xFFFFFFFF, int start = -1, int numThreads = 0)
    {
      // replace the "use default" sentinels by the actual defaults
      if(timeout == 0xFFFFFFFF)
        timeout = DefaultWorkItemTimeout;
      if(numThreads == 0)
        numThreads = PROCEDURE::ExecThreads;

      const uint* paramWords = (const uint*)(&param);
      return addWorkItem(event, EntryProcedure(PROCEDURE), workitem, sizeof(Param), paramWords, numThreads, entryPoint, timeout, start);
    }
    // Convenience overload: adds a work item for PROCEDURE without any parameter
    // block (parameter size 0, null parameter pointer) by forwarding to the
    // non-template addWorkItem overload.
    //
    // event       event the work item is issued for (forwarded)
    // workitem    the work item to add (forwarded)
    // entryPoint  forwarded unchanged
    // timeout     0xFFFFFFFF selects DefaultWorkItemTimeout
    // start       forwarded unchanged
    // numThreads  0 selects PROCEDURE::ExecThreads
    //
    // Returns the result of the forwarded addWorkItem call.
    template<class PROCEDURE, class WorkItem>
    __device__ static int addWorkItem(const DEvent& event, const WorkItem& workitem, uint entryPoint = 1, uint timeout = 0xFFFFFFFF, int start = -1, int numThreads = 0)
    {
      // resolve the sentinel values; the casts yield plain values of the
      // parameter types, exactly like the assignments they stand for
      timeout = (timeout != 0xFFFFFFFF) ? timeout : static_cast<uint>(DefaultWorkItemTimeout);
      numThreads = (numThreads != 0) ? numThreads : static_cast<int>(PROCEDURE::ExecThreads);

      return addWorkItem(event, EntryProcedure(PROCEDURE), workitem, 0, 0, numThreads, entryPoint, timeout, start);
    }

  };

  // Default arguments of the emitWorkItem* functions below.

  // 0xFFFFFFFF is the sentinel the LocalAggregation::addWorkItem* methods replace
  // by DefaultWorkItemTimeout.
  const uint WorkItemDefaultTimeout = 0xFFFFFFFF;
  // Forwarded unchanged as "start"; -1 matches the default of the
  // LocalAggregation::addWorkItem* methods.
  const int WorkItemDefaultStart = -1;
  // 0 is the sentinel the LocalAggregation::addWorkItem* methods replace by
  // PROCEDURE::ExecThreads.
  const int WorkItemDefaultNumThreads = 0;

  // Emits a work item without parameters for PROCEDURE at entry point 1.
  // Thin wrapper around LocalAggregation::addWorkItem<PROCEDURE,WORKITEM>;
  // timeout, start and numThreads are passed through, and the wrapped call's
  // result is returned.
  template<class PROCEDURE, class WORKITEM>
  __device__ int emitWorkItem(const DEvent& event, const WORKITEM& workitem, uint timeout = WorkItemDefaultTimeout, int start = WorkItemDefaultStart, int numThreads = WorkItemDefaultNumThreads)
  {
    const int result = LocalAggregation::addWorkItem<PROCEDURE, WORKITEM>(event, workitem, 1, timeout, start, numThreads);
    return result;
  }
  // Emits a work item with an attached parameter block for PROCEDURE at entry
  // point 1. Thin wrapper around
  // LocalAggregation::addWorkItemParams<PROCEDURE,WORKITEM,PARAMS>; timeout,
  // start and numThreads are passed through, and the wrapped call's result is
  // returned.
  template<class PROCEDURE, class WORKITEM, class PARAMS>
  __device__ int emitWorkItemParams(const DEvent& event, const WORKITEM& workitem, const PARAMS& param, uint timeout = WorkItemDefaultTimeout, int start = WorkItemDefaultStart, int numThreads = WorkItemDefaultNumThreads)
  {
    const int result = LocalAggregation::addWorkItemParams<PROCEDURE, WORKITEM, PARAMS>(event, workitem, param, 1, timeout, start, numThreads);
    return result;
  }
  // Emits a work item with an attached parameter block for PROCEDURE at entry
  // point 1. Thin wrapper around
  // LocalAggregation::addWorkItemEqualParams<PROCEDURE,WORKITEM,PARAMS>;
  // timeout, start and numThreads are passed through, and the wrapped call's
  // result is returned.
  template<class PROCEDURE, class WORKITEM, class PARAMS>
  __device__ int emitWorkItemEqualParams(const DEvent& event, const WORKITEM& workitem, const PARAMS& param, uint timeout = WorkItemDefaultTimeout, int start = WorkItemDefaultStart, int numThreads = WorkItemDefaultNumThreads)
  {
    const int result = LocalAggregation::addWorkItemEqualParams<PROCEDURE, WORKITEM, PARAMS>(event, workitem, param, 1, timeout, start, numThreads);
    return result;
  }
}

#endif
