/*
  Softshell: Dynamic Scheduling on GPUs.
  http://www.icg.tugraz.at/project/mvp

  Copyright (C) 2012 Institute for Computer Graphics and Vision,
                     Graz University of Technology

  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
              Bernhard Kainz - kainz ( at ) icg.tugraz.at
              Michael Kenzel - kenzel ( at ) icg.tugraz.at
              Stefan Hauswiesner - hauswiesner ( at ) icg.tugraz.at
              Bernhard Kerbl - kerbl ( at ) icg.tugraz.at
              Dieter Schmalstieg - schmalstieg ( at ) icg.tugraz.at

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
*/


/*
* file created by    Markus Steinberger / steinberger ( at ) icg.tugraz.at
*
* modifications by
*/


#ifndef SOFTSHELL_AGGREGATION_AGGREGATION_CUH_INCLUDED
#define SOFTSHELL_AGGREGATION_AGGREGATION_CUH_INCLUDED

#include "stdlib/unordered_map.cuh"
#include "stdlib/maintained_linked_list.cuh"
#include "stdlib/pair.cuh"
#include "api/workpackage.h"
#include "api/workitem.h"
#include "tools/common.cuh"
#include "timing/timesync.cuh"
#include "queue/dscheduler.cuh"
#include "distributor/deviceentries.h"
#include "megakernel/megakernel.h"
#include "communicator/dparams.cuh"

#ifndef USE_CUDA_HEAP
#include "memory/UseScatterAlloc.cuh"
#endif


namespace Softshell
{

  class Aggregation
  {
    friend class CombWorkpackageBase;
    friend class Visitor;
    friend class DParams;
    friend class LocalAggregation;

    // Key of the aggregator map. A bucket is identified by the combination of
    // event launch id, entry point and entry procedure.
    struct Identifier
    {
      volatile uint eventLaunchId;
      volatile uint entryPoint;
      volatile DeviceEntryProcedureFunc procedure;

      // Default construction leaves all three fields uninitialized.
      __device__ Identifier()
      {
      }

      // Builds a key from its three components.
      __device__ Identifier(uint launch, uint entry, DeviceEntryProcedureFunc proc)
        : eventLaunchId(launch),
          entryPoint(entry),
          procedure(proc)
      {
      }
    };
    // Hash policy handed to the aggregator map for Identifier keys: provides
    // key equality (same) and the hash value (compute).
    class IdentifierHash
    {
    public:
      // Returns true when all three components of both keys are equal.
      // Components are compared in the order launch id, entry point, procedure;
      // the comparison stops at the first mismatch.
      __device__ static bool same(volatile const Identifier& key0, const Identifier& key1)
      {
        if(key0.eventLaunchId != key1.eventLaunchId)
          return false;
        if(key0.entryPoint != key1.entryPoint)
          return false;
        return key0.procedure == key1.procedure;
      }

      // Combines the three key components into a hash value.
      __device__ static size_t compute(const Identifier& key)
      {
        // mix two shifted views of the procedure value (cast to PointerEquivalent)
        const size_t procShift8  = ((PointerEquivalent)(key.procedure) >> 8) & 0xFFFFFFFF;
        const size_t procShift24 = (PointerEquivalent)(key.procedure) >> 24;
        const size_t procAdjusted = procShift8 + procShift24;

        // the two id terms are combined in uint arithmetic before being widened
        const uint idTerms = key.eventLaunchId * 1489 + key.entryPoint * 3119;

        return idTerms + procAdjusted * 409;
      }
    };

    // State of one aggregation bucket, stored as the mapped value of the
    // aggregator map. Work packages of the bucket form a singly linked chain
    // (via their _next pointer) that runs from 'first' to 'second'.
    struct EntryPair
    {
      // Head of the chain. 0 means nothing has been linked yet; the value 1 is
      // written by Visitor::visit after it handed off the complete chain.
      CombWorkpackageBase* volatile first;
      // Tail of the chain; insert() swaps new packages in here.
      CombWorkpackageBase* volatile second;
      // Procedure passed to enqueueWorkpackage for packages of this bucket.
      volatile DeviceEntryProcedureFunc proc;
      // Thread count per work item; the visitor multiplies element counts by it.
      volatile uint threads;
      volatile uint eventLaunchId;
      volatile uint entryPoint;

      // Default construction leaves all fields uninitialized.
      __device__ EntryPair()
      {
      }

      // a / b: initial head / tail, p: procedure, eLaunchId / entrypoint: key
      // components mirrored into the entry, t: thread count per work item.
      __device__ EntryPair(CombWorkpackageBase* a, CombWorkpackageBase* b, DeviceEntryProcedureFunc p, uint eLaunchId, uint entrypoint, uint t)
        : first(a),
          second(b),
          proc(p),
          threads(t),
          eventLaunchId(eLaunchId),
          entryPoint(entrypoint)
      {
      }
    };
    // Map from bucket key (Identifier) to bucket state (EntryPair), hashed and
    // compared through IdentifierHash.
    // NOTE(review): 512 is presumably the table capacity - confirm against
    // std_gpu::_unordered_map.
    typedef std_gpu::_unordered_map<Identifier, EntryPair, 512, IdentifierHash> HashMap;
    // List of pointers to the EntryPairs living in the map; maintain() walks it
    // with a Visitor.
    typedef std_gpu::_maintained_linked_list<EntryPair*> LinkedList;

    // Functor handed to LinkedList::visit by Aggregation::maintain(). For each
    // bucket it walks the chain of pending work packages, cuts it into pieces
    // of at most maxT threads and enqueues those pieces. The remaining tail is
    // enqueued as well once it reaches minT threads, contains a timed-out
    // package, or the global queue is running low.
    class Visitor
    {
      // minT: thread count from which a trailing partial chain is emitted.
      // maxT: upper bound on the thread count of one emitted chain.
      uint minT, maxT;
      // Time stamp taken once at construction; package timeouts are compared
      // against it.
      clock64_t time;
      // Map the visited entries live in; needed to erase finished buckets.
      HashMap& aggregatorMap;
    public:
      __device__ Visitor(uint _min, uint _max, HashMap& _aggregatorMap) : minT(_min), maxT(_max), aggregatorMap(_aggregatorMap)
      {
        time = getTimeCycles();
      }
      // Processes one bucket.
      // Returns true only after the bucket has been erased from the map because
      // its event launch has no active work packages left; returns false in
      // every other case.
      // NOTE(review): the list presumably drops the entry when true is returned -
      // confirm against _maintained_linked_list::visit.
      __device__ bool visit(volatile EntryPair* pair)
      {
        //printf("visit called for %llx\n",pair);

        bool create = false;    // force emission of the trailing chain
        uint count = 0;         // threads accumulated in the chain [start..last]
        uint wpcount = 0;       // work packages accumulated in the chain [start..last]
        uint nthreads = max(pair->threads,1);
        CombWorkpackageBase  *last = 0, *current = pair->first;
        CombWorkpackageBase *start = current;
        //CombWorkpackageBase *end = pair->second;
        //CombWorkpackageBase *dummy = (CombWorkpackageBase*)(&pair->dummy);

        // first == 1 is the marker written further below after the complete
        // chain of this bucket has been handed off
        if((PointerEquivalent)current == 1)
        {
          //int c = pair->counter;
          //if(pair->lastcounter != c)
          //{
          //  printf("%llx current count: %d back: %llx\n",&pair,pair->counter,pair->second);
          //  pair->lastcounter = c;
          //}
          //this might be done...
          if(d_eventManager.getActiveWorkpackages(pair->eventLaunchId) == 0)
          {
            //printf("removing unused collector for launch %d: left: %d\n",pair->eventLaunchId,pair->counter);
            Identifier id(pair->eventLaunchId, pair->entryPoint, pair->proc);
            //aggregatorMap.erase(id,std_gpu::make_pair(Identifier(0,0,0), EntryPair(0,0,0,0,0,0)));
            aggregatorMap.erase(id);

            //this event is done
            return true;
          }
          return false;
        }

        //if there is no proper setup yet, don't touch it
        if(current == 0 || pair->second == 0)
          return false;

        //printf("there is something: %llx %llx\n",start, pair->second);

        //int mycoco = 0;
        // walk the chain until a _next pointer of 0 is reached
        while(current != 0)
        {
          //if(++mycoco > 10000)
          //{
          //  printf("endless loop in maintain!\nABORT\n");
          //  trap();
          //}
          // the lower 16 bits of _elements hold the element count of the package
          uint currentElements = (current->_elements & 0xFFFF)*nthreads;
          if(count + currentElements > maxT)
          {
            //printf("next one (%llx) would be full, so create one %llx->%llx for %d threads\n", current, start, last, count);
            //emit one
            // NOTE(review): this check only fires when the very first element is
            // too large; an oversized element later in the chain becomes the
            // start of the next piece unchecked - confirm whether that can occur.
            // NOTE(review): %d is used for uint arguments here.
            if(last == 0)
            {
              printf("ERROR in Global Aggregation: element counts too many workitems: %d > %d (max workitems)\nABORT!\n",currentElements,maxT);
              trap();
            }
            //atomicReplace(&last->_next,(CombWorkpackageBase*)0);
            // NOTE(review): setEnd() presumably terminates the chain at 'last' -
            // confirm against CombWorkpackageBase.
            last->setEnd();
            uint threads = count;
            // store the number of chained packages in the upper 16 bits of the head
            start->_elements = (wpcount << 16) | (start->_elements & 0xFFFF);
            __threadfence();
            //printf("enqueue A with wpcount: %d\n",wpcount);
            enqueueWorkpackage(start, pair->proc, threads);
            //d_eventManager.debugWorkItemCounterAdd(wpcount);
            //atomicSub((int*)&pair->counter,wpcount);

            // start a new piece with the current package as its head
            create = false;
            start = current;
            count = currentElements;
            wpcount = 1;
            last = current;
            current = current->_next;
          }
          //else if(currentElements == 0)
          //{
          //  printf("!!!!ouch: there is an entry with zero elements,%llx->%llx->%llx!?!!!!\n",last,current,current->_next);
          //  trap();
          //}
          else
          {
            ++wpcount;
            count += currentElements;
            //printf("%llx: timeout check: %lld < %lld\n", current, current->_timeout, time);
            // a timeout of 0 is skipped; any other value earlier than the
            // visitor's time stamp forces emission of the trailing chain
            if(current->_timeout != 0 && current->_timeout < time)
              create = true;
            last = current;
            current = current->_next;
            //printf("advancing: %llx -> %llx (count: %d)\n",last,current,count);
          }
        }

        //set start
        atomicReplace(&pair->first, start);

        //quick out if queue is about to be empty
        create |= (count > 0 && d_queue.fillLevel() < 64);

        //printf("after loop count: %d >= %d   ... %llx -> %llx -> %llx  create: %d\n", count, minT, pair->first,last,current,create);
        if((count >= minT || create) )
        {
          //take out the rest?
          // only succeeds if 'last' is still the tail, i.e. no insert() appended
          // a package in the meantime; on success the tail is reset to 0
          current = atomicSet(&pair->second, last, 0);
          if(current == last)
          {
            //printf("final create %llx->%llx for %d threads ... current: %llx\n", start, last, count, current);
            //last->setEnd();

            //set first free
            // CAS instead of exchange: an insert() that saw the tail at 0 may
            // already have stored a new head, which must not be overwritten
            atomicSet(&pair->first,start,(CombWorkpackageBase*)1);

            atomicReplace(&last->_next,(CombWorkpackageBase*)CombWorkpackageBase::DefiniteEnd);
            uint threads = count;
            // store the number of chained packages in the upper 16 bits of the head
            start->_elements = (wpcount << 16) | (start->_elements & 0xFFFF);
            __threadfence();
            enqueueWorkpackage(start, pair->proc, threads);

            //d_eventManager.debugWorkItemCounterAdd(wpcount);
            //atomicSub((int*)&pair->counter,wpcount);
            //if(old != initialstart)
            //  printf("enqueueing B with wpcount: %d and did not replace front(%llx)\n",wpcount,initialstart);
            //else
            // printf("enqueueing B with wpcount: %d and tried to replace front: %llx->%d\n",wpcount,initialstart,1);
            return false;
          }
        }


        __threadfence();
        return false;
      }
    };



    // Buckets keyed by (event launch id, entry point, procedure).
    HashMap aggregatorMap;
    // Pointers to the buckets stored in aggregatorMap; walked by maintain().
    LinkedList aggregatorList;

    // Upper bound on the thread count of one aggregated chain (see maintain()).
    uint maxAggregationSize;
    // Thread count from which a package or chain is enqueued right away.
    uint minAggregationSize;

    // Only referenced by the disabled debug code paths in insert()/maintain().
    CombWorkpackageBase* debug_other;
    DeviceEntryProcedureFunc debug_proc;

    // Compare-and-swap on a work package pointer slot: stores 'what' into
    // '*where' if the slot currently holds 'expected'. Returns the value the
    // slot held before the operation, as reported by atomicCAS.
    __device__ static CombWorkpackageBase* atomicSet(CombWorkpackageBase*volatile* where, CombWorkpackageBase* expected, CombWorkpackageBase* what)
    {
      PointerEquivalent* const slot = (PointerEquivalent*)(where);
      const PointerEquivalent compareTo = (PointerEquivalent)(expected);
      const PointerEquivalent replacement = (PointerEquivalent)(what);
      return (CombWorkpackageBase*) atomicCAS(slot, compareTo, replacement);
    }
    // Unconditional atomic exchange on a work package pointer slot: stores
    // 'what' into '*where' and returns the value the slot held before.
    __device__ static CombWorkpackageBase* atomicReplace(CombWorkpackageBase*volatile* where, CombWorkpackageBase* what)
    {
      PointerEquivalent* const slot = (PointerEquivalent*)(where);
      const PointerEquivalent replacement = (PointerEquivalent)(what);
      return (CombWorkpackageBase*) atomicExch(slot, replacement);
    }


    // Hands a work package to the aggregator.
    //   eventLaunchId, entryPoint, procedure: identify the bucket the package
    //                                         belongs to
    //   execthreads: thread count per work item
    //   wp:          the package to aggregate
    // A package that already needs at least minAggregationSize threads is
    // enqueued directly. Otherwise it is appended to the chain of its bucket
    // (creating the bucket on first use); maintain() emits the chain later.
    __device__ void insert(uint eventLaunchId, uint entryPoint,  DeviceEntryProcedureFunc procedure, uint execthreads, CombWorkpackageBase* wp)
    {
      __threadfence();

      //fast out
      uint tthreads = execthreads*wp->numWorkItems();
      if(tthreads >= *(const uint*)(&minAggregationSize))
      {
        // upper 16 bits: chain consists of exactly one package
        wp->_elements = (1 << 16) | (wp->_elements & 0xFFFF);
        __threadfence();
        enqueueWorkpackage(wp, procedure, tthreads);
        return;
      }

     // //debug
     // execthreads *= wp->numWorkItems();
     // if(execthreads == 0)
     //   printf("outch\n");
      //atomicOr((uint*)&wp->_elements,0x10000);
      //__threadfence();
     // enqueueWorkpackage(wp, procedure, execthreads);
     // return;


      ////debug
      //while(true)
      //{
      //  CombWorkpackageBase* old = (CombWorkpackageBase*)atomicCAS((unsigned long long*)(&debug_other), 0ULL, (unsigned long long)(wp));
      //  if(old == 0)
      //  {
      //    debug_proc = procedure;
      //    return;
      //  }
      //  CombWorkpackageBase* old2 = (CombWorkpackageBase*)atomicCAS((unsigned long long*)(&debug_other), (unsigned long long)(old), 0ULL);
      //  if(old2 == old)
      //  {
      //    execthreads = old->numWorkItems() + wp->numWorkItems();
      //    wp->push_back(old);
      //    //atomicExch((unsigned long long*)(&wp->_next),(unsigned long long)old);
      //    //printf("%d enqueue: %llx %llx  %d\n",blockIdx.x,wp,old,execthreads);
      //    wp->_elements = (wp->_elements&0xFFFF) | 0x20000;
      //    __threadfence();
      //    enqueueWorkpackage(wp, procedure, execthreads);
      //    //enqueueWorkpackage(old, procedure, old->numWorkItems());
      //    //enqueueWorkpackage(wp, procedure,  wp->numWorkItems());
      //    return;
      //  }
      //}
      ////


      //find entry
      Identifier id(eventLaunchId, entryPoint, procedure);
      HashMap::Iterator found = aggregatorMap.find(id);
      bool insertInList = false;
      if(found == aggregatorMap.end())
      {
        // no bucket yet: try to create one with empty head and tail
        // NOTE(review): inserted.second is presumably true only for the caller
        // whose insert actually created the entry - confirm against
        // std_gpu::_unordered_map::insert.
        std_gpu::pair<HashMap::Iterator,bool> inserted = aggregatorMap.insert(std_gpu::make_pair(id, EntryPair(0,0, procedure, eventLaunchId, entryPoint, execthreads)));
        found = inserted.first;
        //printf("inserted entry for %d %d %llx: %d %llx\n", eventLaunchId, entryPoint, procedure, found._pos,&found->second);
        insertInList = inserted.second;
      }
      //else
      //  printf("found entry for %d %d %llx: %d .. %llx\n", eventLaunchId, entryPoint, procedure, found._pos,found->second.dummy);

      //replace back
      volatile EntryPair& pair = found->second;

      // make wp the new tail of the chain and fetch the previous tail
      CombWorkpackageBase* old = atomicReplace(&pair.second, wp);
      if(old == 0)
      {
        // chain was empty: wp is the head as well
        atomicReplace(&pair.first,wp);
      }
      else
      {
        // link wp behind the previous tail
        atomicReplace(&old->_next,wp);
        //if(atomicReplace(&old->_next,wp) != 0)
        //{
        //  printf("outch set next on not null!\n");
        //  ::trap();
        //}
        //printf("setting myself (%llx) on %llx for %llx\n",wp,old,&pair);
      }

      // register a newly created bucket so that maintain() visits it
      if(insertInList == true)
        aggregatorList.push_back(&found->second);
    }

  public:
    // Sets up an empty aggregator.
    //   _minAggregationSize: thread count from which packages are enqueued
    //                        without being aggregated
    //   _maxAggregationSize: upper bound on the thread count of one chain
    __device__ Aggregation(uint _minAggregationSize, uint _maxAggregationSize)
      : maxAggregationSize(_maxAggregationSize),
        minAggregationSize(_minAggregationSize),
        debug_other(0)
    {
      // the containers need an explicit init() call
      aggregatorMap.init();
      aggregatorList.init();
    }
    // Stores a new upper bound for the thread count of one aggregated chain
    // and reports the stored value through d_params as
    // P_AggregationMaxNumThreads.
    __device__ void setMaxAggregationSize(uint numThreads)
    {
      this->maxAggregationSize = numThreads;
      d_params->changedParam(P_AggregationMaxNumThreads, this->maxAggregationSize);
    }
    // Stores a new thread count from which packages are enqueued right away
    // and reports the stored value through d_params as
    // P_AggregationMinNumThreads.
    __device__ void setMinAggregationSize(uint numThreads)
    {
      this->minAggregationSize = numThreads;
      d_params->changedParam(P_AggregationMinNumThreads, this->minAggregationSize);
    }

    // Typed front end of insert(): derives the entry procedure and the thread
    // count per work item from PROCEDURE and forwards to the untyped overload.
    template<class PROCEDURE>
    __device__ void insert(uint eventLaunchId, uint entryPoint,  CombWorkpackageBase* wp)
    {
      const uint threadsPerItem = PROCEDURE::ExecThreads;
      this->insert(eventLaunchId, entryPoint, EntryProcedure(PROCEDURE), threadsPerItem, wp);
    }
    // Runs one maintenance pass over all registered buckets: thread 0 of the
    // calling block walks aggregatorList with a Visitor, which enqueues chains
    // that are large enough or timed out and removes finished buckets.
    // All other threads of the block return without doing any list work.
    __device__ void maintain()
    {
      ////debug
      //CombWorkpackageBase* old = (CombWorkpackageBase*)atomicExch((unsigned long long*)(&debug_other), (unsigned long long)(0));
      //if(old != 0)
      //{
      //  old->_elements = (old->_elements&0xFFFF) | 0x10000;
      //  __threadfence();
      //  enqueueWorkpackage(old, debug_proc, old->numWorkItems());
      //}
      //return;
      ////

      // a chain may never need more threads than are currently running
      const uint tmax = min(maxAggregationSize, runningThreads);
      // keep the emission threshold 32 threads below the maximum; clamp at 0
      // because the unsigned subtraction would wrap around for tmax < 32 and
      // leave the threshold above the maximum
      const uint headroom = tmax > 32 ? tmax - 32 : 0u;
      const uint tmin = min(minAggregationSize, headroom);
      if(threadIdx.x == 0)
      {
        // passed to the list's visit(); initialized so no indeterminate value is handed over
        void* p = 0;
        //printf("starting aggregation for %llx with %d %d\n",&aggregatorList,tmin,tmax);
        Visitor v(tmin, tmax, aggregatorMap);
        aggregatorList.visit(v, p);
      }
    }
  };

  // Device-side pointer to an Aggregation object; only declared here, the
  // definition lives in another translation unit.
  extern __device__ Aggregation* d_aggregation;
  // Device-side default timeout for work items; declared here but not
  // referenced anywhere in this header.
  extern __device__ uint DefaultWorkItemTimeout;
}

#endif //SOFTSHELL_AGGREGATION_AGGREGATION_CUH_INCLUDED
