/*
  Softshell: Dynamic Scheduling on GPUs.
  http://www.icg.tugraz.at/project/mvp

  Copyright (C) 2012 Institute for Computer Graphics and Vision,
                     Graz University of Technology

  Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
              Bernhard Kainz - kainz ( at ) icg.tugraz.at
              Michael Kenzel - kenzel ( at ) icg.tugraz.at
              Stefan Hauswiesner - hauswiesner ( at ) icg.tugraz.at
              Bernhard Kerbl - kerbl ( at ) icg.tugraz.at
              Dieter Schmalstieg - schmalstieg ( at ) icg.tugraz.at

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  THE SOFTWARE.
*/


/*
* file created by    Markus Steinberger / steinberger ( at ) icg.tugraz.at
*
* modifications by
*/

#ifndef SOFTSHELL_API_WORKITEM_H_INCLUDED
#define SOFTSHELL_API_WORKITEM_H_INCLUDED

#include "workpackage.h"
#include "tools/common.cuh"
#ifdef __CUDACC__
#include "timing/timesync.cuh"
#endif

namespace Softshell
{
  // Device-side default timeout offset. The CombWorkpackage constructors add it
  // to getTimeCycles() to form their default `timeout` argument.
  // NOTE(review): presumably measured in clock cycles - confirm at the definition.
  extern __device__ uint DefaultWorkItemTimeout;


  class CombWorkpackageBase : public Workpackage
  {
    friend class Aggregation;
    friend class LocalAggregation;

  protected:
    //debug
    // NOTE(review): the members below are immediately re-opened as public;
    // presumably a temporary debugging aid - confirm before relying on it.
  public:
    //
    // Maximum number of elements the storage behind _pdata can hold.
    const uint _capacity;
    // Packed counter: the low 16 bits hold the number of stored elements, the
    // high 16 bits hold the value reported by numWorkPackages() (constructors
    // initialise it to 1).
    volatile uint _elements;
    // Size in bytes of one element.
    const uint _elementSize;
    // Size in bytes of the parameter block (0 when there is none).
    const uint _paramsSize;

    // Start of the element storage.
    unsigned char* _pdata;
    // Next package of the chain: 0 for an open tail, DefiniteEnd once setEnd()
    // has been called on this package.
    CombWorkpackageBase* volatile _next;
    // Start of the parameter block, or 0 when the package has no parameters.
    unsigned char* volatile _pparams;
    // Timeout value handed to the constructor; it is not interpreted in this file.
    clock64_t _timeout;

#ifdef __CUDACC__
    // Sentinel stored in _next by setEnd(). Chain walks stop at it and push_back()
    // refuses to append behind it.
    // NOTE(review): assumes PointerEquivalent is a 64-bit unsigned integer - confirm.
    static const PointerEquivalent DefiniteEnd = 0xFFFFFFFFFFFFFFFFULL;

    // Number of bytes from the start of this object up to the end of its element
    // storage (_pdata + _elementSize*_capacity).
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    __device__ uint structSize() const
    {
      if(isshared((void*)this))
        return reinterpret_cast<const LocalCombWorkpackage*>(this)->structSize();

      unsigned char* const storageEnd = _pdata + _elementSize*_capacity;
      unsigned char* const objectStart = (unsigned char*)(this);
      return storageEnd - objectStart;
    }
    // Copies this package into the raw buffer `data`.
    // The header is copy-constructed in place; _pdata and (if present) _pparams
    // of the copy are re-based so that they keep the same byte offset relative
    // to the copy as they have relative to this object. Element and parameter
    // contents are copied in whole uint words; only the currently stored
    // elements (low 16 bits of _elements) are copied.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    __device__ void paste(unsigned char* data) const
    {
      if(isshared((void*)this))
        return reinterpret_cast<const LocalCombWorkpackage*>(this)->paste(data);

      CombWorkpackageBase* copy = (CombWorkpackageBase*)(data);
      copy = new(copy) CombWorkpackageBase(*this);

      // element storage: same offset from the copy as from the original
      copy->_pdata = (unsigned char*)(copy) + (_pdata - (unsigned char*)(this));
      const uint stored = _elements&0xFFFF;
      uint word = 0;
      while(word < stored*_elementSize/sizeof(uint))
      {
        ((uint*)copy->_pdata)[word] = ((uint*)_pdata)[word];
        ++word;
      }

      if(_pparams == 0)
        return;

      // parameter block: same re-basing as for the element storage
      copy->_pparams = (unsigned char*)(copy) + (_pparams - (unsigned char*)(this));
      for(uint p = 0; p < _paramsSize/sizeof(uint); ++p)
        ((uint*)copy->_pparams)[p] = ((uint*)_pparams)[p];
    }

    // Finds the package of the chain starting here that holds global element
    // index i. On success i is rewritten to the index local to the returned
    // package; if the chain holds too few elements 0 is returned and i is left
    // untouched.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    __device__  volatile CombWorkpackageBase* getWpForThread(uint& i) volatile
    {
      if(isshared((void*)this))
        return reinterpret_cast<volatile LocalCombWorkpackage*>(this)->getWpForThread(i);

      uint skipped = 0;                          // elements in packages already passed
      volatile CombWorkpackageBase* node = this;
      uint held = node->_elements&0xFFFF;        // elements in the current package
      for(;;)
      {
        if(i < skipped + held)
        {
          i -= skipped;
          return node;
        }
        // reached the tail (open or closed) without finding the index
        if(node->_next == 0 || node->_next == (CombWorkpackageBase*) DefiniteEnd)
          return 0;
        skipped += held;
        node = node->_next;
        held = node->_elements&0xFFFF;
      }
    }
    // Read-only variant: finds the package of the chain starting here that holds
    // global element index i. On success i is rewritten to the index local to
    // the returned package; otherwise 0 is returned and i is left untouched.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    __device__  const volatile CombWorkpackageBase* getWpForThread(uint& i) const volatile
    {
      if(isshared((void*)this))
        return reinterpret_cast<const volatile LocalCombWorkpackage*>(this)->getWpForThread(i);

      uint skipped = 0;                          // elements in packages already passed
      volatile const CombWorkpackageBase* node = this;
      uint held = node->_elements&0xFFFF;        // elements in the current package
      for(;;)
      {
        if(skipped + held > i)
        {
          i -= skipped;
          return node;
        }
        // reached the tail (open or closed) without finding the index
        if(node->_next == 0 || node->_next == (CombWorkpackageBase*) DefiniteEnd)
          return 0;
        skipped += held;
        node = node->_next;
        held = node->_elements&0xFFFF;
      }
    }

    // Attaches `other` to the chain that starts at this package.
    //
    // The chain is walked node by node; for every node:
    //  - a differing element size aborts the operation (returns false);
    //  - if `other` has no successor, its parameter block is byte-identical to
    //    the node's and its elements fit into the node's remaining capacity,
    //    the elements are copied into the node, `other` is deleted and true is
    //    returned;
    //  - otherwise, if the node is the tail (_next is 0 or DefiniteEnd), `other`
    //    is linked in as its successor and true is returned.
    //
    // Only plain loads and stores are used here (no atomics).
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    //
    // Fixes over the previous version:
    //  - the walk now advances with merger->_next; it used this->_next, which
    //    never progressed past the second node and looped forever on chains of
    //    three or more packages;
    //  - the fit test uses the capacity of the node being merged into rather
    //    than the capacity of the chain head.
    __device__ bool combineWith(CombWorkpackageBase* other)
    {
      if(isshared((void*)this))
        return reinterpret_cast<LocalCombWorkpackage*>(this)->combineWith(other);
      CombWorkpackageBase* merger = this;
      while(merger != 0)
      {
        //sec check
        if(merger->_elementSize != other->_elementSize)
          return false;
        //can be merged in?
        uint melements = merger->_elements&0xFFFF;
        uint oelements = other->_elements&0xFFFF;
        if(other->_next == 0 &&
           melements + oelements < merger->_capacity &&
           merger->_paramsSize == other->_paramsSize)
        {
          bool equalParams = true;
          for(uint i = 0; i < merger->_paramsSize; ++i)
            if(merger->_pparams[i] != other->_pparams[i])
            {
              equalParams = false;
              break;
            }
          if(equalParams)
          {
            // append other's elements behind the ones already stored (word-wise copy)
            uint* dest = reinterpret_cast<uint*>(merger->_pdata + melements*merger->_elementSize);
            uint* src = reinterpret_cast<uint*>(other->_pdata);
            for(uint i = 0; i < oelements*merger->_elementSize/4; ++i)
              *dest++ = *src++;
            // NOTE(review): this adds other's full packed counter, i.e. also its
            // high 16 bits (package count) - confirm that this is intended.
            merger->_elements += other->_elements;
            delete other;
            return true;
          }
        }

        //append
        if(merger->_next == 0 || merger->_next ==  (CombWorkpackageBase*) DefiniteEnd)
        {
          merger->_next = other;
          return true;
        }
        //let the next take care of it
        merger = merger->_next;
      }
      return false;
    }
    // Appends `other` behind the last package of the chain using compare-and-swap
    // on the _next links.
    // Returns the package whose _next now points to `other`, or 0 if a link
    // holding DefiniteEnd was encountered (nothing is appended in that case).
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    __device__ CombWorkpackageBase* push_back(CombWorkpackageBase* other)
    {
      if(isshared((void*)this))
        return reinterpret_cast<LocalCombWorkpackage*>(this)->push_back(other);

      volatile CombWorkpackageBase* tail = this;
      for(;;)
      {
        // install `other` only if the link is still empty
        const PointerEquivalent seen = atomicCAS((PointerEquivalent*)(&tail->_next), 0, (PointerEquivalent)(other));
        if(seen == 0)
          return (CombWorkpackageBase*)tail;
        if(seen == DefiniteEnd)
          return 0;
        // link already taken: continue with the successor
        tail = (CombWorkpackageBase*)(seen);
      }
    }
    // Atomically replaces _next of this package with DefiniteEnd and returns the
    // link that was stored before (0 if there was no successor).
    __device__ CombWorkpackageBase* setEnd()
    {
      const PointerEquivalent previous = atomicExch((PointerEquivalent*)(&_next), DefiniteEnd);
      return (CombWorkpackageBase*)(previous);
    }
    // Returns a pointer to the i-th work item of the chain starting at this
    // package, or 0 if the chain does not hold that many items.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    template<class TWorkItem>
    __device__ volatile TWorkItem* getWorkItem(uint i) volatile
    {
      if(isshared((void*)this))
        return reinterpret_cast<volatile LocalCombWorkpackage*>(this)->getWorkItem<TWorkItem>(i);

      // getWpForThread turns i into an index local to the returned package
      volatile CombWorkpackageBase* const owner = getWpForThread(i);
      if(owner != 0)
        return reinterpret_cast<volatile TWorkItem*>(owner->_pdata) + i;
      return 0;
    }
    // Read-only variant: returns a pointer to the i-th work item of the chain
    // starting at this package, or 0 if the chain does not hold that many items.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    template<class TWorkItem>
    __device__ const volatile TWorkItem* getWorkItem(uint i) const volatile
    {
      if(isshared((void*)this))
        return reinterpret_cast<volatile const LocalCombWorkpackage*>(this)->getWorkItem<TWorkItem>(i);

      // getWpForThread turns i into an index local to the returned package
      volatile const CombWorkpackageBase* const owner = getWpForThread(i);
      if(owner != 0)
        return reinterpret_cast<volatile const TWorkItem*>(owner->_pdata) + i;
      return 0;
    }
    // Read-only variant: returns the parameter block of the package that holds
    // the i-th work item of the chain, or 0 if the chain does not hold that
    // many items.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    template<class TParams>
    __device__ volatile const TParams* getParams(uint i) const volatile
    {
      if(isshared((void*)this))
        return reinterpret_cast<volatile const LocalCombWorkpackage*>(this)->getParams<TParams>(i);

      volatile const CombWorkpackageBase* const owner = getWpForThread(i);
      if(owner != 0)
        return reinterpret_cast<const TParams*>(owner->_pparams);
      return 0;
    }
    // Returns the parameter block of the package that holds the i-th work item
    // of the chain, or 0 if the chain does not hold that many items.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    template<class TParams>
    __device__ volatile TParams* getParams(uint i) volatile
    {
      if(isshared((void*)this))
        return reinterpret_cast<volatile LocalCombWorkpackage*>(this)->getParams<TParams>(i);

      volatile CombWorkpackageBase* const owner = getWpForThread(i);
      if(owner != 0)
        return reinterpret_cast<volatile TParams*>(owner->_pparams);
      return 0;
    }

    // Appends a copy of `item` to this package (not to the chain) and returns a
    // pointer to the stored copy, or 0 if the package is already full.
    //
    // The slot is claimed with atomicAdd on _elements; if the claimed slot lies
    // beyond _capacity the increment is taken back with atomicSub.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    //
    // Fix over the previous version: the store was written as
    // `(TWorkItem*)(_pdata)[old] = item`, which subscripts the byte buffer
    // before applying the cast and therefore never wrote the item (and is
    // ill-formed once instantiated). The slot is now addressed through a typed
    // pointer, the same way addWorkItemUnsafe does it.
    template<class TWorkItem>
    __device__ TWorkItem* addWorkItem(const TWorkItem& item)
    {
      if(isshared((void*)this))
        return reinterpret_cast<LocalCombWorkpackage*>(this)->addWorkItem<TWorkItem>(item);
      uint old = atomicAdd((uint*)&_elements,1)&0xFFFF;
      if(old >= _capacity)
      {
        // no room: undo the reservation
        atomicSub((uint*)&_elements,1);
        return 0;
      }
      TWorkItem* slot = (TWorkItem*)(_pdata) + old;
      *slot = item;
      return slot;
    }
    // Appends a copy of `item` to this package without using atomic operations
    // and returns a pointer to the stored copy, or 0 if the package is full.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    template<class TWorkItem>
    __device__ TWorkItem* addWorkItemUnsafe(const TWorkItem& item)
    {
      if(isshared((void*)this))
        return reinterpret_cast<LocalCombWorkpackage*>(this)->addWorkItemUnsafe<TWorkItem>(item);

      const uint used = _elements & 0xFFFF;
      if(used < _capacity)
      {
        TWorkItem* const slot = (TWorkItem*)(_pdata) + used;
        *slot = item;
        ++_elements;
        return slot;
      }
      return 0;
    }
    // Overwrites the element count of this package with `num` while keeping the
    // high 16 bits of _elements. `num` is added unmasked.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    __device__ void setElements(uint num)
    {
      if(isshared((void*)this))
        return reinterpret_cast<LocalCombWorkpackage*>(this)->setElements(num);

      const uint packageBits = _elements & 0xFFFF0000;
      _elements = packageBits + num;
    }
    // Returns the start of this package's element storage as a TWorkItem array.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    template<class TWorkItem>
    __device__ TWorkItem* getWorkItemsPointer()
    {
      if(!isshared((void*)this))
        return (TWorkItem*)(_pdata);
      return reinterpret_cast<LocalCombWorkpackage*>(this)->getWorkItemsPointer<TWorkItem>();
    }
    // Appends up to `num` items from `items` to this package and returns how
    // many were actually stored (limited by the remaining capacity).
    // The element counter is updated with a plain read-modify-write, no atomics.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    //
    // Fix over the previous version: the store was written as
    // `(TWorkItem*)(_pdata)[telements+i] = items[i]`, which subscripts the byte
    // buffer before applying the cast and therefore never wrote the items (and
    // is ill-formed once instantiated). The destination is now addressed through
    // a typed pointer.
    template<class TWorkItem>
    __device__ uint addElements(uint num, const TWorkItem* items)
    {
      if(isshared((void*)this))
        return reinterpret_cast<LocalCombWorkpackage*>(this)->addElements<TWorkItem>(num, items);
      uint telements = _elements & 0xFFFF;
      // remaining room, clamped to [0, num]
      int canAdd = _capacity - telements;
      canAdd = min(num,max(0, canAdd));
      TWorkItem* dest = (TWorkItem*)(_pdata) + telements;
      for(int i = 0; i < canAdd; ++i)
        dest[i] = items[i];
      _elements += canAdd;
      return canAdd;
    }
#endif
  public:
    // Creates a package over caller-provided storage: `data` is the element
    // buffer for up to `capacity` items of type TWorkItem, `params` the
    // parameter block. `elements` items count as already stored; the high 16
    // bits of _elements start at one.
    template<class TWorkItem, class TParams>
    __device__ CombWorkpackageBase(DEvent const& event, uint capacity, TWorkItem* data, TParams* params, clock64_t timeout, uint elements = 0, uint state = 1 )
      : Workpackage(event, state),
        _capacity(capacity),
        _elements(elements | 0x00010000),
        _elementSize(sizeof(TWorkItem)),
        _paramsSize(sizeof(TParams)),
        _pdata(reinterpret_cast<unsigned char*>(data)),
        _next(0),
        _pparams(reinterpret_cast<unsigned char*>(params)),
        _timeout(timeout)
    {
    }

    // Untyped variant: element and parameter storage are given as raw byte
    // pointers together with their sizes in bytes. `elements` items count as
    // already stored; the high 16 bits of _elements start at one.
    __device__ CombWorkpackageBase(DEvent const& event, uint capacity, unsigned char* data, uint elementSize, unsigned char* params, uint paramSize, clock64_t timeout, uint elements = 0, uint state = 1 )
      : Workpackage(event, state),
        _capacity(capacity),
        _elements(elements | 0x00010000),
        _elementSize(elementSize),
        _paramsSize(paramSize),
        _pdata(data),
        _next(0),
        _pparams(params),
        _timeout(timeout)
    {
    }

    // Creates a package without a parameter block over caller-provided element
    // storage `data` for up to `capacity` items of type TWorkItem. `elements`
    // items count as already stored; the high 16 bits of _elements start at one.
    template<class TWorkItem>
    __device__ CombWorkpackageBase(DEvent const& event, uint capacity, TWorkItem* data, clock64_t timeout, uint elements = 0, uint state = 1 )
      : Workpackage(event, state),
        _capacity(capacity),
        _elements(elements | 0x00010000),
        _elementSize(sizeof(TWorkItem)),
        _paramsSize(0),
        _pdata(reinterpret_cast<unsigned char*>(data)),
        _next(0),
        _pparams(0),
        _timeout(timeout)
    {
    }

    // Creates an empty package: no storage, no parameters, zero capacity and a
    // zero timeout. The high 16 bits of _elements start at one.
    __device__ CombWorkpackageBase(DEvent const& event)
      : Workpackage(event, 1),
        _capacity(0),
        _elements(0x00010000),
        _elementSize(0),
        _paramsSize(0),
        _pdata(0),
        _next(0),
        _pparams(0),
        _timeout(0)
    {
    }

#ifdef __CUDACC__
    // Deletes the successor package (and thereby, recursively, the rest of the
    // chain) unless this object is located in shared memory or has no real
    // successor (_next is 0 or DefiniteEnd).
    __device__ ~CombWorkpackageBase()
    {
      if(isshared((void*)this))
        return;
      if(_next != 0 && _next != (CombWorkpackageBase*)DefiniteEnd)
        delete _next;
    }
    // Sums the element counts (low 16 bits of _elements) of this package and of
    // every package reachable through _next, stopping at 0 or DefiniteEnd.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    __device__ uint numWorkItems() volatile
    {
      if(isshared((void*)this))
        return reinterpret_cast<volatile LocalCombWorkpackage*>(this)->numWorkItems();

      uint total = _elements & 0xFFFF;
      CombWorkpackageBase* node = _next;
      while(node != 0 && node != (CombWorkpackageBase*)DefiniteEnd)
      {
        total += node->_elements & 0xFFFF;
        node = node->_next;
      }
      return total;
    }
    // Returns the package count kept in the high 16 bits of _elements.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    //
    // A formerly present (disabled) debug check walked the _next chain, counted
    // the packages and trapped with
    //   "wp count does not match!: %d vs %d\n"
    // when that count differed from the stored value.
    __device__ uint numWorkPackages() volatile
    {
      if(isshared((void*)this))
        return reinterpret_cast<volatile LocalCombWorkpackage*>(this)->numWorkPackages();

      const uint packed = _elements;
      return packed >> 16;
    }
    // Releases the whole chain starting at this package with free(), one package
    // after the other; no destructors are invoked on this path.
    // Objects located in shared memory are handed to
    // LocalCombWorkpackage::destroy together with sizeof(CombWorkpackageBase).
    __device__ void destroy() volatile
    {
      if(isshared((void*)this))
      {
        reinterpret_cast<volatile LocalCombWorkpackage*>(this)->destroy(sizeof(CombWorkpackageBase));
        return;
      }

      volatile CombWorkpackageBase* node = this;
      while(node != 0 && node != (volatile CombWorkpackageBase*)DefiniteEnd)
      {
        // read the link before the memory it lives in is released
        volatile CombWorkpackageBase* const doomed = node;
        node = node->_next;
        free((void*)(doomed));
      }
    }

    // Atomically reserves up to `num` consecutive element slots in this package.
    //
    // offset  receives the element count observed before the reservation, i.e.
    //         the index of the first reserved slot.
    // returns the number of slots actually reserved: `num` if everything fits,
    //         fewer if the package runs full, 0 if it was already full.
    //
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    //
    // Fix over the previous version: when a concurrent reservation had already
    // pushed the counter to or beyond _capacity, `offset + num - _capacity` was
    // larger than `num`, so more was subtracted than this call had added, and
    // `_capacity - offset` underflowed into a huge return value. That case now
    // takes back exactly `num` and reports 0 reserved slots.
    __device__ uint reserveElements(uint num, uint& offset)
    {
      if(isshared((void*)this))
        return reinterpret_cast<LocalCombWorkpackage*>(this)->reserveElements(num, offset);
      offset = atomicAdd((uint*)&_elements, num) & 0xFFFF;
      if(offset >= _capacity)
      {
        // nothing left for us: undo the complete request
        atomicSub((uint*)&_elements, num);
        return 0;
      }
      const uint available = _capacity - offset;
      if(num > available)
      {
        // partial fit: give back the part that does not fit
        atomicSub((uint*)&_elements, num - available);
        return available;
      }
      return num;
    }
    // Checks whether the parameter block of this package equals *param.
    // Returns false if the stored parameter size differs from sizeof(Param);
    // otherwise both blocks are compared in whole uint words.
    // Objects located in shared memory are forwarded to LocalCombWorkpackage.
    template<class Param>
    __device__ bool compareParam(const Param* param) const
    {
      if(isshared((void*)this))
        return reinterpret_cast<const LocalCombWorkpackage*>(this)->compareParam(param);
      if(_paramsSize != sizeof(Param))
        return false;

      const uint* theirs = (const uint*)(param);
      const uint* ours = (const uint*)(_pparams);
      const uint words = sizeof(Param)/sizeof(uint);
      for(uint w = 0; w < words; ++w)
      {
        if(theirs[w] != ours[w])
          return false;
      }
      return true;
    }
#endif
  };

  // Plain-data counterpart of CombWorkpackageBase: it declares the same fields
  // in the same order, but without const/volatile qualifiers and without any
  // member functions.
  // NOTE(review): presumably used where only the memory layout/size of
  // CombWorkpackageBase is needed - confirm against the users of this type.
  // The field order must stay in sync with CombWorkpackageBase.
  class CombWorkpackageBaseEquivalent : public WorkpackageEquivalent
  {
  public:
    uint _capacity;              // mirrors CombWorkpackageBase::_capacity
    uint _elements;              // mirrors CombWorkpackageBase::_elements
    uint _elementSize;           // mirrors CombWorkpackageBase::_elementSize
    uint _paramsSize;            // mirrors CombWorkpackageBase::_paramsSize
    unsigned char* _pdata;       // mirrors CombWorkpackageBase::_pdata
    CombWorkpackageBase* _next;  // mirrors CombWorkpackageBase::_next
    unsigned char* _pparams;     // mirrors CombWorkpackageBase::_pparams
    clock64_t _timeout;          // mirrors CombWorkpackageBase::_timeout
  };

  // Typed facade over CombWorkpackageBase: fixes the work item type once so that
  // the element accessors no longer need an explicit template argument.
  // It adds no data members of its own.
  //
  // Fix over the previous version: both operator[] overloads returned the
  // volatile-qualified result of getWorkItem() as a non-volatile pointer, which
  // is ill-formed as soon as the operator is instantiated. The qualifier is now
  // removed explicitly with const_cast, matching the declared return types.
  template<class TWorkItem>
  class CombWorkpackageTyped : public CombWorkpackageBase
  {
  public:
    // Package with a parameter block; all arguments are passed on to
    // CombWorkpackageBase unchanged.
    template<class TParams>
    __device__ CombWorkpackageTyped(DEvent const& event, uint capacity, TWorkItem* data, TParams* params, clock64_t timeout, uint elements = 0, uint state = 1 ) : CombWorkpackageBase(event, capacity, data, params, timeout, elements, state) { }
    // Package without a parameter block; all arguments are passed on to
    // CombWorkpackageBase unchanged.
    __device__ CombWorkpackageTyped(DEvent const& event, uint capacity, TWorkItem* data, clock64_t timeout, uint elements = 0, uint state = 1) : CombWorkpackageBase(event, capacity, data, timeout, elements, state) { }

#ifdef __CUDACC__
    // i-th work item of the chain, or 0 if the chain holds fewer items.
    __device__ TWorkItem* operator[] (uint i) { return const_cast<TWorkItem*>(getWorkItem(i)); }
    // Read-only access to the i-th work item of the chain, or 0 if the chain
    // holds fewer items.
    __device__ const TWorkItem* operator[] (uint i) const { return const_cast<const TWorkItem*>(getWorkItem(i)); }
    // See CombWorkpackageBase::getWorkItem.
    __device__ volatile TWorkItem* getWorkItem(uint i) volatile
    {
      return CombWorkpackageBase::getWorkItem<TWorkItem>(i);
    }
    // See CombWorkpackageBase::getWorkItem (read-only variant).
    __device__ volatile const TWorkItem* getWorkItem(uint i) const volatile
    {
      return CombWorkpackageBase::getWorkItem<TWorkItem>(i);
    }
    // See CombWorkpackageBase::addWorkItem.
    __device__ TWorkItem* addWorkItem(const TWorkItem& item)
    {
      return CombWorkpackageBase::addWorkItem<TWorkItem>(item);
    }
    // See CombWorkpackageBase::addWorkItemUnsafe.
    __device__ TWorkItem* addWorkItemUnsafe(const TWorkItem& item)
    {
      return CombWorkpackageBase::addWorkItemUnsafe<TWorkItem>(item);
    }
    // See CombWorkpackageBase::setElements.
    __device__ void setElements(uint num)
    {
      return CombWorkpackageBase::setElements(num);
    }
    // See CombWorkpackageBase::getWorkItemsPointer.
    __device__ TWorkItem* getWorkItemsPointer()
    {
      return CombWorkpackageBase::getWorkItemsPointer<TWorkItem>();
    }
    // See CombWorkpackageBase::addElements.
    __device__ uint addElements(uint num, const TWorkItem* items)
    {
      return CombWorkpackageBase::addElements<TWorkItem>(num,items);
    }
#endif
  };

  // Typed package that stores its parameter block inside the package object.
  // The base class receives the address of the embedded _params member, which is
  // then copy-initialised from `initParams`.
  template<class TWorkItem, class TParams>
  class CombWorkpackageParams : public CombWorkpackageTyped<TWorkItem>
  {
    TParams _params;

  public:
    __device__ CombWorkpackageParams(DEvent const& event, uint capacity, TWorkItem* data, const TParams& initParams, clock64_t timeout, uint elements, uint state)
      : CombWorkpackageTyped<TWorkItem>(event, capacity, data, &_params, timeout, elements, state),
        _params(initParams)
    {
    }

#ifdef __CUDACC__
    // Read-only parameters of the package holding the i-th work item of the
    // chain; see CombWorkpackageBase::getParams.
    __device__ volatile const TParams* getParams(uint i) const  volatile
    {
      return CombWorkpackageBase::getParams<TParams>(i);
    }

    // Parameters of the package holding the i-th work item of the chain;
    // see CombWorkpackageBase::getParams.
    __device__ volatile TParams* getParams(uint i) volatile
    {
      return CombWorkpackageBase::getParams<TParams>(i);
    }
#endif
  };

  // Self-contained package: embeds storage for TCapacity work items as well as
  // a parameter block of type TParams.
  // The default timeout is the current getTimeCycles() value plus
  // DefaultWorkItemTimeout.
  template<class TWorkItem, class TParams = void, uint TCapacity = 256>
  class CombWorkpackage : public CombWorkpackageParams<TWorkItem, TParams>
  {
  public:
    // Number of work items the embedded storage can hold.
    static const uint Capacity = TCapacity;

  protected:
    // Embedded element storage handed to the base class.
    TWorkItem data[TCapacity];

  public:
    __device__ CombWorkpackage(DEvent const& event, const TParams& params = TParams(), clock64_t timeout = getTimeCycles() + DefaultWorkItemTimeout)
      : CombWorkpackageParams<TWorkItem, TParams>(event, TCapacity, data, params, timeout,0,1)
    {
    }
  };

  // Specialisation for packages without parameters (TParams = void): embeds
  // storage for TCapacity work items only.
  // The default timeout is the current getTimeCycles() value plus
  // DefaultWorkItemTimeout.
  template<class TWorkItem, uint TCapacity>
  class CombWorkpackage<TWorkItem, void, TCapacity> : public CombWorkpackageTyped<TWorkItem>
  {
  public:
    // Number of work items the embedded storage can hold.
    static const uint Capacity = TCapacity;

  protected:
    // Embedded element storage handed to the base class.
    TWorkItem data[TCapacity];

  public:
    __device__  CombWorkpackage(DEvent const& event, clock64_t timeout = getTimeCycles() + DefaultWorkItemTimeout)
      : CombWorkpackageTyped<TWorkItem>(event, TCapacity, data, timeout,0,1)
    {
    }
  };

}

#endif //SOFTSHELL_API_WORKITEM_H_INCLUDED
