/*****************************************************************
* Unipro UGENE - Integrated Bioinformatics Suite
* Copyright (C) 2008,2009 Unipro, Russia (http://ugene.unipro.ru)
* All Rights Reserved
* 
*     This source code is distributed under the terms of the
*     GNU General Public License. See the files COPYING and LICENSE
*     for details.
*****************************************************************/

#ifdef SW2_BUILD_WITH_ATISTREAM

#include "SmithWatermanAlgorithmATISTREAM.h"

//TODO: BUG-817 Enable multi-gpu support for ATI gpus
//KLUDGE: Dirty hack to remove __declspec qualifier from one of the internal objects,
//        generated by brcc
#ifdef Q_OS_WIN
#define __declspec(x) 
#endif
#include "brookgenfiles/sw_atistream.h"
#ifdef Q_OS_WIN
#undef __declspec
#endif
#include "PairAlignSequences.h"

#include <core_api/Log.h>
#include <core_api/AppResources.h>

#ifndef Q_OS_WIN
    #include <malloc.h>
#endif
#include <time.h>

namespace GB2 {

    // File-local logger bound to the ULOG_CAT_SW log category; the functions
    // in this translation unit report their progress through it.
    static LogCategory log(ULOG_CAT_SW);

    /**
     * Runs the Smith-Waterman search of _patternSeq against _searchSeq on an
     * ATI Stream (Brook+) device and appends every hit whose score reaches the
     * minimal score to pairAlignmentStrings.
     *
     * @param _substitutionMatrix  scoring matrix; its alphabet defines the rows of the query profile
     * @param _patternSeq          query (pattern) sequence
     * @param _searchSeq           sequence that is searched
     * @param _gapOpen             gap open value, handed to the kernel negated
     * @param _gapExtension        gap extension value, handed to the kernel negated
     * @param _minScore            minimal score a hit must reach to be reported
     *
     * All arguments are first stored through setValues(); the members used
     * below (substitutionMatrix, gapOpen, gapExtension, minScore) are presumably
     * the copies made by that call.
     */
    void SmithWatermanAlgorithmATISTREAM::launch(SubstMatrix const * _substitutionMatrix, QByteArray const & _patternSeq, QByteArray const & _searchSeq, int _gapOpen, int _gapExtension, int _minScore) {

        log.info("START SmithWatermanAlgorithmATISTREAM::launch");

        setValues(_substitutionMatrix, _patternSeq, _searchSeq, _gapOpen, _gapExtension, _minScore);

        const int qLen = _patternSeq.size();
        const int sLen = _searchSeq.size();

        // Nothing can be aligned against an empty sequence: bail out before
        // zero-sized host buffers and device streams get created.
        if (qLen <= 0 || sLen <= 0) {
            log.info("FINISH SmithWatermanAlgorithmATISTREAM::launch");
            return;
        }

        const int subLen = substitutionMatrix->getAlphabet()->getNumAlphabetChars();

        // The profile is addressed by raw character code (row = ch, column = j),
        // so it is sized for codes up to 'Z' per alphabet symbol rather than
        // for subLen rows only.
        const int profLen = subLen * (qLen + 1) * 'Z';

        ScoreType * queryProfile = NULL;
#ifdef Q_OS_WIN
        queryProfile = (ScoreType *) _aligned_malloc(profLen * sizeof (ScoreType), 256);
#else
        //TODO: allocate with 256-byte alignment like the Windows branch does.
        //      NOTE: memalign(alignment, size) and posix_memalign(&ptr, alignment, size)
        //      take the alignment FIRST; the earlier attempts here passed the byte
        //      count first, which is the likely reason they corrupted the heap.
        queryProfile = new ScoreType[profLen];
#endif

        for (int i = 0; i < profLen; i++) {
            queryProfile[i] = 0;
        }

        //calculate query profile: row <alphabet char>, column <pattern position>
        for (int i = 0; i < subLen; i++) {
            // The alphabet character depends only on i, so look it up once per row.
            const char ch = _substitutionMatrix->getAlphabet()->getAlphabetChars()[i];
            for (int j = 0; j < qLen; j++) {
                queryProfile[ch * qLen + j] =
                    _substitutionMatrix->getScore(ch, _patternSeq.at(j));
            }
        }

        //using two-dimensional for increasing size, maximum 1024*1024*64 byte
        // NOTE(review): sHeight * sWeight is smaller than sLen once sLen >= 8192
        // (e.g. sLen == 8192 gives a 1x1 stream) - confirm against the kernel's
        // indexing or make sure callers never pass such long chunks.
        const int sWeight = (sLen + 8191) / 8192;
        const int sHeight = sLen % 8192 + 1;

        // Explicit casts: brace-initialising unsigned int from a non-constant
        // int is a narrowing conversion and rejected by C++11 and later.
        unsigned int seqLibDim[] = {static_cast<unsigned int>(sHeight), static_cast<unsigned int>(sWeight)};
        unsigned int queryProfDim[] = {static_cast<unsigned int>(qLen), static_cast<unsigned int>(subLen * 'Z')};
        unsigned int queryDim[] = {static_cast<unsigned int>(qLen)};
        unsigned int isActualDim[] = {1};

        ScoreType * HdataArr = NULL;
        ScoreType * directionArr = NULL;
        ScoreType * isActual = NULL;
#ifdef Q_OS_WIN
        HdataArr = (ScoreType *) _aligned_malloc(qLen * sizeof (ScoreType), 256);
        directionArr = (ScoreType *) _aligned_malloc(qLen * sizeof (ScoreType), 256);
        isActual = (ScoreType *) _aligned_malloc(1 * sizeof (ScoreType), 256);
#else
        //TODO: same alignment issue as for queryProfile above
        //      (memalign / posix_memalign expect the alignment before the size).
        HdataArr = new ScoreType[qLen];
        directionArr = new ScoreType[qLen];
        isActual = new ScoreType[1];
#endif

        for (int i = 0; i < qLen; i++) {
            HdataArr[i] = 0;
            directionArr[i] = 0;
        }
        // The flag is uploaded once before the main loop; give it a defined
        // value instead of sending uninitialised memory to the device.
        isActual[0] = 0;

        //declare arrays on device and allocate memory
        ::brook::Stream<char> d_seqLib(2, seqLibDim);
        ::brook::Stream<int> d_queryProf(2, queryProfDim);
        ::brook::Stream<int> * d_HdataCur = new ::brook::Stream<int>(1, queryDim);
        ::brook::Stream<int> * d_HdataPrev = new ::brook::Stream<int>(1, queryDim);
        ::brook::Stream<int> * d_HdataRec = new ::brook::Stream<int>(1, queryDim);
        ::brook::Stream<int> * d_FdataCur = new ::brook::Stream<int>(1, queryDim);
        ::brook::Stream<int> * d_FdataRec = new ::brook::Stream<int>(1, queryDim);
        ::brook::Stream<int> * d_EdataRec = new ::brook::Stream<int>(1, queryDim);
        ::brook::Stream<int> * d_EdataCur = new ::brook::Stream<int>(1, queryDim);
        ::brook::Stream<int> * d_directionsCur = new ::brook::Stream<int>(1, queryDim);
        ::brook::Stream<int> * d_directionsPrev = new ::brook::Stream<int>(1, queryDim);
        ::brook::Stream<int> * d_directionsRec = new ::brook::Stream<int>(1, queryDim);
        ::brook::Stream<int> d_isActual(1, isActualDim);

        //copy from host to device; HdataArr is all zeros here, so every
        //per-pattern stream starts zero-filled
        d_seqLib.read(_searchSeq.constData());
        d_queryProf.read(queryProfile);
        d_HdataCur->read(HdataArr);
        d_HdataPrev->read(HdataArr);
        d_HdataRec->read(HdataArr);
        d_FdataCur->read(HdataArr);
        d_FdataRec->read(HdataArr);
        d_EdataRec->read(HdataArr);
        d_EdataCur->read(HdataArr);
        d_directionsCur->read(HdataArr);
        d_directionsPrev->read(HdataArr);
        d_directionsRec->read(HdataArr);
        d_isActual.read(isActual);

        // start main loop: one kernel launch per step, sLen + qLen - 1 steps in total
        for (int i = 0; i < (sLen + qLen - 1); i++) {
            // isActual indicates that the result vectors of this step contain actual results;
            // reset it on the device before every launch
            isActual[0] = 0;
            d_isActual.read(isActual);
            clock_t clocks = clock();
            calculateMatrixOnGPU(d_seqLib, sLen, d_queryProf, qLen, (-1) * gapOpen, (-1) * gapExtension,
                *d_HdataPrev, *d_HdataCur, *d_HdataRec, *d_EdataCur,
                *d_EdataRec, *d_FdataCur, *d_FdataRec, *d_directionsPrev, *d_directionsCur,
                *d_directionsRec, minScore, d_isActual, i);

            clocks = clock() - clocks;
            log.info(QString("Kernel time: %1").arg(QString::number(double(clocks)/CLOCKS_PER_SEC)));

            d_isActual.write(isActual);
            if (isActual[0] == 1) {
                //Copy vectors to the host and pick the actual results
                d_HdataRec->write(HdataArr);
                d_directionsRec->write(directionArr);

                for (int j = 0; j < qLen; j++) {
                    // (i - j) is the end position in the search sequence for pattern
                    // position j at this step; it has to lie inside the sequence.
                    if (HdataArr[j] >= minScore && i - j < sLen) {
                        PairAlignSequences res;
                        res.intervalSeq1.startPos = directionArr[j];
                        res.intervalSeq1.len = i - j - res.intervalSeq1.startPos + 1;
                        res.score = HdataArr[j];
                        pairAlignmentStrings.append(res);
                    }
                }
            }

            //rotate the H and direction streams: Cur <- Prev <- Rec <- (old) Cur
            ::brook::Stream<int> * d_spare = d_HdataCur;
            d_HdataCur = d_HdataPrev;
            d_HdataPrev = d_HdataRec;
            d_HdataRec = d_spare;

            d_spare = d_directionsCur;
            d_directionsCur = d_directionsPrev;
            d_directionsPrev = d_directionsRec;
            d_directionsRec = d_spare;

            //E and F only alternate between their two streams
            d_spare = d_EdataCur;
            d_EdataCur = d_EdataRec;
            d_EdataRec = d_spare;

            d_spare = d_FdataCur;
            d_FdataCur = d_FdataRec;
            d_FdataRec = d_spare;
        }

        //free device streams
        delete d_HdataCur;
        delete d_HdataPrev;
        delete d_HdataRec;
        delete d_FdataCur;
        delete d_FdataRec;
        delete d_EdataRec;
        delete d_EdataCur;
        delete d_directionsCur;
        delete d_directionsPrev;
        delete d_directionsRec;

        //free host buffers with the deallocator matching their allocator
#ifdef Q_OS_WIN
        _aligned_free(HdataArr);
        _aligned_free(directionArr);
        _aligned_free(isActual);
        _aligned_free(queryProfile);
#else
        delete[] HdataArr;
        delete[] directionArr;
        delete[] isActual;
        delete[] queryProfile;
#endif

        log.info("FINISH SmithWatermanAlgorithmATISTREAM::launch");
    }

/**
 * Estimates how many bytes of GPU memory launch() needs for the given input.
 *
 * The estimate is the sum of:
 *  - one byte per search sequence character,
 *  - ten int vectors of pattern length (launch() creates ten such streams),
 *  - the query profile: pattern length x ('Z' * alphabet size) ints,
 *  - one int for the "is actual" flag.
 *
 * @param substitutionMatrix  scoring matrix; only its alphabet size is used
 * @param patternSeq          query (pattern) sequence
 * @param searchSeq           sequence that is searched
 * @return estimated number of bytes
 */
quint64 SmithWatermanAlgorithmATISTREAM::estimateNeededGpuMemory( const SubstMatrix * substitutionMatrix, QByteArray const & patternSeq, QByteArray const & searchSeq )
{
    // Widen every operand to quint64 before multiplying: the product
    // size * 'Z' * subLen overflows int for long patterns when evaluated in int.
    const quint64 subLen = static_cast<quint64>(substitutionMatrix->getAlphabet()->getNumAlphabetChars());
    const quint64 patternLen = static_cast<quint64>(patternSeq.size());
    const quint64 searchLen = static_cast<quint64>(searchSeq.size());
    const quint64 intSize = static_cast<quint64>(sizeof(int));

    quint64 neededMem = 0;
    neededMem += searchLen;
    neededMem += patternLen * 10 * intSize;
    neededMem += patternLen * static_cast<quint64>('Z') * subLen * intSize;
    neededMem += 1 * intSize;

    return neededMem;
}
} //namespace

#endif //SW2_BUILD_WITH_ATISTREAM
