/*****************************************************************
* Unipro UGENE - Integrated Bioinformatics Suite
* Copyright (C) 2008,2009 Unipro, Russia (http://ugene.unipro.ru)
* All Rights Reserved
* 
*     This source code is distributed under the terms of the
*     GNU General Public License. See the files COPYING and LICENSE
*     for details.
*****************************************************************/

#include "EMBLPlainTextFormat.h"
#include "GenbankLocationParser.h"
#include "DocumentFormatUtils.h"

#include <gobjects/AnnotationTableObject.h>
#include <gobjects/DNASequenceObject.h>
#include <gobjects/GObjectUtils.h>

#include <core_api/IOAdapter.h>
#include <core_api/Task.h>
#include <core_api/DNAAlphabet.h>
#include <datatype/DNAInfo.h>
#include <workflow_support/QVariantUtils.h>
#include <util_text/TextUtils.h>

namespace GB2 {

/* TRANSLATOR GB2::EMBLPlainTextFormat */    
/* TRANSLATOR GB2::EMBLGenbankAbstractDocument */ 

/**
 * Sets up the EMBL plain-text document format.
 *
 * Besides forwarding the format id, display name and flags to the base class,
 * the constructor configures:
 *  - the line keys used for the sequence section ("SQ") and feature lines ("FT"),
 *  - the file extensions associated with the format,
 *  - the mapping from EMBL two-letter line codes to DNAInfo tag names.
 *
 * @param p parent object, passed through to the base class
 */
EMBLPlainTextFormat::EMBLPlainTextFormat(QObject* p) 
: EMBLGenbankAbstractDocument(BaseDocumentFormats::PLAIN_EMBL, tr("EMBL"), 80, DocumentFormatFlag_SupportStreaming, p) 
{
    // Line keys of the feature table and of the sequence section.
    fPrefix = "FT";
    sequenceStartPrefix = "SQ";

    // File name extensions handled by this format.
    fileExtensions << "em" << "emb" << "embl";

    // EMBL line code -> DNAInfo tag name.
    tagMap["CC"] = DNAInfo::COMMENT;
    tagMap["CO"] = DNAInfo::CONTIG;
    tagMap["DE"] = DNAInfo::DEFINITION;
    tagMap["DT"] = DNAInfo::DATE;
    tagMap["KW"] = DNAInfo::KEYWORDS;
    tagMap["PR"] = DNAInfo::PROJECT;
}

/**
 * Quick heuristic that decides whether a raw data block looks like an EMBL file.
 *
 * The data is rejected when it contains any character from TextUtils::BINARY
 * or when it is shorter than 100 bytes. Otherwise the result is that of
 * comparing the beginning of the data with the "ID   " line prefix.
 *
 * @param rawData leading bytes of the file being probed
 * @return true if the data passes the checks above
 */
bool EMBLPlainTextFormat::checkRawData(const QByteArray& rawData) const {
    //TODO: improve format checking

    const int length = rawData.size();
    const char* bytes = rawData.constData();

    // Binary content can never be a plain-text EMBL file.
    if (TextUtils::contains(TextUtils::BINARY, bytes, length)) {
        return false;
    }
    // Too little data to make a decision.
    if (length < 100) {
        return false;
    }
    return TextUtils::equals("ID   ", bytes, 5);
}

//////////////////////////////////////////////////////////////////////////
// loading

/**
 * Parses the "ID" line currently held by the parser state.
 *
 * On success the entry name, the DNAInfo::LOCUS tag and, when present, the
 * DNAInfo::VERSION tag and the sequence length are stored in s->entry.
 * When the line does not consist of exactly seven ';'-separated fields the
 * raw line value is additionally stored under DNAInfo::EMBL_ID.
 *
 * The line is split on ';'. EMBL separates the fields with "; ", so every
 * field after the first one starts with a blank; those fields are trimmed
 * before they are inspected or stored. Without the trimming the "SV " test
 * never matched and topology/molecule/division kept a leading space.
 *
 * @param s parser state positioned on the candidate ID line
 * @return true if the line was parsed; false if the current line is not an
 *         ID line or is too short (an error is set on s->si in both cases)
 */
bool EMBLPlainTextFormat::readIdLine(ParserState* s) {
    if (!s->hasKey("ID", 2)) {
        s->si.setError(EMBLPlainTextFormat::tr("ID is not the first line"));
        return false;
    }

    QString idLineStr = s->value();
    QStringList tokens = idLineStr.split(";");
    if (idLineStr.length() < 6 || tokens.isEmpty()) {
        s->si.setError(EMBLPlainTextFormat::tr("Error parsing ID line"));
        return false;
    }
    s->entry->name = tokens[0];
    DNALocusInfo loi;
    loi.name = tokens[0];
    if (tokens.size() > 1) {
        // Second field: "SV <version>"; trimmed because of the blank after ';'.
        QString sec = tokens[1].trimmed();
        if (sec.startsWith("SV ")) {
            s->entry->tags[DNAInfo::VERSION] = tokens[0] + "." + sec.mid(3).trimmed();
        }
        // Last field: "<length> BP."
        QString last = tokens.last().trimmed();
        if (last.endsWith("BP.")) {
            last.chop(3);
            s->entry->seqLen = last.trimmed().toInt();
        }
    }
    if (tokens.size() == 7) {
        // seems to be canonical header
        // http://www.ebi.ac.uk/embl/Documentation/User_manual/printable.html
        //1. Primary accession number
        //2. Sequence version number
        //3. Topology: 'circular' or 'linear'
        //4. Molecule type (see note 1 below)
        //5. Data class (methodological approach)
        //6. Taxonomic division (see section 3.2)
        //7. Sequence length (see note 2 below)
        loi.topology = tokens[2].trimmed();
        loi.molecule = tokens[3].trimmed();
        loi.division = tokens[5].trimmed();
    } else {
        // remember just in case
        s->entry->tags.insert(DNAInfo::EMBL_ID, idLineStr);
    }
    s->entry->tags.insert(DNAInfo::LOCUS, qVariantFromValue<DNALocusInfo>(loi));
    return true;
}

/**
 * Reads a single EMBL entry line by line from the parser state.
 *
 * Processing per line key:
 *  - the first line(s) are handed to readIdLine() until the entry has a name;
 *  - "FH", "XX" and "AH" lines are skipped;
 *  - "AC" values are split on ';' and appended to the DNAInfo::ACCESSION tag;
 *  - "OS" (with following "OC" and "OG" lines) is collected into a
 *    DNASourceInfo stored under DNAInfo::SOURCE;
 *  - "RF"/"RN" start a run of lines beginning with 'R' that is skipped;
 *  - "FT" delegates to readAnnotations();
 *  - "//" ends the entry, "SQ" delegates to readSequence() and ends the entry;
 *  - any other key is stored as a tag (translated through tagMap when known),
 *    consecutive lines with the same key being merged into one list value.
 *
 * @param sequence buffer passed to readSequence(); its current size is passed
 *                 to readAnnotations()
 * @param st       parser state providing the lines and receiving the entry data
 * @return true when the entry terminator or the sequence section was reached;
 *         false when the input ended or an error occurred first. In the
 *         latter case "Record is truncated." is reported unless the state is
 *         null, an error is already set or the task was cancelled.
 */
bool EMBLPlainTextFormat::readEntry(QByteArray& sequence, ParserState* st) {
    TaskStateInfo& status = st->si;
    QString prevTag;
    // True when a branch below has already advanced to a line it did not
    // consume: the next iteration must then process the current line instead
    // of reading a new one.
    bool lineBuffered = false;

    for (;;) {
        if (!lineBuffered && !st->readNextLine(false)) {
            break;
        }
        lineBuffered = false;

        // The entry has no name until its ID line has been parsed.
        if (st->entry->name.isEmpty()) {
            readIdLine(st);
            assert(status.hasErrors() || !st->entry->name.isEmpty());
            if (status.hasErrors()) {
                break;
            }
            continue;
        }

        // Lines that carry nothing we keep.
        if (st->hasKey("FH") || st->hasKey("XX") || st->hasKey("AH")) {
            continue;
        }

        // Accession numbers: accumulate into one list.
        if (st->hasKey("AC")) {
            QVariant known = st->entry->tags.value(DNAInfo::ACCESSION);
            QStringList ids = st->value().split(QRegExp(";\\s*"), QString::SkipEmptyParts);
            st->entry->tags[DNAInfo::ACCESSION] = QVariantUtils::addStr2List(known, ids);
            continue;
        }

        // Organism block: OS line(s), optionally followed by OC line(s) and OG.
        if (st->hasKey("OS")) {
            DNASourceInfo source;
            source.name = st->value();
            source.organism = source.name;
            while (st->readNextLine()) {
                if (st->hasKey("OS")) {
                    source.organism.append(" ").append(st->value());
                    continue;
                }
                if (!st->hasKey("XX")) {
                    break;
                }
            }
            if (st->hasKey("OC")) {
                source.taxonomy += st->value();
                while (st->readNextLine()) {
                    if (st->hasKey("OC")) {
                        source.taxonomy.append(st->value());
                        continue;
                    }
                    if (!st->hasKey("XX")) {
                        break;
                    }
                }
            }
            if (st->hasKey("OG")) {
                source.organelle = st->value();
            } else {
                // The current line belongs to something else: process it next.
                lineBuffered = true;
            }
            st->entry->tags.insertMulti(DNAInfo::SOURCE, qVariantFromValue<DNASourceInfo>(source));
            continue;
        }

        // Reference block: skip every following line that starts with 'R'.
        if (st->hasKey("RF") || st->hasKey("RN")) {
            while (st->readNextLine() && st->buff[0] == 'R') {
                //TODO
            }
            lineBuffered = true;
            continue;
        }

        // Feature table.
        if (st->hasKey("FT", 2)) {
            readAnnotations(st, sequence.size());
            lineBuffered = true;
            continue;
        }

        // End of entry.
        if (st->hasKey("//", 2)) {
            return true;
        }

        // Sequence section closes the entry as well.
        if (st->hasKey("SQ", 2)) {
            readSequence(sequence, st);
            return true;
        }

        // Simple tag: translate the EMBL code when a mapping exists.
        QString tag = st->key().trimmed();
        if (tagMap.contains(tag)) {
            tag = tagMap.value(tag);
        }
        if (prevTag == tag) {
            // Continuation of the previous tag: extend its value list.
            QVariant merged = st->entry->tags.take(prevTag);
            merged = QVariantUtils::addStr2List(merged, st->value());
            st->entry->tags.insert(prevTag, merged);
        } else if (st->hasValue()) {
            prevTag = tag;
            st->entry->tags.insertMulti(prevTag, st->value());
        }
    }

    // The loop ended without reaching "//" or "SQ".
    if (st->isNull() || status.hasErrors() || status.cancelFlag) {
        return false;
    }
    status.setError(GB2::EMBLGenbankAbstractDocument::tr("Record is truncated."));
    return false;
}

}//namespace
