/*****************************************************************
* Unipro UGENE - Integrated Bioinformatics Suite
* Copyright (C) 2008,2009 Unipro, Russia (http://ugene.unipro.ru)
* All Rights Reserved
* 
*     This source code is distributed under the terms of the
*     GNU General Public License. See the files COPYING and LICENSE
*     for details.
*****************************************************************/

#include "MSAConsensusAlgorithmClustal.h"

#include <datatype/MAlignment.h>
#include <core_api/DNAAlphabet.h>

namespace GB2 {

// Returns the translated, human-readable description of this consensus algorithm.
QString MSAConsensusAlgorithmFactoryClustal::getDescription() const {
	const QString description = tr("Emulates ClustalW program and file format behavior.");
	return description;
}

// Returns the translated display name of this consensus algorithm.
QString MSAConsensusAlgorithmFactoryClustal::getName() const {
	const QString name = tr("ClustalW");
	return name;
}


// Creates a new ClustalW-style consensus algorithm instance bound to this factory.
// The alignment argument is not used; 'p' is forwarded to the algorithm constructor
// (presumably as the QObject parent -- confirm against the class declaration).
MSAConsensusAlgorithm* MSAConsensusAlgorithmFactoryClustal::createAlgorithm(const MAlignment&, QObject* p) {
    MSAConsensusAlgorithm* algorithm = new MSAConsensusAlgorithmClustal(this, p);
    return algorithm;
}
    
//////////////////////////////////////////////////////////////////////////
//Algorithm

char MSAConsensusAlgorithmClustal::getConsensusChar(const MAlignment& msa, int pos) const {
	if (!msa.getAlphabet()->isAmino()) { 
		// for nucleic alphabet work as strict algorithm but use ' ' as default
		char  defChar = ' ';
		char pc = msa.getRows().first().chatAt(pos);
		if (pc == MAlignment_GapChar) {
			pc = defChar;
		}
		for (int s = 1, nSeq = msa.getNumRows(); s < nSeq; s++) {
			const MAlignmentRow& row = msa.getRow(s);
			char c = row.chatAt(pos);
			if (c != pc) {
				pc = defChar;
				break;
			}
		}
		char res = (pc == defChar) ? defChar : '*';
		return res;
	} else {
		/* From ClustalW doc:
		'*' indicates positions which have a single, fully conserved residue
		':' indicates that one of the following 'strong' groups is fully conserved:
		STA, NEQK, NHQK, NDEQ, QHRK, MILV, MILF, HY, FYW, 
		'.' indicates that one of the following 'weaker' groups is fully conserved:
		CSA, ATV, SAG, STNK, STPA, SGND, SNDEQK, NDEQHK, NEQHRK, FVLIM, HFY
		*/
		static QByteArray strongGroups[] = {"STA", "NEQK", "NHQK", "NDEQ", "QHRK", "MILV", "MILF", "HY", "FYW"};
		static QByteArray weakGroups[]   = {"CSA", "ATV", "SAG", "STNK", "STPA", "SGND", "SNDEQK", "NDEQHK", "NEQHRK", "FVLIM", "HFY"};
		static int maxStrongGroupLen = 4;
		static int maxWeakGroupLen = 6;

		QByteArray currentGroup; //TODO: optimize 'currentGroup' related code!
		for (int s = 0, nSeq = msa.getNumRows(); s < nSeq; s++) {
			const MAlignmentRow& row = msa.getRow(s);
			char c = row.chatAt(pos);
			if (!currentGroup.contains(c)) {
				currentGroup.append(c);
			}
		}
		char consChar = MAlignment_GapChar;
		if (currentGroup.size() == 1) {
			consChar = (currentGroup[0] == MAlignment_GapChar) ? ' ' : '*';
		} else  {
			bool ok = false;
			int currentLen = currentGroup.length();
			const char* currentGroupData = currentGroup.data();
			//check strong groups
			if (currentLen <= maxStrongGroupLen) {
				for (int sgi=0, sgn = sizeof(strongGroups) / sizeof(QByteArray); sgi < sgn && !ok; sgi++) {
					bool matches = true;
					const QByteArray& sgroup = strongGroups[sgi];
					for (int j=0; j < currentLen && matches; j++) {
						char c = currentGroupData[j];
						matches = sgroup.contains(c);
					}
					ok = matches;
				}
				if (ok) {
					consChar = ':';
				}
			} 

			//check weak groups
			if (!ok && currentLen <= maxWeakGroupLen) {
				for (int wgi=0, wgn = sizeof(weakGroups) / sizeof(QByteArray); wgi < wgn && !ok; wgi++) {
					bool matches = true;
					const QByteArray& wgroup = weakGroups[wgi];
					for (int j=0; j < currentLen && matches; j++) {
						char c = currentGroupData[j];
						matches = wgroup.contains(c);
					}
					ok = matches;
				}
				if (ok) {
					consChar = '.';
				}
			} 
			//use default
			if (!ok) {
				consChar = ' ';
			}
		} //amino
		return consChar;
	} 
}

} //namespace
