From 1cf188b912d6da8f2cd03dd71cecef664a699c1a Mon Sep 17 00:00:00 2001
From: westcott <westcott>
Date: Wed, 4 May 2011 14:11:02 +0000
Subject: [PATCH] changes for chimera slayer

---
 blastdb.cpp              | 21 ++++++++++++---------
 blastdb.hpp              |  2 +-
 chimeraslayer.cpp        | 12 ++++++------
 chimeraslayercommand.cpp | 10 +++++-----
 database.hpp             |  2 +-
 decalc.cpp               | 30 ++++++++++++++++++++++++------
 decalc.h                 |  2 +-
 maligner.cpp             |  1 +
 slayer.cpp               | 40 ++++++++++++++++++++++++++++++----------
 9 files changed, 81 insertions(+), 39 deletions(-)
diff --git a/blastdb.cpp b/blastdb.cpp
index fdb5456..3e1210b 100644
--- a/blastdb.cpp
+++ b/blastdb.cpp
@@ -111,9 +111,10 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
 }
 /**************************************************************************************************/
 //assumes you have added all the template sequences using the addSequence function and run generateDB.
-vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
+vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) {
 	try{
 		vector<int> topMatches;
+		float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score;
 		Scores.clear();
 		
 		ofstream queryFile;
@@ -126,7 +127,7 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
 		//	the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
 		//	wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
 		//	long.  With this setting, it seems comparable in speed to the suffix tree approach.
-	
+//7000004128189528left	0	100		66	0	0	1	66	61	126	1e-31	 131	
 		string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
 		blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName());
 		system(blastCommand.c_str());
@@ -134,20 +135,22 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
 		ifstream m8FileHandle;
 		m->openInputFile(blastFileName+seq->getName(), m8FileHandle, "no error");
 	
-		string dummy;
+		string dummy, eScore;
 		int templateAccession;
 		m->gobble(m8FileHandle);
 		
 		while(!m8FileHandle.eof()){
-			m8FileHandle >> dummy >> templateAccession >> searchScore;
-			//cout << templateAccession << '\t' << searchScore << endl;
+			m8FileHandle >> dummy >> templateAccession >> searchScore >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
+			//cout << dummy << '\t' << templateAccession << '\t' << searchScore << '\t';
 			
 			//get rest of junk in line
-			while (!m8FileHandle.eof())	{	char c = m8FileHandle.get(); if (c == 10 || c == 13){	break;	}	} 
-			
+			//while (!m8FileHandle.eof())	{	char c = m8FileHandle.get(); if (c == 10 || c == 13){	break;	}else{ cout << c; }	} //
+				//cout << endl;
 			m->gobble(m8FileHandle);
-			topMatches.push_back(templateAccession);
-			Scores.push_back(searchScore);
+			if (score >= minPerID) {
+				topMatches.push_back(templateAccession);
+				Scores.push_back(searchScore);
+			}
 //cout << templateAccession << endl;
 		}
 		m8FileHandle.close();
diff --git a/blastdb.hpp b/blastdb.hpp
index 844f7f1..9b8965d 100644
--- a/blastdb.hpp
+++ b/blastdb.hpp
@@ -23,7 +23,7 @@ public:
 	void generateDB();
 	void addSequence(Sequence);
 	vector<int> findClosestSequences(Sequence*, int);
-	vector<int> findClosestMegaBlast(Sequence*, int);
+	vector<int> findClosestMegaBlast(Sequence*, int, int);
 	
 private:
 	string dbFileName;
diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp
index 59b2fea..cd268d3 100644
--- a/chimeraslayer.cpp
+++ b/chimeraslayer.cpp
@@ -999,7 +999,7 @@ vector<Sequence*> ChimeraSlayer::getRefSeqs(Sequence* q, vector<Sequence*>& this
 			//find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
 			Sequence* newSeq = new Sequence(q->getName(), q->getAligned());
 			runFilter(newSeq);
-			refSeqs = decalc->findClosest(newSeq, thisTemplate, thisFilteredTemplate, numWanted);
+			refSeqs = decalc->findClosest(newSeq, thisTemplate, thisFilteredTemplate, numWanted, minSim);
 			delete newSeq;
 		}else if (searchMethod == "blast")  {
 			refSeqs = getBlastSeqs(q, thisTemplate, numWanted); //fills indexes
@@ -1025,12 +1025,12 @@ vector<Sequence*> ChimeraSlayer::getBlastSeqs(Sequence* q, vector<Sequence*>& db
 		string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
 		string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
 		
-		Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery);
-		Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery);
-		
-		vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1);
-		vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1);
+		Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
+		Sequence* queryRight = new Sequence(q->getName(), rightQuery);
 		
+		vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1, minSim);
+		vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1, minSim);
+		cout << q->getName() << '\t' << leftQuery << '\t' << "leftMatches = " << tempIndexesLeft.size() << '\t' << rightQuery	<< " rightMatches = " << tempIndexesRight.size() << endl;
 		vector<int> smaller;
 		vector<int> larger;
 		
diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp
index 9299f89..97edf11 100644
--- a/chimeraslayercommand.cpp
+++ b/chimeraslayercommand.cpp
@@ -23,7 +23,7 @@ vector<string> ChimeraSlayerCommand::setParameters(){
 		CommandParameter pmismatch("mismatch", "Number", "", "-4.0", "", "", "",false,false); parameters.push_back(pmismatch);
 		CommandParameter pminsim("minsim", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminsim);
 		CommandParameter pmincov("mincov", "Number", "", "70", "", "", "",false,false); parameters.push_back(pmincov);
-		CommandParameter pminsnp("minsnp", "Number", "", "100", "", "", "",false,false); parameters.push_back(pminsnp);
+		CommandParameter pminsnp("minsnp", "Number", "", "10", "", "", "",false,false); parameters.push_back(pminsnp);
 		CommandParameter pminbs("minbs", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminbs);
 		CommandParameter psearch("search", "Multiple", "kmer-blast-distance", "distance", "", "", "",false,false); parameters.push_back(psearch);
 		CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
@@ -31,7 +31,7 @@ vector<string> ChimeraSlayerCommand::setParameters(){
 		CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptrim);
 		CommandParameter psplit("split", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psplit);
 		CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "",false,false); parameters.push_back(pnumwanted);
-		CommandParameter piters("iters", "Number", "", "100", "", "", "",false,false); parameters.push_back(piters);
+		CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
 		CommandParameter pdivergence("divergence", "Number", "", "1.007", "", "", "",false,false); parameters.push_back(pdivergence);
 		CommandParameter pparents("parents", "Number", "", "3", "", "", "",false,false); parameters.push_back(pparents);
 		CommandParameter pincrement("increment", "Number", "", "5", "", "", "",false,false); parameters.push_back(pincrement);
@@ -72,11 +72,11 @@ string ChimeraSlayerCommand::getHelpString(){
 		helpString += "The parents parameter allows you to select the number of potential parents to investigate from the numwanted best matches after rating them, default is 3. \n";
 		helpString += "The mismatch parameter allows you to penalize mismatched bases in blast search, default is -4. \n";
 		helpString += "The divergence parameter allows you to set a cutoff for chimera determination, default is 1.007. \n";
-		helpString += "The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method, default=100.\n";
+		helpString += "The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method, default=1000.\n";
 		helpString += "The minsim parameter allows you to specify a minimum similarity with the parent fragments, default=90. \n";
 		helpString += "The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n";
 		helpString += "The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n";
-		helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 100) \n";
+		helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n";
 		helpString += "The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n";
 		helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default true.  \n";
 		helpString += "The chimera.slayer command should be in the following format: \n";
@@ -353,7 +353,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
 			
 			search = validParameter.validFile(parameters, "search", false);			if (search == "not found") { search = "distance"; }
 			
-			temp = validParameter.validFile(parameters, "iters", false);			if (temp == "not found") { temp = "100"; }		
+			temp = validParameter.validFile(parameters, "iters", false);			if (temp == "not found") { temp = "1000"; }		
 			convert(temp, iters); 
 			 
 			temp = validParameter.validFile(parameters, "increment", false);		if (temp == "not found") { temp = "5"; }
diff --git a/database.hpp b/database.hpp
index f9c0c48..f974ae9 100644
--- a/database.hpp
+++ b/database.hpp
@@ -50,7 +50,7 @@ public:
 	virtual void addSequence(Sequence) = 0;  //add sequence to search engine
 	virtual string getName(int) { return ""; }  
 	virtual vector<int> findClosestSequences(Sequence*, int) = 0;  // returns indexes of n closest sequences to query
-	virtual vector<int> findClosestMegaBlast(Sequence*, int){return results;}
+	virtual vector<int> findClosestMegaBlast(Sequence*, int, int){return results;}
 	virtual float getSearchScore();
 	virtual vector<float> getSearchScores() { return Scores; } //assumes you already called findClosestMegaBlast
 	virtual int getLongestBase(); 
diff --git a/decalc.cpp b/decalc.cpp
index 4e38053..3b3740b 100644
--- a/decalc.cpp
+++ b/decalc.cpp
@@ -12,7 +12,7 @@
 #include "dist.h"
 #include "eachgapdist.h"
 #include "ignoregaps.h"
-#include "eachgapdistignorens.h"
+#include "eachgapdist.h"
 
 //***************************************************************************************************************
 void DeCalculator::setMask(string ms) { 
@@ -683,7 +683,7 @@ float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
 }
 //***************************************************************************************************************
 //gets closest matches to each end, since chimeras will most likely have different parents on each end
-vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*>& thisTemplate, vector<Sequence*>& thisFilteredTemplate, int numWanted) {
+vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*>& thisTemplate, vector<Sequence*>& thisFilteredTemplate, int numWanted, int minSim) {
 	try {
 		//indexes.clear();
 		
@@ -692,7 +692,7 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
 		vector<SeqDist> distsLeft;
 		vector<SeqDist> distsRight;
 		
-		Dist* distcalculator = new eachGapDistIgnoreNs();
+		Dist* distcalculator = new eachGapDist();
 		
 		string queryUnAligned = querySeq->getUnaligned();
 		int numBases = int(queryUnAligned.length() * 0.33);
@@ -813,14 +813,32 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
 			
 		}
 		
+		//are we still above the minimum similarity cutoff
+		if ((lastLeft >= minSim) || (lastRight >= minSim)) {
+			//add in ties from left
+			int i = numWanted;
+			while (i < distsLeft.size()) {  
+				if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  }
+				else { break; }
+				i++;
+			}
+			
+			//add in ties from right
+			i = numWanted;
+			while (i < distsRight.size()) {  
+				if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  }
+				else { break; }
+				i++;
+			}
+		}
+		
 
-		//numWanted = dists.size();
 
 		//cout << numWanted << endl;
 		for (int i = 0; i < dists.size(); i++) {
 //			cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl;
 
-			if (thisTemplate[dists[i].index]->getName() != querySeq->getName()) {
+			if ((thisTemplate[dists[i].index]->getName() != querySeq->getName()) && (dists[i].dist >= minSim)) {
 				Sequence* temp = new Sequence(thisTemplate[dists[i].index]->getName(), thisTemplate[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
 			
 				seqsMatches.push_back(temp);
@@ -841,7 +859,7 @@ Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
 		
 		Sequence* seqsMatch;  
 		
-		Dist* distcalculator = new eachGapDistIgnoreNs();
+		Dist* distcalculator = new eachGapDist();
 		int index = 0;
 		int smallest = 1000000;
 		
diff --git a/decalc.h b/decalc.h
index 42e1441..40b3a10 100644
--- a/decalc.h
+++ b/decalc.h
@@ -39,7 +39,7 @@ class DeCalculator {
 		DeCalculator() { m = MothurOut::getInstance(); }
 		~DeCalculator() {};
 		
-		vector<Sequence*> findClosest(Sequence*, vector<Sequence*>&, vector<Sequence*>&, int);  //takes querySeq, a reference db, filteredRefDB, numWanted 
+		vector<Sequence*> findClosest(Sequence*, vector<Sequence*>&, vector<Sequence*>&, int, int);  //takes querySeq, a reference db, filteredRefDB, numWanted, minSim 
 		Sequence* findClosest(Sequence*, vector<Sequence*>);
 		set<int> getPos() {  return h;  }
 		void setMask(string); 
diff --git a/maligner.cpp b/maligner.cpp
index 52fd36f..f834db1 100644
--- a/maligner.cpp
+++ b/maligner.cpp
@@ -151,6 +151,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
 			temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
 			
 //			cout << temp.parent << '\t' << "NAST:" << temp.nastRegionStart << '-' << temp.nastRegionEnd << " G:" << temp.queryToParent << " L:" << temp.queryToParentLocal << endl;
+
 			outputResults.push_back(temp);
 		}
 		
diff --git a/slayer.cpp b/slayer.cpp
index 5ecb4ea..a244ea4 100644
--- a/slayer.cpp
+++ b/slayer.cpp
@@ -426,21 +426,41 @@ float Slayer::snpAB(vector<snps> data) {
 	}
 }
 /***********************************************************************/
-float Slayer::computePercentID(string queryFrag, string parent, int left, int right) {
+float Slayer::computePercentID(string queryAlign, string chimera, int left, int right) {
 	try {
-		int total = 0;
-		int matches = 0;
-
+				
+		int numIdentical = 0;
+		int countA = 0;
+		int countB = 0;
 		for (int i = left; i <= right; i++) {
-			total++;
-			if (queryFrag[i] == parent[i]) {
-				matches++;
+			if (((queryAlign[i] != 'G') && (queryAlign[i] != 'T') && (queryAlign[i] != 'A') && (queryAlign[i] != 'C')&& (queryAlign[i] != '.') && (queryAlign[i] != '-')) ||
+				((chimera[i] != 'G') && (chimera[i] != 'T') && (chimera[i] != 'A') && (chimera[i] != 'C')&& (chimera[i] != '.') && (chimera[i] != '-'))) {}
+			else {
+				
+				bool charA = false; bool charB = false;
+				if ((queryAlign[i] == 'G') || (queryAlign[i] == 'T') || (queryAlign[i] == 'A') || (queryAlign[i] == 'C')) { charA = true; }
+				if ((chimera[i] == 'G') || (chimera[i] == 'T') || (chimera[i] == 'A') || (chimera[i] == 'C')) { charB = true; }
+				
+				if (charA || charB) {
+					
+					if (charA) { countA++; }
+					if (charB) { countB++; }
+					
+					if (queryAlign[i] == chimera[i]) {
+						numIdentical++;
+					}
+				}
 			}
 		}
-
-		float percentID =( matches/(float)total) * 100;
 		
-		return percentID;
+		float numBases = (countA + countB) /(float) 2;
+		
+		if (numBases == 0) { return 0; }
+		
+		float percentIdentical = (numIdentical/(float)numBases) * 100;
+		
+		return percentIdentical;
+		
 	}
 	catch(exception& e) {
 		m->errorOut(e, "Slayer", "computePercentID");
-- 
2.39.5