From: westcott <westcott>
Date: Tue, 16 Feb 2010 15:23:05 +0000 (+0000)
Subject: chimeracode
X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=9ada98592a54c82d08f3d46c9b1d8c3e472a922d

chimeracode
---

diff --git a/blastdb.cpp b/blastdb.cpp
index 50a0401..b139f3e 100644
--- a/blastdb.cpp
+++ b/blastdb.cpp
@@ -50,7 +50,7 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
 		//	wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
 		//	long.  With this setting, it seems comparable in speed to the suffix tree approach.
 		
-		string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28 -v " + toString(n);
+		string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);;
 		blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
 		system(blastCommand.c_str());
 		
@@ -82,9 +82,9 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
 }
 /**************************************************************************************************/
 //assumes you have added all the template sequences using the addSequence function and run generateDB.
-map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
+vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
 	try{
-		map<int, float> topMatches;
+		vector<int> topMatches;
 		
 		ofstream queryFile;
 		openOutputFile(queryFileName, queryFile);
@@ -96,7 +96,7 @@ map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
 		//	wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
 		//	long.  With this setting, it seems comparable in speed to the suffix tree approach.
 	
-		string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -b " + toString(n) + " -v " + toString(n);
+		string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
 		blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
 		system(blastCommand.c_str());
 		
@@ -106,22 +106,18 @@ map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
 		string dummy;
 		int templateAccession;
 		gobble(m8FileHandle);
-//string name = seq->getName();
-//ofstream out;
-//openOutputFileAppend(name, out);	
+		
 		while(!m8FileHandle.eof()){
 			m8FileHandle >> dummy >> templateAccession >> searchScore;
-//out << dummy << '\t' <<  templateAccession	<< '\t' << searchScore << endl;
+			
 			//get rest of junk in line
-			while (!m8FileHandle.eof())	{	char c = m8FileHandle.get(); 
-			//out << c; 
-			if (c == 10 || c == 13){	break;	}	} 
+			while (!m8FileHandle.eof())	{	char c = m8FileHandle.get(); if (c == 10 || c == 13){	break;	}	} 
 			
 			gobble(m8FileHandle);
-			topMatches[templateAccession] = searchScore;
+			topMatches.push_back(templateAccession);
 		}
 		m8FileHandle.close();
-//out.close();		
+		
 		return topMatches;
 	}
 	catch(exception& e) {
diff --git a/blastdb.hpp b/blastdb.hpp
index fea55c8..0f8fccd 100644
--- a/blastdb.hpp
+++ b/blastdb.hpp
@@ -23,7 +23,7 @@ public:
 	void generateDB();
 	void addSequence(Sequence);
 	vector<int> findClosestSequences(Sequence*, int);
-	map<int, float> findClosest(Sequence*, int); //template index -> searchscore
+	vector<int> findClosestMegaBlast(Sequence*, int);
 
 private:
 	string dbFileName;
diff --git a/chimera.h b/chimera.h
index a968a02..4d59012 100644
--- a/chimera.h
+++ b/chimera.h
@@ -50,23 +50,11 @@ struct results {
 	float divR;
 };
 /***********************************************************************/
-struct rank {
-		int num;
-		float score;
-		rank(int n, float s) : num(n), score(s) {}
-};
-/***********************************************************************/
-
 struct SeqDist {
 	Sequence* seq;
 	float dist;
 };
 //********************************************************************************************************************
-//sorts highest to lowest
-inline bool compareMembers(rank left, rank right){
-	return (left.score > right.score);	
-} 
-//********************************************************************************************************************
 //sorts lowest to highest
 inline bool compareRegionStart(results left, results right){
 	return (left.nastRegionStart < right.nastRegionStart);	
diff --git a/chimerarealigner.cpp b/chimerarealigner.cpp
index 57c3641..f45b9a8 100644
--- a/chimerarealigner.cpp
+++ b/chimerarealigner.cpp
@@ -29,19 +29,23 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
 			sort(parents.begin(), parents.end(), compareRegionStart);
 
 			//make sure you don't cutoff beginning of query 
-			if (parents[0].nastRegionStart > 0) {  newQuery += qAligned.substr(0, parents[0].nastRegionStart+1);  }
+			if (parents[0].nastRegionStart > 0) {  newQuery += qAligned.substr(0, parents[0].nastRegionStart);  }
 			int longest = 0;
 			//take query and break apart into pieces using breakpoints given by results of parents
 			for (int i = 0; i < parents.size(); i++) {
-				int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1;
+	cout << parents[i].parent << '\t' << parents[i].nastRegionStart <<  '\t' << parents[i].nastRegionEnd	<< endl;	
+				int length = parents[i].nastRegionEnd - parents[i].nastRegionStart;
 				string q = qAligned.substr(parents[i].nastRegionStart, length);
+	cout << "query = " << q << endl;
 				Sequence* queryFrag = new Sequence(query->getName(), q);
 				
 				queryParts.push_back(queryFrag);
 				
 				Sequence* parent = getSequence(parents[i].parent);
 				string p = parent->getAligned();
+		
 				p = p.substr(parents[i].nastRegionStart, length);
+	cout << "parent = " << p << endl;		
 				parent->setAligned(p);
 				
 				parentParts.push_back(parent);
@@ -62,8 +66,9 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
 				newQuery += queryParts[i]->getAligned();
 			}
 			
+			
 			//make sure you don't cutoff end of query 
-			if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) {  newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd-1);  }
+			if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) {  newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd);  }
 			
 			//set query to new aligned string
 			query->setAligned(newQuery);
diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp
index b2c0a05..2cbd5d8 100644
--- a/chimeraslayer.cpp
+++ b/chimeraslayer.cpp
@@ -64,9 +64,9 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
 		vector<results> Results = maligner->getOutput();
 
 		//realign query to parents to improve slayers detection rate
-		//ChimeraReAligner realigner(templateSeqs, match, misMatch);
-		//realigner.reAlign(query, Results);
-			
+		ChimeraReAligner realigner(templateSeqs, match, misMatch);
+		realigner.reAlign(query, Results);
+cout << query->getName() << '\n' << query->getAligned() << endl;
 			//if (chimeraFlag == "yes") {
 			
 		//get sequence that were given from maligner results
diff --git a/database.hpp b/database.hpp
index 7905077..7fd29b3 100644
--- a/database.hpp
+++ b/database.hpp
@@ -48,7 +48,7 @@ public:
 	virtual void generateDB() = 0; 
 	virtual void addSequence(Sequence) = 0;  //add sequence to search engine
 	virtual vector<int> findClosestSequences(Sequence*, int) = 0;  // returns indexes of n closest sequences to query
-	virtual map<int, float> findClosest(Sequence*, int){ return results; }  // returns of n closest sequences to query and their search scores
+	virtual vector<int> findClosestMegaBlast(Sequence*, int){return results;}
 	virtual float getSearchScore();
 	virtual int getLongestBase(); 
 	virtual void readKmerDB(ifstream&){};
@@ -60,7 +60,7 @@ public:
 protected:
 	int numSeqs, longest;
 	float searchScore;
-	map<int, float> results;
+	vector<int> results;
 };
 /**************************************************************************************************/
 #endif
diff --git a/maligner.cpp b/maligner.cpp
index de6004d..b393194 100644
--- a/maligner.cpp
+++ b/maligner.cpp
@@ -32,28 +32,8 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
 			refSeqs = getBlastSeqs(query, numWanted);
 		}
 		
-//ofstream out;
-//string name = toString(numi+1); 
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(0, name.find_last_of("|"));
-//cout << name << endl;
-//string filename = name + ".seqsformaligner";
-//openOutputFile(filename, out);	
-//for (int u = 0; u < refSeqs.size(); u++) {  refSeqs[u]->printSequence(out);	}
-//out.close();
-//filename = name + ".fasta";
-//openOutputFile(filename, out);	
-//query->printSequence(out);
-//out.close();
-
-//for (int i = 0; i < refSeqs.size(); i++)  { cout << refSeqs[i]->getName() << endl;  }	
-//cout << "before = " << refSeqs.size() << endl;
 		refSeqs = minCoverageFilter(refSeqs);
-//cout << "after = " << refSeqs.size() << endl;		
+		
 		if (refSeqs.size() < 2)  { 
 			for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];	}
 			percentIdenticalQueryChimera = 0.0;
@@ -61,7 +41,7 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
 		}
 		
 		int chimeraPenalty = computeChimeraPenalty();
-//cout << chimeraPenalty << endl;		
+	
 		//fills outputResults
 		chimera = chimeraMaligner(chimeraPenalty, decalc);
 		
@@ -500,8 +480,7 @@ float Maligner::computePercentID(string queryAlign, string chimera) {
 }
 //***************************************************************************************************************
 vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
-	try {
-cout << q->getName() << endl;	
+	try {	
 		//generate blastdb
 		Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
 		for (int i = 0; i < db.size(); i++) { 	database->addSequence(*db[i]);	}
@@ -509,55 +488,42 @@ cout << q->getName() << endl;
 		database->setNumSeqs(db.size());
 		
 		//get parts of query
-		string queryAligned = q->getAligned();
-		string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence
-		string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence
-		
+		string queryUnAligned = q->getUnaligned();
+		string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
+		string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
+
 		Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
 		Sequence* queryRight = new Sequence(q->getName(), rightQuery);
 		
-		map<int, float> tempIndexesRight = database->findClosest(queryRight, num);
-		map<int, float> tempIndexesLeft = database->findClosest(queryLeft, num);
-		
-		//merge results
-		vector<rank> mergedResults;
+		vector<int> tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1);
+		vector<int> tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1);
 		
-		map<int, float>::iterator it;
-		map<int, float>::iterator it2;
+		//merge results		
+		map<int, int> seen;
+		map<int, int>::iterator it;
 		
-		//add in right guys merging common finds
-		for (it = tempIndexesRight.begin(); it != tempIndexesRight.end(); it++) {
-			it2 = tempIndexesLeft.find(it->first);
-			
-			if (it2 == tempIndexesLeft.end()) {  //result only present in right
-				rank temp(it->first, it->second);
-				mergedResults.push_back(temp); 
+		vector<int> mergedResults;
+		for (int i = 0; i < tempIndexesLeft.size(); i++) {
+			//add left if you havent already
+			it = seen.find(tempIndexesLeft[i]);
+			if (it == seen.end()) {  
+				mergedResults.push_back(tempIndexesLeft[i]);
+				seen[tempIndexesLeft[i]] = tempIndexesLeft[i];
+			}
 
-			}else { //result present in both save best score
-				float bestscore;
-				if (it->second > it2->second)	{  bestscore = it->second;  }
-				else							{  bestscore = it2->second; }
-				
-				rank temp(it->first, bestscore);
-				mergedResults.push_back(temp);
-				
-				tempIndexesLeft.erase(it2);
+			//add right if you havent already
+			it = seen.find(tempIndexesRight[i]);
+			if (it == seen.end()) {  
+				mergedResults.push_back(tempIndexesRight[i]);
+				seen[tempIndexesRight[i]] = tempIndexesRight[i];
 			}
 		}
 		
-		//add in unique left guys
-		for (it = tempIndexesLeft.begin(); it != tempIndexesLeft.end(); it++) {
-			rank temp(it->first, it->second);
-			mergedResults.push_back(temp); 
-		}
-		
-		sort(mergedResults.begin(), mergedResults.end(), compareMembers);
 		
 		vector<Sequence*> refResults;
 		for (int i = 0; i < numWanted; i++) {
-			Sequence* temp = new Sequence(db[mergedResults[i].num]->getName(), db[mergedResults[i].num]->getAligned());
+			Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
 			refResults.push_back(temp);
-cout << db[mergedResults[i].num]->getName() << endl;
 		}
 		
 		delete queryRight;