From: westcott <westcott>
Date: Tue, 1 Sep 2009 12:17:14 +0000 (+0000)
Subject: added more descriptive error messages to sharedlist
X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=6d7408400b6bbdde4173922c5dca528f9f4e0a22

added more descriptive error messages to sharedlist
---

diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj
index c6d6c0e..a0ba7c2 100644
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -649,12 +649,12 @@
 			children = (
 				374F2FD51006320200E97C66 /* chimera.h */,
 				372095C1103196D70004D347 /* chimera.cpp */,
+				A75B887A104C16860083C454 /* alignedsimilarity.h */,
+				A75B8879104C16860083C454 /* alignedsimilarity.cpp */,
 				374F2FE9100634B600E97C66 /* bellerophon.h */,
 				374F2FEA100634B600E97C66 /* bellerophon.cpp */,
-				A75B8879104C16860083C454 /* alignedsimilarity.cpp */,
-				A75B887A104C16860083C454 /* alignedsimilarity.h */,
-				A75B887B104C16860083C454 /* ccode.cpp */,
 				A75B887C104C16860083C454 /* ccode.h */,
+				A75B887B104C16860083C454 /* ccode.cpp */,
 				372A55531017922B00C5194B /* decalc.h */,
 				372A55541017922B00C5194B /* decalc.cpp */,
 				374F301110063B6F00E97C66 /* pintail.h */,
diff --git a/ccode.cpp b/ccode.cpp
index a3ef5e6..4ec3a12 100644
--- a/ccode.cpp
+++ b/ccode.cpp
@@ -9,6 +9,7 @@
 
 #include "ccode.h"
 #include "ignoregaps.h"
+#include "eachgapdist.h"
 
 
 //***************************************************************************************************************
@@ -86,7 +87,8 @@ void Ccode::getChimeras() {
 			templateLines.push_back(new linePair(0, templateSeqs.size()));
 		#endif
 	
-		distCalc = new ignoreGaps();
+		distCalc = new eachGapDist();
+		decalc = new DeCalculator();
 		
 		//find closest
 		if (processors == 1) { 
@@ -96,15 +98,52 @@ void Ccode::getChimeras() {
 		}else {		createProcessesClosest();		}
 
 		
-		for (int i = 0; i < closest.size(); i++) {
-			cout << querySeqs[i]->getName() << ": ";
-			for (int j = 0; j < closest[i].size(); j++) {
+for (int i = 0; i < closest.size(); i++) {
+	cout << querySeqs[i]->getName() << ": ";
+	for (int j = 0; j < closest[i].size(); j++) {
 			
-				cout << closest[i][j]->getName() << '\t';
+		cout << closest[i][j]->getName() << '\t';
+	}
+	cout << endl;
+}	
+
+		//mask sequences if the user wants to 
+		if (seqMask != "") {
+			//mask querys
+			for (int i = 0; i < querySeqs.size(); i++) {
+				decalc->runMask(querySeqs[i]);
+			}
+		
+			//mask templates
+			for (int i = 0; i < templateSeqs.size(); i++) {
+				decalc->runMask(templateSeqs[i]);
 			}
-			cout << endl;
 		}
+		
+		if (filter) {
+			vector<Sequence*> temp = templateSeqs;
+			for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
+			
+			createFilter(temp);
 			
+			runFilter(querySeqs);
+			runFilter(templateSeqs);
+		}
+
+		//trim sequences - this follows ccodes remove_extra_gaps 
+		//just need to pass it query and template since closest points to template
+		trimSequences();
+		
+		//windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
+		//Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
+		windows = findWindows();  
+		
+		//remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
+		for (int i = 0; i < closest.size(); i++) {
+			removeBadReferenceSeqs(closest[i], i);
+		}
+			
+					
 		//free memory
 		for (int i = 0; i < lines.size(); i++)					{	delete lines[i];				}
 		for (int i = 0; i < templateLines.size(); i++)			{	delete templateLines[i];		}
@@ -115,15 +154,141 @@ void Ccode::getChimeras() {
 		exit(1);
 	}
 }
-/***************************************************************************************************************
+/***************************************************************************************************************/
+//ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
+void Ccode::trimSequences() {
+	try {
+		
+		int frontPos = 0;  //should contain first position in all seqs that is not a gap character
+		int rearPos = querySeqs[0]->getAligned().length();
+		
+		//********find first position in all seqs that is a non gap character***********//
+		//find first position all query seqs that is a non gap character
+		for (int i = 0; i < querySeqs.size(); i++) {
+			
+			string aligned = querySeqs[i]->getAligned();
+			int pos = 0;
+			
+			//find first spot in this seq
+			for (int j = 0; j < aligned.length(); j++) {
+				if (isalpha(aligned[j])) {
+					pos = j;
+					break;
+				}
+			}
+			
+			//save this spot if it is the farthest
+			if (pos > frontPos) { frontPos = pos; }
+		}
+		
+		//find first position all template seqs that is a non gap character
+		for (int i = 0; i < templateSeqs.size(); i++) {
+			
+			string aligned = templateSeqs[i]->getAligned();
+			int pos = 0;
+			
+			//find first spot in this seq
+			for (int j = 0; j < aligned.length(); j++) {
+				if (isalpha(aligned[j])) {
+					pos = j;
+					break;
+				}
+			}
+			
+			//save this spot if it is the farthest
+			if (pos > frontPos) { frontPos = pos; }
+		}
+
+		
+		//********find last position in all seqs that is a non gap character***********//
+		//find last position all query seqs that is a non gap character
+		for (int i = 0; i < querySeqs.size(); i++) {
+			
+			string aligned = querySeqs[i]->getAligned();
+			int pos = aligned.length();
+			
+			//find first spot in this seq
+			for (int j = aligned.length()-1; j >= 0; j--) {
+				if (isalpha(aligned[j])) {
+					pos = j;
+					break;
+				}
+			}
+			
+			//save this spot if it is the farthest
+			if (pos < rearPos) { rearPos = pos; }
+		}
+		
+		//find last position all template seqs that is a non gap character
+		for (int i = 0; i < templateSeqs.size(); i++) {
+			
+			string aligned = templateSeqs[i]->getAligned();
+			int pos = aligned.length();
+			
+			//find first spot in this seq
+			for (int j = aligned.length()-1; j >= 0; j--) {
+				if (isalpha(aligned[j])) {
+					pos = j;
+					break;
+				}
+			}
+			
+			//save this spot if it is the farthest
+			if (pos < rearPos) { rearPos = pos; }
+		}
+
+		
+		//check to make sure that is not whole seq
+		if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
+		
+		//***********trim all seqs to that position***************//
+		for (int i = 0; i < querySeqs.size(); i++) {
+			
+			string aligned = querySeqs[i]->getAligned();
+			
+			//between the two points
+			aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
+			
+			querySeqs[i]->setAligned(aligned);
+		}
+		
+		for (int i = 0; i < templateSeqs.size(); i++) {
+			
+			string aligned = templateSeqs[i]->getAligned();
+			
+			//between the two points
+			aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
+			
+			templateSeqs[i]->setAligned(aligned);
+		}
+	
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "trimSequences");
+		exit(1);
+	}
+
+}
+/***************************************************************************************************************/
 vector<int> Ccode::findWindows() {
 	try {
 		
 		vector<int> win; 
+		int length = querySeqs[0]->getAligned().length();
 		
-		if (increment > querySeqs[0]->getAligned().length()) {  mothurOut("You have selected an increment larger than the length of your sequences.  I will use the default of 25.");  increment = 25; }
+		//default is wanted = 10% of total length
+		if (window > length) { 
+			mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
+			window = length / 10;
+		}else if (window == 0) { window = length / 10;  }
+		else if (window > (length / 20)) {
+			mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
+		}else if (window < (length / 5)) {
+			mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
+		}
 		
-		for (int m = increment;  m < (querySeqs[0]->getAligned().length() - increment); m+=increment) {  win.push_back(m);  }
+		//save starting points of each window
+		for (int m = 0;  m < (length-window); m+=window) {  win.push_back(m);  }
 
 		return win;
 	
@@ -133,7 +298,102 @@ vector<int> Ccode::findWindows() {
 		exit(1);
 	}
 }
-*/
+//***************************************************************************************************************
+int Ccode::getDiff(string seqA, string seqB) {
+	try {
+		
+		int numDiff = 0;
+		
+		for (int i = 0; i < seqA.length(); i++) {
+			//if you are both not gaps
+			//if (isalpha(seqA[i]) && isalpha(seqA[i])) {
+				//are you different
+				if (seqA[i] != seqB[i]) { numDiff++; }
+			//}
+		}
+		
+		return numDiff;
+	
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "getDiff");
+		exit(1);
+	}
+}
+//***************************************************************************************************************
+//tried to make this look most like ccode original implementation
+void Ccode::removeBadReferenceSeqs(vector<Sequence*>& seqs, int query) {
+	try {
+		
+		vector< vector<int> > numDiffBases;
+		numDiffBases.resize(seqs.size());
+		//initialize to 0
+		for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
+		
+		int length = seqs[0]->getAligned().length();
+		
+		//calc differences from each sequence to everyother seq in the set
+		for (int i = 0; i < seqs.size(); i++) {
+			
+			string seqA = seqs[i]->getAligned();
+			
+			//so you don't calc i to j and j to i since they are the same
+			for (int j = 0; j < i; j++) {
+				
+				string seqB = seqs[j]->getAligned();
+				
+				//compare strings
+				int numDiff = getDiff(seqA, seqB);
+				
+				numDiffBases[i][j] = numDiff;
+				numDiffBases[j][i] = numDiff;
+			}
+		}
+		
+		//initailize remove to 0
+		vector<int> remove;  remove.resize(seqs.size(), 0);
+		
+		//check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
+		for (int i = 0; i < numDiffBases.size(); i++) {
+			for (int j = 0; j < numDiffBases[i].size(); j++) {
+				
+				//are you more than 20% different
+				if (numDiffBases[i][j] > ((20*length) / 100))		{  remove[j] = 1;  }
+				//are you less than 0.5% different
+				if (numDiffBases[i][j] < ((0.5*length) / 100))	{  remove[j] = 1;  }
+			}
+		}
+		
+		int numSeqsLeft = 0;
+		
+		//count seqs that are not going to be removed
+		for (int i = 0; i < remove.size(); i++) {  
+			if (remove[i] == 0)  { numSeqsLeft++;  }
+		}
+		
+		//if you have enough then remove bad ones
+		if (numSeqsLeft >= 3) {
+			vector<Sequence*> goodSeqs;
+			//remove bad seqs
+			for (int i = 0; i < remove.size(); i++) {
+				if (remove[i] == 0) { 
+					goodSeqs.push_back(seqs[i]);
+				}
+			}
+			
+			seqs = goodSeqs;
+			
+		}else { //warn, but dont remove any
+			mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); mothurOutEndLine();  
+		}
+			
+
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "removeBadReferenceSeqs");
+		exit(1);
+	}
+}
 //***************************************************************************************************************
 vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted) {
 	try{
@@ -184,6 +444,26 @@ vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted
 	}
 }
 /**************************************************************************************************/
+vector<float> Ccode::getAverageRef(vector<Sequence*> ref) {
+	try {
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "getAverageRef");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+vector<float> Ccode::getAverageQuery (vector<Sequence*> ref, int query) {
+	try {
+	
+	
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "getAverageQuery");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
 void Ccode::createProcessesClosest() {
 	try {
 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
@@ -210,7 +490,6 @@ void Ccode::createProcessesClosest() {
 				
 				//output pairs
 				for (int i = lines[process]->start; i < lines[process]->end; i++) {
-					 out << closest[i].size() << endl;
 					 for (int j = 0; j < closest[i].size(); j++) {
 						closest[i][j]->printSequence(out);
 					 }
@@ -235,13 +514,10 @@ void Ccode::createProcessesClosest() {
 			
 			//get pairs
 			for (int k = lines[i]->start; k < lines[i]->end; k++) {
-				int size;
-				in >> size;
-				gobble(in);
 				
 				vector<Sequence*> tempVector;
 				
-				for (int j = 0; j < size; j++) {
+				for (int j = 0; j < numWanted; j++) {
 				
 					Sequence* temp = new Sequence(in);
 					gobble(in);
diff --git a/ccode.h b/ccode.h
index a4860dc..cced0b7 100644
--- a/ccode.h
+++ b/ccode.h
@@ -46,10 +46,20 @@ class Ccode : public Chimera {
 		vector<Sequence*> querySeqs;
 		vector<Sequence*> templateSeqs;
 		
-		vector< vector<Sequence*> > closest;  //bestfit[0] is a vector of sequence at are closest to queryseqs[0]...
+		vector<int> windows;
+		vector< vector<Sequence*> > closest;  //closest[0] is a vector of sequence at are closest to queryseqs[0]...
+		vector< vector<float> > averageRef;  //averageRef[0] is the average distance at each window for the references for querySeqs[0]
+		vector< vector<float> > averageQuery;  //averageQuery[0] is the average distance at each winow for the query for querySeqs[0]
 		
 		vector< vector<Sequence*> > findClosest(int, int, int); 
-		void removeSeqs(vector<Sequence*>);  //removes sequences from closest that are to different of too similar to eachother. 
+		void removeBadReferenceSeqs(vector<Sequence*>&, int);  //removes sequences from closest that are to different of too similar to eachother. 
+		void trimSequences();
+		vector<int> findWindows();
+		vector<float> getAverageRef(vector<Sequence*>);
+		vector<float> getAverageQuery (vector<Sequence*>, int);
+		
+		
+		int getDiff(string, string);  //return number of mismatched bases, a gap to base is not counted as a mismatch
 		
 		void createProcessesClosest();
 		
diff --git a/chimera.cpp b/chimera.cpp
index 8aaab4b..75e81c0 100644
--- a/chimera.cpp
+++ b/chimera.cpp
@@ -41,13 +41,16 @@ void Chimera::createFilter(vector<Sequence*> seqs) {
 		}
 		
 		//zero out spot where all sequences have blanks
+		int numColRemoved = 0;
 		for(int i = 0;i < seqs[0]->getAligned().length(); i++){
-			if(gaps[i] == seqs.size())	{	filterString[i] = '0'; 	}
+			if(gaps[i] == seqs.size())	{	filterString[i] = '0'; 	numColRemoved++;  }
 			
-			else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {	filterString[i] = '0';	}
+			else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {	filterString[i] = '0';	numColRemoved++;  }
 			//cout << "a = " << a[i] <<  " t = " << t[i] <<  " g = " << g[i] <<  " c = " << c[i] << endl;
 		}
 //cout << "filter = " << filterString << endl;	
+
+		mothurOut("Filter removed " + toString(numColRemoved) + " columns.");  mothurOutEndLine();
 	}
 	catch(exception& e) {
 		errorOut(e, "Chimera", "createFilter");
diff --git a/sharedlistvector.cpp b/sharedlistvector.cpp
index ba2ac1f..a113ebe 100644
--- a/sharedlistvector.cpp
+++ b/sharedlistvector.cpp
@@ -200,10 +200,14 @@ SharedOrderVector* SharedListVector::getSharedOrderVector(){
 				name = names.substr(0,names.find_first_of(','));
 				names = names.substr(names.find_first_of(',')+1, names.length());
 				groupName = groupmap->getGroup(name);
+				
+				if(groupName == "not found") {	mothurOut("Error: Sequence '" + name + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
+				
 				order->push_back(i, binSize, groupName);  //i represents what bin you are in
 			}
 			//get last name
 			groupName = groupmap->getGroup(names);
+			if(groupName == "not found") {	mothurOut("Error: Sequence '" + names + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
 			order->push_back(i, binSize, groupName);
 		}
 
@@ -229,6 +233,7 @@ SharedRAbundVector SharedListVector::getSharedRAbundVector(string groupName) {
 				name = names.substr(0,names.find_first_of(','));
 				names = names.substr(names.find_first_of(',')+1, names.length());
 				group = groupmap->getGroup(name);
+				if(group == "not found") {	mothurOut("Error: Sequence '" + name + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
 				if (group == groupName) { //this name is in the group you want the vector for.
 					rav.set(i, rav.getAbundance(i) + 1, group);  //i represents what bin you are in
 				}
@@ -236,6 +241,7 @@ SharedRAbundVector SharedListVector::getSharedRAbundVector(string groupName) {
 			
 			//get last name
 			groupName = groupmap->getGroup(names);
+			if(groupName == "not found") {	mothurOut("Error: Sequence '" + names + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
 			if (group == groupName) { //this name is in the group you want the vector for.
 					rav.set(i, rav.getAbundance(i) + 1, group);  //i represents what bin you are in
 			}
@@ -281,11 +287,13 @@ vector<SharedRAbundVector*> SharedListVector::getSharedRAbundVector() {
 				name = names.substr(0,names.find_first_of(','));
 				names = names.substr(names.find_first_of(',')+1, names.length());
 				group = groupmap->getGroup(name);
+				if(group == "not found") {	mothurOut("Error: Sequence '" + name + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
 				finder[group]->set(i, finder[group]->getAbundance(i) + 1, group);  //i represents what bin you are in
 			}
 			
 			//get last name
 			group = groupmap->getGroup(names);
+			if(group == "not found") {	mothurOut("Error: Sequence '" + names + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
 			finder[group]->set(i, finder[group]->getAbundance(i) + 1, group);  //i represents what bin you are in
 			
 		}