From: westcott Date: Tue, 1 Sep 2009 12:17:14 +0000 (+0000) Subject: added more descriptive error messages to sharedlist X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=6d7408400b6bbdde4173922c5dca528f9f4e0a22 added more descriptive error messages to sharedlist --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index c6d6c0e..a0ba7c2 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -649,12 +649,12 @@ children = ( 374F2FD51006320200E97C66 /* chimera.h */, 372095C1103196D70004D347 /* chimera.cpp */, + A75B887A104C16860083C454 /* alignedsimilarity.h */, + A75B8879104C16860083C454 /* alignedsimilarity.cpp */, 374F2FE9100634B600E97C66 /* bellerophon.h */, 374F2FEA100634B600E97C66 /* bellerophon.cpp */, - A75B8879104C16860083C454 /* alignedsimilarity.cpp */, - A75B887A104C16860083C454 /* alignedsimilarity.h */, - A75B887B104C16860083C454 /* ccode.cpp */, A75B887C104C16860083C454 /* ccode.h */, + A75B887B104C16860083C454 /* ccode.cpp */, 372A55531017922B00C5194B /* decalc.h */, 372A55541017922B00C5194B /* decalc.cpp */, 374F301110063B6F00E97C66 /* pintail.h */, diff --git a/ccode.cpp b/ccode.cpp index a3ef5e6..4ec3a12 100644 --- a/ccode.cpp +++ b/ccode.cpp @@ -9,6 +9,7 @@ #include "ccode.h" #include "ignoregaps.h" +#include "eachgapdist.h" //*************************************************************************************************************** @@ -86,7 +87,8 @@ void Ccode::getChimeras() { templateLines.push_back(new linePair(0, templateSeqs.size())); #endif - distCalc = new ignoreGaps(); + distCalc = new eachGapDist(); + decalc = new DeCalculator(); //find closest if (processors == 1) { @@ -96,15 +98,52 @@ void Ccode::getChimeras() { }else { createProcessesClosest(); } - for (int i = 0; i < closest.size(); i++) { - cout << querySeqs[i]->getName() << ": "; - for (int j = 0; j < closest[i].size(); j++) { +for (int i = 0; i < closest.size(); i++) { + cout << querySeqs[i]->getName() << ": "; + for (int j = 0; j < closest[i].size(); j++) { - cout << closest[i][j]->getName() << '\t'; + cout << closest[i][j]->getName() << '\t'; + } + cout << endl; +} + + //mask sequences if the user wants to + if (seqMask != "") { + //mask querys + for (int i = 0; i < querySeqs.size(); i++) { + decalc->runMask(querySeqs[i]); + } + + //mask templates + for (int i = 0; i < templateSeqs.size(); i++) { + decalc->runMask(templateSeqs[i]); } - cout << endl; } + + if (filter) { + vector temp = templateSeqs; + for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); } + + createFilter(temp); + runFilter(querySeqs); + runFilter(templateSeqs); + } + + //trim sequences - this follows ccodes remove_extra_gaps + //just need to pass it query and template since closest points to template + trimSequences(); + + //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length(). + //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations + windows = findWindows(); + + //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later + for (int i = 0; i < closest.size(); i++) { + removeBadReferenceSeqs(closest[i], i); + } + + //free memory for (int i = 0; i < lines.size(); i++) { delete lines[i]; } for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } @@ -115,15 +154,141 @@ void Ccode::getChimeras() { exit(1); } } -/*************************************************************************************************************** +/***************************************************************************************************************/ +//ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences." +void Ccode::trimSequences() { + try { + + int frontPos = 0; //should contain first position in all seqs that is not a gap character + int rearPos = querySeqs[0]->getAligned().length(); + + //********find first position in all seqs that is a non gap character***********// + //find first position all query seqs that is a non gap character + for (int i = 0; i < querySeqs.size(); i++) { + + string aligned = querySeqs[i]->getAligned(); + int pos = 0; + + //find first spot in this seq + for (int j = 0; j < aligned.length(); j++) { + if (isalpha(aligned[j])) { + pos = j; + break; + } + } + + //save this spot if it is the farthest + if (pos > frontPos) { frontPos = pos; } + } + + //find first position all template seqs that is a non gap character + for (int i = 0; i < templateSeqs.size(); i++) { + + string aligned = templateSeqs[i]->getAligned(); + int pos = 0; + + //find first spot in this seq + for (int j = 0; j < aligned.length(); j++) { + if (isalpha(aligned[j])) { + pos = j; + break; + } + } + + //save this spot if it is the farthest + if (pos > frontPos) { frontPos = pos; } + } + + + //********find last position in all seqs that is a non gap character***********// + //find last position all query seqs that is a non gap character + for (int i = 0; i < querySeqs.size(); i++) { + + string aligned = querySeqs[i]->getAligned(); + int pos = aligned.length(); + + //find first spot in this seq + for (int j = aligned.length()-1; j >= 0; j--) { + if (isalpha(aligned[j])) { + pos = j; + break; + } + } + + //save this spot if it is the farthest + if (pos < rearPos) { rearPos = pos; } + } + + //find last position all template seqs that is a non gap character + for (int i = 0; i < templateSeqs.size(); i++) { + + string aligned = templateSeqs[i]->getAligned(); + int pos = aligned.length(); + + //find first spot in this seq + for (int j = aligned.length()-1; j >= 0; j--) { + if (isalpha(aligned[j])) { + pos = j; + break; + } + } + + //save this spot if it is the farthest + if (pos < rearPos) { rearPos = pos; } + } + + + //check to make sure that is not whole seq + if ((rearPos - frontPos - 1) <= 0) { mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1); } + + //***********trim all seqs to that position***************// + for (int i = 0; i < querySeqs.size(); i++) { + + string aligned = querySeqs[i]->getAligned(); + + //between the two points + aligned = aligned.substr(frontPos, (rearPos-frontPos-1)); + + querySeqs[i]->setAligned(aligned); + } + + for (int i = 0; i < templateSeqs.size(); i++) { + + string aligned = templateSeqs[i]->getAligned(); + + //between the two points + aligned = aligned.substr(frontPos, (rearPos-frontPos-1)); + + templateSeqs[i]->setAligned(aligned); + } + + } + catch(exception& e) { + errorOut(e, "Ccode", "trimSequences"); + exit(1); + } + +} +/***************************************************************************************************************/ vector Ccode::findWindows() { try { vector win; + int length = querySeqs[0]->getAligned().length(); - if (increment > querySeqs[0]->getAligned().length()) { mothurOut("You have selected an increment larger than the length of your sequences. I will use the default of 25."); increment = 25; } + //default is wanted = 10% of total length + if (window > length) { + mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length."); + window = length / 10; + }else if (window == 0) { window = length / 10; } + else if (window > (length / 20)) { + mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine(); + }else if (window < (length / 5)) { + mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine(); + } - for (int m = increment; m < (querySeqs[0]->getAligned().length() - increment); m+=increment) { win.push_back(m); } + //save starting points of each window + for (int m = 0; m < (length-window); m+=window) { win.push_back(m); } return win; @@ -133,7 +298,102 @@ vector Ccode::findWindows() { exit(1); } } -*/ +//*************************************************************************************************************** +int Ccode::getDiff(string seqA, string seqB) { + try { + + int numDiff = 0; + + for (int i = 0; i < seqA.length(); i++) { + //if you are both not gaps + //if (isalpha(seqA[i]) && isalpha(seqA[i])) { + //are you different + if (seqA[i] != seqB[i]) { numDiff++; } + //} + } + + return numDiff; + + } + catch(exception& e) { + errorOut(e, "Ccode", "getDiff"); + exit(1); + } +} +//*************************************************************************************************************** +//tried to make this look most like ccode original implementation +void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { + try { + + vector< vector > numDiffBases; + numDiffBases.resize(seqs.size()); + //initialize to 0 + for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); } + + int length = seqs[0]->getAligned().length(); + + //calc differences from each sequence to everyother seq in the set + for (int i = 0; i < seqs.size(); i++) { + + string seqA = seqs[i]->getAligned(); + + //so you don't calc i to j and j to i since they are the same + for (int j = 0; j < i; j++) { + + string seqB = seqs[j]->getAligned(); + + //compare strings + int numDiff = getDiff(seqA, seqB); + + numDiffBases[i][j] = numDiff; + numDiffBases[j][i] = numDiff; + } + } + + //initailize remove to 0 + vector remove; remove.resize(seqs.size(), 0); + + //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set + for (int i = 0; i < numDiffBases.size(); i++) { + for (int j = 0; j < numDiffBases[i].size(); j++) { + + //are you more than 20% different + if (numDiffBases[i][j] > ((20*length) / 100)) { remove[j] = 1; } + //are you less than 0.5% different + if (numDiffBases[i][j] < ((0.5*length) / 100)) { remove[j] = 1; } + } + } + + int numSeqsLeft = 0; + + //count seqs that are not going to be removed + for (int i = 0; i < remove.size(); i++) { + if (remove[i] == 0) { numSeqsLeft++; } + } + + //if you have enough then remove bad ones + if (numSeqsLeft >= 3) { + vector goodSeqs; + //remove bad seqs + for (int i = 0; i < remove.size(); i++) { + if (remove[i] == 0) { + goodSeqs.push_back(seqs[i]); + } + } + + seqs = goodSeqs; + + }else { //warn, but dont remove any + mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); mothurOutEndLine(); + } + + + } + catch(exception& e) { + errorOut(e, "Ccode", "removeBadReferenceSeqs"); + exit(1); + } +} //*************************************************************************************************************** vector< vector > Ccode::findClosest(int start, int end, int numWanted) { try{ @@ -184,6 +444,26 @@ vector< vector > Ccode::findClosest(int start, int end, int numWanted } } /**************************************************************************************************/ +vector Ccode::getAverageRef(vector ref) { + try { + } + catch(exception& e) { + errorOut(e, "Ccode", "getAverageRef"); + exit(1); + } +} +/**************************************************************************************************/ +vector Ccode::getAverageQuery (vector ref, int query) { + try { + + + } + catch(exception& e) { + errorOut(e, "Ccode", "getAverageQuery"); + exit(1); + } +} +/**************************************************************************************************/ void Ccode::createProcessesClosest() { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) @@ -210,7 +490,6 @@ void Ccode::createProcessesClosest() { //output pairs for (int i = lines[process]->start; i < lines[process]->end; i++) { - out << closest[i].size() << endl; for (int j = 0; j < closest[i].size(); j++) { closest[i][j]->printSequence(out); } @@ -235,13 +514,10 @@ void Ccode::createProcessesClosest() { //get pairs for (int k = lines[i]->start; k < lines[i]->end; k++) { - int size; - in >> size; - gobble(in); vector tempVector; - for (int j = 0; j < size; j++) { + for (int j = 0; j < numWanted; j++) { Sequence* temp = new Sequence(in); gobble(in); diff --git a/ccode.h b/ccode.h index a4860dc..cced0b7 100644 --- a/ccode.h +++ b/ccode.h @@ -46,10 +46,20 @@ class Ccode : public Chimera { vector querySeqs; vector templateSeqs; - vector< vector > closest; //bestfit[0] is a vector of sequence at are closest to queryseqs[0]... + vector windows; + vector< vector > closest; //closest[0] is a vector of sequence at are closest to queryseqs[0]... + vector< vector > averageRef; //averageRef[0] is the average distance at each window for the references for querySeqs[0] + vector< vector > averageQuery; //averageQuery[0] is the average distance at each winow for the query for querySeqs[0] vector< vector > findClosest(int, int, int); - void removeSeqs(vector); //removes sequences from closest that are to different of too similar to eachother. + void removeBadReferenceSeqs(vector&, int); //removes sequences from closest that are to different of too similar to eachother. + void trimSequences(); + vector findWindows(); + vector getAverageRef(vector); + vector getAverageQuery (vector, int); + + + int getDiff(string, string); //return number of mismatched bases, a gap to base is not counted as a mismatch void createProcessesClosest(); diff --git a/chimera.cpp b/chimera.cpp index 8aaab4b..75e81c0 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -41,13 +41,16 @@ void Chimera::createFilter(vector seqs) { } //zero out spot where all sequences have blanks + int numColRemoved = 0; for(int i = 0;i < seqs[0]->getAligned().length(); i++){ - if(gaps[i] == seqs.size()) { filterString[i] = '0'; } + if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; } - else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) { filterString[i] = '0'; } + else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) { filterString[i] = '0'; numColRemoved++; } //cout << "a = " << a[i] << " t = " << t[i] << " g = " << g[i] << " c = " << c[i] << endl; } //cout << "filter = " << filterString << endl; + + mothurOut("Filter removed " + toString(numColRemoved) + " columns."); mothurOutEndLine(); } catch(exception& e) { errorOut(e, "Chimera", "createFilter"); diff --git a/sharedlistvector.cpp b/sharedlistvector.cpp index ba2ac1f..a113ebe 100644 --- a/sharedlistvector.cpp +++ b/sharedlistvector.cpp @@ -200,10 +200,14 @@ SharedOrderVector* SharedListVector::getSharedOrderVector(){ name = names.substr(0,names.find_first_of(',')); names = names.substr(names.find_first_of(',')+1, names.length()); groupName = groupmap->getGroup(name); + + if(groupName == "not found") { mothurOut("Error: Sequence '" + name + "' was not found in the group file, please correct."); mothurOutEndLine(); exit(1); } + order->push_back(i, binSize, groupName); //i represents what bin you are in } //get last name groupName = groupmap->getGroup(names); + if(groupName == "not found") { mothurOut("Error: Sequence '" + names + "' was not found in the group file, please correct."); mothurOutEndLine(); exit(1); } order->push_back(i, binSize, groupName); } @@ -229,6 +233,7 @@ SharedRAbundVector SharedListVector::getSharedRAbundVector(string groupName) { name = names.substr(0,names.find_first_of(',')); names = names.substr(names.find_first_of(',')+1, names.length()); group = groupmap->getGroup(name); + if(group == "not found") { mothurOut("Error: Sequence '" + name + "' was not found in the group file, please correct."); mothurOutEndLine(); exit(1); } if (group == groupName) { //this name is in the group you want the vector for. rav.set(i, rav.getAbundance(i) + 1, group); //i represents what bin you are in } @@ -236,6 +241,7 @@ SharedRAbundVector SharedListVector::getSharedRAbundVector(string groupName) { //get last name groupName = groupmap->getGroup(names); + if(groupName == "not found") { mothurOut("Error: Sequence '" + names + "' was not found in the group file, please correct."); mothurOutEndLine(); exit(1); } if (group == groupName) { //this name is in the group you want the vector for. rav.set(i, rav.getAbundance(i) + 1, group); //i represents what bin you are in } @@ -281,11 +287,13 @@ vector SharedListVector::getSharedRAbundVector() { name = names.substr(0,names.find_first_of(',')); names = names.substr(names.find_first_of(',')+1, names.length()); group = groupmap->getGroup(name); + if(group == "not found") { mothurOut("Error: Sequence '" + name + "' was not found in the group file, please correct."); mothurOutEndLine(); exit(1); } finder[group]->set(i, finder[group]->getAbundance(i) + 1, group); //i represents what bin you are in } //get last name group = groupmap->getGroup(names); + if(group == "not found") { mothurOut("Error: Sequence '" + names + "' was not found in the group file, please correct."); mothurOutEndLine(); exit(1); } finder[group]->set(i, finder[group]->getAbundance(i) + 1, group); //i represents what bin you are in }