]> git.donarmstrong.com Git - mothur.git/commitdiff
added more descriptive error messages to sharedlist
authorwestcott <westcott>
Tue, 1 Sep 2009 12:17:14 +0000 (12:17 +0000)
committerwestcott <westcott>
Tue, 1 Sep 2009 12:17:14 +0000 (12:17 +0000)
Mothur.xcodeproj/project.pbxproj
ccode.cpp
ccode.h
chimera.cpp
sharedlistvector.cpp

index c6d6c0ed29fa33724df4aae7934582ad63b14ddd..a0ba7c2d551cd765ce82ab5ea53362a3191434a6 100644 (file)
                        children = (
                                374F2FD51006320200E97C66 /* chimera.h */,
                                372095C1103196D70004D347 /* chimera.cpp */,
+                               A75B887A104C16860083C454 /* alignedsimilarity.h */,
+                               A75B8879104C16860083C454 /* alignedsimilarity.cpp */,
                                374F2FE9100634B600E97C66 /* bellerophon.h */,
                                374F2FEA100634B600E97C66 /* bellerophon.cpp */,
-                               A75B8879104C16860083C454 /* alignedsimilarity.cpp */,
-                               A75B887A104C16860083C454 /* alignedsimilarity.h */,
-                               A75B887B104C16860083C454 /* ccode.cpp */,
                                A75B887C104C16860083C454 /* ccode.h */,
+                               A75B887B104C16860083C454 /* ccode.cpp */,
                                372A55531017922B00C5194B /* decalc.h */,
                                372A55541017922B00C5194B /* decalc.cpp */,
                                374F301110063B6F00E97C66 /* pintail.h */,
index a3ef5e6bcb7ac8fed3fa62c49929c94b5fcdaa51..4ec3a12655d8a32fed228b5c9dc28717b194b6dc 100644 (file)
--- a/ccode.cpp
+++ b/ccode.cpp
@@ -9,6 +9,7 @@
 
 #include "ccode.h"
 #include "ignoregaps.h"
+#include "eachgapdist.h"
 
 
 //***************************************************************************************************************
@@ -86,7 +87,8 @@ void Ccode::getChimeras() {
                        templateLines.push_back(new linePair(0, templateSeqs.size()));
                #endif
        
-               distCalc = new ignoreGaps();
+               distCalc = new eachGapDist();
+               decalc = new DeCalculator();
                
                //find closest
                if (processors == 1) { 
@@ -96,15 +98,52 @@ void Ccode::getChimeras() {
                }else {         createProcessesClosest();               }
 
                
-               for (int i = 0; i < closest.size(); i++) {
-                       cout << querySeqs[i]->getName() << ": ";
-                       for (int j = 0; j < closest[i].size(); j++) {
+for (int i = 0; i < closest.size(); i++) {
+       cout << querySeqs[i]->getName() << ": ";
+       for (int j = 0; j < closest[i].size(); j++) {
                        
-                               cout << closest[i][j]->getName() << '\t';
+               cout << closest[i][j]->getName() << '\t';
+       }
+       cout << endl;
+}      
+
+               //mask sequences if the user wants to 
+               if (seqMask != "") {
+                       //mask querys
+                       for (int i = 0; i < querySeqs.size(); i++) {
+                               decalc->runMask(querySeqs[i]);
+                       }
+               
+                       //mask templates
+                       for (int i = 0; i < templateSeqs.size(); i++) {
+                               decalc->runMask(templateSeqs[i]);
                        }
-                       cout << endl;
                }
+               
+               if (filter) {
+                       vector<Sequence*> temp = templateSeqs;
+                       for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
+                       
+                       createFilter(temp);
                        
+                       runFilter(querySeqs);
+                       runFilter(templateSeqs);
+               }
+
+               //trim sequences - this follows ccodes remove_extra_gaps 
+               //just need to pass it query and template since closest points to template
+               trimSequences();
+               
+               //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
+               //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
+               windows = findWindows();  
+               
+               //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
+               for (int i = 0; i < closest.size(); i++) {
+                       removeBadReferenceSeqs(closest[i], i);
+               }
+                       
+                                       
                //free memory
                for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
                for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
@@ -115,15 +154,141 @@ void Ccode::getChimeras() {
                exit(1);
        }
 }
-/***************************************************************************************************************
+/***************************************************************************************************************/
+//ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
+void Ccode::trimSequences() {
+       try {
+               
+               int frontPos = 0;  //should contain first position in all seqs that is not a gap character
+               int rearPos = querySeqs[0]->getAligned().length();
+               
+               //********find first position in all seqs that is a non gap character***********//
+               //find first position all query seqs that is a non gap character
+               for (int i = 0; i < querySeqs.size(); i++) {
+                       
+                       string aligned = querySeqs[i]->getAligned();
+                       int pos = 0;
+                       
+                       //find first spot in this seq
+                       for (int j = 0; j < aligned.length(); j++) {
+                               if (isalpha(aligned[j])) {
+                                       pos = j;
+                                       break;
+                               }
+                       }
+                       
+                       //save this spot if it is the farthest
+                       if (pos > frontPos) { frontPos = pos; }
+               }
+               
+               //find first position all template seqs that is a non gap character
+               for (int i = 0; i < templateSeqs.size(); i++) {
+                       
+                       string aligned = templateSeqs[i]->getAligned();
+                       int pos = 0;
+                       
+                       //find first spot in this seq
+                       for (int j = 0; j < aligned.length(); j++) {
+                               if (isalpha(aligned[j])) {
+                                       pos = j;
+                                       break;
+                               }
+                       }
+                       
+                       //save this spot if it is the farthest
+                       if (pos > frontPos) { frontPos = pos; }
+               }
+
+               
+               //********find last position in all seqs that is a non gap character***********//
+               //find last position all query seqs that is a non gap character
+               for (int i = 0; i < querySeqs.size(); i++) {
+                       
+                       string aligned = querySeqs[i]->getAligned();
+                       int pos = aligned.length();
+                       
+                       //find first spot in this seq
+                       for (int j = aligned.length()-1; j >= 0; j--) {
+                               if (isalpha(aligned[j])) {
+                                       pos = j;
+                                       break;
+                               }
+                       }
+                       
+                       //save this spot if it is the farthest
+                       if (pos < rearPos) { rearPos = pos; }
+               }
+               
+               //find last position all template seqs that is a non gap character
+               for (int i = 0; i < templateSeqs.size(); i++) {
+                       
+                       string aligned = templateSeqs[i]->getAligned();
+                       int pos = aligned.length();
+                       
+                       //find first spot in this seq
+                       for (int j = aligned.length()-1; j >= 0; j--) {
+                               if (isalpha(aligned[j])) {
+                                       pos = j;
+                                       break;
+                               }
+                       }
+                       
+                       //save this spot if it is the farthest
+                       if (pos < rearPos) { rearPos = pos; }
+               }
+
+               
+               //check to make sure that is not whole seq
+               if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
+               
+               //***********trim all seqs to that position***************//
+               for (int i = 0; i < querySeqs.size(); i++) {
+                       
+                       string aligned = querySeqs[i]->getAligned();
+                       
+                       //between the two points
+                       aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
+                       
+                       querySeqs[i]->setAligned(aligned);
+               }
+               
+               for (int i = 0; i < templateSeqs.size(); i++) {
+                       
+                       string aligned = templateSeqs[i]->getAligned();
+                       
+                       //between the two points
+                       aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
+                       
+                       templateSeqs[i]->setAligned(aligned);
+               }
+       
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "trimSequences");
+               exit(1);
+       }
+
+}
+/***************************************************************************************************************/
 vector<int> Ccode::findWindows() {
        try {
                
                vector<int> win; 
+               int length = querySeqs[0]->getAligned().length();
                
-               if (increment > querySeqs[0]->getAligned().length()) {  mothurOut("You have selected an increment larger than the length of your sequences.  I will use the default of 25.");  increment = 25; }
+               //default is wanted = 10% of total length
+               if (window > length) { 
+                       mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
+                       window = length / 10;
+               }else if (window == 0) { window = length / 10;  }
+               else if (window > (length / 20)) {
+                       mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
+               }else if (window < (length / 5)) {
+                       mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
+               }
                
-               for (int m = increment;  m < (querySeqs[0]->getAligned().length() - increment); m+=increment) {  win.push_back(m);  }
+               //save starting points of each window
+               for (int m = 0;  m < (length-window); m+=window) {  win.push_back(m);  }
 
                return win;
        
@@ -133,7 +298,102 @@ vector<int> Ccode::findWindows() {
                exit(1);
        }
 }
-*/
+//***************************************************************************************************************
+int Ccode::getDiff(string seqA, string seqB) {
+       try {
+               
+               int numDiff = 0;
+               
+               for (int i = 0; i < seqA.length(); i++) {
+                       //if you are both not gaps
+                       //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
+                               //are you different
+                               if (seqA[i] != seqB[i]) { numDiff++; }
+                       //}
+               }
+               
+               return numDiff;
+       
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "getDiff");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+//tried to make this look most like ccode original implementation
+void Ccode::removeBadReferenceSeqs(vector<Sequence*>& seqs, int query) {
+       try {
+               
+               vector< vector<int> > numDiffBases;
+               numDiffBases.resize(seqs.size());
+               //initialize to 0
+               for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
+               
+               int length = seqs[0]->getAligned().length();
+               
+               //calc differences from each sequence to everyother seq in the set
+               for (int i = 0; i < seqs.size(); i++) {
+                       
+                       string seqA = seqs[i]->getAligned();
+                       
+                       //so you don't calc i to j and j to i since they are the same
+                       for (int j = 0; j < i; j++) {
+                               
+                               string seqB = seqs[j]->getAligned();
+                               
+                               //compare strings
+                               int numDiff = getDiff(seqA, seqB);
+                               
+                               numDiffBases[i][j] = numDiff;
+                               numDiffBases[j][i] = numDiff;
+                       }
+               }
+               
+               //initailize remove to 0
+               vector<int> remove;  remove.resize(seqs.size(), 0);
+               
+               //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
+               for (int i = 0; i < numDiffBases.size(); i++) {
+                       for (int j = 0; j < numDiffBases[i].size(); j++) {
+                               
+                               //are you more than 20% different
+                               if (numDiffBases[i][j] > ((20*length) / 100))           {  remove[j] = 1;  }
+                               //are you less than 0.5% different
+                               if (numDiffBases[i][j] < ((0.5*length) / 100))  {  remove[j] = 1;  }
+                       }
+               }
+               
+               int numSeqsLeft = 0;
+               
+               //count seqs that are not going to be removed
+               for (int i = 0; i < remove.size(); i++) {  
+                       if (remove[i] == 0)  { numSeqsLeft++;  }
+               }
+               
+               //if you have enough then remove bad ones
+               if (numSeqsLeft >= 3) {
+                       vector<Sequence*> goodSeqs;
+                       //remove bad seqs
+                       for (int i = 0; i < remove.size(); i++) {
+                               if (remove[i] == 0) { 
+                                       goodSeqs.push_back(seqs[i]);
+                               }
+                       }
+                       
+                       seqs = goodSeqs;
+                       
+               }else { //warn, but dont remove any
+                       mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); mothurOutEndLine();  
+               }
+                       
+
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "removeBadReferenceSeqs");
+               exit(1);
+       }
+}
 //***************************************************************************************************************
 vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted) {
        try{
@@ -184,6 +444,26 @@ vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted
        }
 }
 /**************************************************************************************************/
+vector<float> Ccode::getAverageRef(vector<Sequence*> ref) {
+       try {
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "getAverageRef");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+vector<float> Ccode::getAverageQuery (vector<Sequence*> ref, int query) {
+       try {
+       
+       
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "getAverageQuery");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
 void Ccode::createProcessesClosest() {
        try {
 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
@@ -210,7 +490,6 @@ void Ccode::createProcessesClosest() {
                                
                                //output pairs
                                for (int i = lines[process]->start; i < lines[process]->end; i++) {
-                                        out << closest[i].size() << endl;
                                         for (int j = 0; j < closest[i].size(); j++) {
                                                closest[i][j]->printSequence(out);
                                         }
@@ -235,13 +514,10 @@ void Ccode::createProcessesClosest() {
                        
                        //get pairs
                        for (int k = lines[i]->start; k < lines[i]->end; k++) {
-                               int size;
-                               in >> size;
-                               gobble(in);
                                
                                vector<Sequence*> tempVector;
                                
-                               for (int j = 0; j < size; j++) {
+                               for (int j = 0; j < numWanted; j++) {
                                
                                        Sequence* temp = new Sequence(in);
                                        gobble(in);
diff --git a/ccode.h b/ccode.h
index a4860dcb83ee16ddaca43afd6cc4adb4a1973adc..cced0b7d404b4145a437b677f37fa869bb8abb3f 100644 (file)
--- a/ccode.h
+++ b/ccode.h
@@ -46,10 +46,20 @@ class Ccode : public Chimera {
                vector<Sequence*> querySeqs;
                vector<Sequence*> templateSeqs;
                
-               vector< vector<Sequence*> > closest;  //bestfit[0] is a vector of sequence at are closest to queryseqs[0]...
+               vector<int> windows;
+               vector< vector<Sequence*> > closest;  //closest[0] is a vector of sequence at are closest to queryseqs[0]...
+               vector< vector<float> > averageRef;  //averageRef[0] is the average distance at each window for the references for querySeqs[0]
+               vector< vector<float> > averageQuery;  //averageQuery[0] is the average distance at each winow for the query for querySeqs[0]
                
                vector< vector<Sequence*> > findClosest(int, int, int); 
-               void removeSeqs(vector<Sequence*>);  //removes sequences from closest that are to different of too similar to eachother. 
+               void removeBadReferenceSeqs(vector<Sequence*>&, int);  //removes sequences from closest that are to different of too similar to eachother. 
+               void trimSequences();
+               vector<int> findWindows();
+               vector<float> getAverageRef(vector<Sequence*>);
+               vector<float> getAverageQuery (vector<Sequence*>, int);
+               
+               
+               int getDiff(string, string);  //return number of mismatched bases, a gap to base is not counted as a mismatch
                
                void createProcessesClosest();
                
index 8aaab4b12a4c5c261f8a7a4494ca7ef4f12ea923..75e81c076a9ce113ff26e1d62027b86b16571527 100644 (file)
@@ -41,13 +41,16 @@ void Chimera::createFilter(vector<Sequence*> seqs) {
                }
                
                //zero out spot where all sequences have blanks
+               int numColRemoved = 0;
                for(int i = 0;i < seqs[0]->getAligned().length(); i++){
-                       if(gaps[i] == seqs.size())      {       filterString[i] = '0';  }
+                       if(gaps[i] == seqs.size())      {       filterString[i] = '0';  numColRemoved++;  }
                        
-                       else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {      filterString[i] = '0';  }
+                       else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {      filterString[i] = '0';  numColRemoved++;  }
                        //cout << "a = " << a[i] <<  " t = " << t[i] <<  " g = " << g[i] <<  " c = " << c[i] << endl;
                }
 //cout << "filter = " << filterString << endl; 
+
+               mothurOut("Filter removed " + toString(numColRemoved) + " columns.");  mothurOutEndLine();
        }
        catch(exception& e) {
                errorOut(e, "Chimera", "createFilter");
index ba2ac1f54082a44cc188611994ff4f5c61b65ad3..a113ebe7e82e93f135541b0bfa0ffc7a3b1d33b1 100644 (file)
@@ -200,10 +200,14 @@ SharedOrderVector* SharedListVector::getSharedOrderVector(){
                                name = names.substr(0,names.find_first_of(','));
                                names = names.substr(names.find_first_of(',')+1, names.length());
                                groupName = groupmap->getGroup(name);
+                               
+                               if(groupName == "not found") {  mothurOut("Error: Sequence '" + name + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
+                               
                                order->push_back(i, binSize, groupName);  //i represents what bin you are in
                        }
                        //get last name
                        groupName = groupmap->getGroup(names);
+                       if(groupName == "not found") {  mothurOut("Error: Sequence '" + names + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
                        order->push_back(i, binSize, groupName);
                }
 
@@ -229,6 +233,7 @@ SharedRAbundVector SharedListVector::getSharedRAbundVector(string groupName) {
                                name = names.substr(0,names.find_first_of(','));
                                names = names.substr(names.find_first_of(',')+1, names.length());
                                group = groupmap->getGroup(name);
+                               if(group == "not found") {      mothurOut("Error: Sequence '" + name + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
                                if (group == groupName) { //this name is in the group you want the vector for.
                                        rav.set(i, rav.getAbundance(i) + 1, group);  //i represents what bin you are in
                                }
@@ -236,6 +241,7 @@ SharedRAbundVector SharedListVector::getSharedRAbundVector(string groupName) {
                        
                        //get last name
                        groupName = groupmap->getGroup(names);
+                       if(groupName == "not found") {  mothurOut("Error: Sequence '" + names + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
                        if (group == groupName) { //this name is in the group you want the vector for.
                                        rav.set(i, rav.getAbundance(i) + 1, group);  //i represents what bin you are in
                        }
@@ -281,11 +287,13 @@ vector<SharedRAbundVector*> SharedListVector::getSharedRAbundVector() {
                                name = names.substr(0,names.find_first_of(','));
                                names = names.substr(names.find_first_of(',')+1, names.length());
                                group = groupmap->getGroup(name);
+                               if(group == "not found") {      mothurOut("Error: Sequence '" + name + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
                                finder[group]->set(i, finder[group]->getAbundance(i) + 1, group);  //i represents what bin you are in
                        }
                        
                        //get last name
                        group = groupmap->getGroup(names);
+                       if(group == "not found") {      mothurOut("Error: Sequence '" + names + "' was not found in the group file, please correct."); mothurOutEndLine();  exit(1); }
                        finder[group]->set(i, finder[group]->getAbundance(i) + 1, group);  //i represents what bin you are in
                        
                }