]> git.donarmstrong.com Git - mothur.git/blobdiff - pintail.cpp
modified mpi code to save ram by writing out every 10 seqs.
[mothur.git] / pintail.cpp
index 74e1ce47a80e60ec4c27e685f53e6dbfedfb6da9..6e9e95c9f7eaf20f91ba312b93a52be67c82b828 100644 (file)
@@ -32,14 +32,15 @@ Pintail::~Pintail() {
                delete decalc; 
        }
        catch(exception& e) {
-               errorOut(e, "Pintail", "~Pintail");
+               m->errorOut(e, "Pintail", "~Pintail");
                exit(1);
        }
 }
 //***************************************************************************************************************
-void Pintail::doPrep() {
+int Pintail::doPrep() {
        try {
-       
+               
+               mergedFilterString = "";
                windowSizesTemplate.resize(templateSeqs.size(), window);
                quantiles.resize(100);  //one for every percent mismatch
                quantilesMembers.resize(100);  //one for every percent mismatch
@@ -65,58 +66,117 @@ void Pintail::doPrep() {
                #endif
 
                
-               mothurOut("Getting conservation... "); cout.flush();
+               m->mothurOut("Getting conservation... "); cout.flush();
                if (consfile == "") { 
-                       mothurOut("Calculating probability of conservation for your template sequences.  This can take a while...  I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command.  Providing the .freq file will improve speed.    "); cout.flush();
+                       m->mothurOut("Calculating probability of conservation for your template sequences.  This can take a while...  I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command.  Providing the .freq file will improve speed.    "); cout.flush();
                        probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName)); 
-                       mothurOut("Done."); mothurOutEndLine();
-               }else                           {   probabilityProfile = readFreq();    mothurOut("Done.");               }
-               mothurOutEndLine();
+                       if (m->control_pressed) {  return 0;  }
+                       m->mothurOut("Done."); m->mothurOutEndLine();
+               }else                           {   probabilityProfile = readFreq();    m->mothurOut("Done.");            }
+               m->mothurOutEndLine();
+               
+               //make P into Q
+               for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
+               
+               bool reRead = false;
+               //create filter if needed for later
+               if (filter) {
+                                               
+                       //read in all query seqs
+                       ifstream in; 
+                       openInputFile(fastafile, in);
+                       
+                       vector<Sequence*> tempQuerySeqs;
+                       while(!in.eof()){
+                               if (m->control_pressed) {  
+                                       for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
+                                       return 0; 
+                               }
+                               
+                               Sequence* s = new Sequence(in);
+                               gobble(in);
+                               
+                               if (s->getName() != "") { tempQuerySeqs.push_back(s); }
+                       }
+                       in.close();
+                       
+                       vector<Sequence*> temp;
+                       //merge query seqs and template seqs
+                       temp = templateSeqs;
+                       for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
        
+                       if (seqMask != "") {
+                           reRead = true;
+                               //mask templates
+                               for (int i = 0; i < temp.size(); i++) {
+                                       if (m->control_pressed) {  
+                                               for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
+                                               return 0; 
+                                       }
+                                       decalc->runMask(temp[i]);
+                               }
+                       }
+
+                       mergedFilterString = createFilter(temp, 0.5);
+                       
+                       if (m->control_pressed) {  
+                               for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
+                               return 0; 
+                       }
+                       
+                       //reread template seqs
+                       for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
+               }
+               
+               
                //quantiles are used to determine whether the de values found indicate a chimera
                //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
                //combination of sequences in the template
-               if (quanfile != "") {  quantiles = readQuantiles();  }
-               else {
-                       
-                       //if user has not provided the quantiles and wants the mask we need to mask, but then delete and reread or we will mess up the best match process in getChimeras
-                       if (seqMask != "") {
+               if (quanfile != "") {  
+                       quantiles = readQuantiles(); 
+               }else {
+                       if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
+                               reRead = true;
                                //mask templates
                                for (int i = 0; i < templateSeqs.size(); i++) {
+                                       if (m->control_pressed) {  return 0;  }
                                        decalc->runMask(templateSeqs[i]);
                                }
                        }
                        
-                       if (filter) {
-                               createFilter(templateSeqs);
-                               for (int i = 0; i < templateSeqs.size(); i++) {  runFilter(templateSeqs[i]);  }
+                       if (filter) { 
+                               reRead = true;
+                               for (int i = 0; i < templateSeqs.size(); i++) {
+                                       if (m->control_pressed) {  return 0;  }
+                                       runFilter(templateSeqs[i]);
+                               }
                        }
-
                        
-                       mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
+                       m->mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
                        if (processors == 1) { 
                                quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
                        }else {         createProcessesQuan();          }
                
+                       if (m->control_pressed) {  return 0;  }
                        
                        ofstream out4, out5;
                        string noOutliers, outliers;
                        
                        if ((!filter) && (seqMask == "")) {
                                noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
-                       }else if ((filter) && (seqMask == "")) { 
-                               noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.quan";
                        }else if ((!filter) && (seqMask != "")) { 
                                noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
                        }else if ((filter) && (seqMask != "")) { 
-                               noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.masked.quan";
+                               noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan";
+                       }else if ((filter) && (seqMask == "")) { 
+                               noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan";
                        }
 
-                                               
                        decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
                        
-                       openOutputFile(noOutliers, out5);
+                       if (m->control_pressed) {  return 0;  }
                        
+                       openOutputFile(noOutliers, out5);                       
                        //adjust quantiles
                        for (int i = 0; i < quantilesMembers.size(); i++) {
                                vector<float> temp;
@@ -154,27 +214,29 @@ void Pintail::doPrep() {
                                
                        }
 
-                       mothurOut("Done."); mothurOutEndLine();
-                       
-                       //if mask reread template
-                       if ((seqMask != "") || (filter)) {
-                               for (int i = 0; i < templateSeqs.size(); i++)           {  delete templateSeqs[i];              }
-                               templateSeqs = readSeqs(templateFileName);
-                       }
-                       
+                       m->mothurOut("Done."); m->mothurOutEndLine();
                }
                
+               if (reRead) {
+                       for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i];  }
+                       templateSeqs.clear();
+                       templateSeqs = readSeqs(templateFileName);
+               }
+
+               
                //free memory
                for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i];  }
                
+               return 0;
+               
        }
        catch(exception& e) {
-               errorOut(e, "Pintail", "doPrep");
+               m->errorOut(e, "Pintail", "doPrep");
                exit(1);
        }
 }
 //***************************************************************************************************************
-void Pintail::print(ostream& out) {
+int Pintail::print(ostream& out, ostream& outAcc) {
        try {
                int index = ceil(deviation);
                
@@ -191,7 +253,8 @@ void Pintail::print(ostream& out) {
                
                out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
                if (chimera == "Yes") {
-                       mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine();
+                       m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine();
+                       outAcc << querySeq->getName() << endl;
                }
                out << "Observed\t";
                
@@ -203,10 +266,11 @@ void Pintail::print(ostream& out) {
                for (int m = 0; m < expectedDistance.size(); m++) {  out << expectedDistance[m] << '\t';  }
                out << endl;
                
+               return 0;
                
        }
        catch(exception& e) {
-               errorOut(e, "Pintail", "print");
+               m->errorOut(e, "Pintail", "print");
                exit(1);
        }
 }
@@ -217,24 +281,24 @@ int Pintail::getChimeras(Sequence* query) {
                querySeq = query;
                trimmed.clear();
                windowSizes = window;
-                               
-               //find pairs
-               bestfit = findPairs(query);
                                                        
-               //make P into Q
-               for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
-       
-               //mask sequences if the user wants to 
+               //find pairs has to be done before a mask
+               bestfit = findPairs(query);
+               
+               if (m->control_pressed) {  return 0; } 
+               
+               //if they mask  
                if (seqMask != "") {
                        decalc->runMask(query);
                        decalc->runMask(bestfit);
                }
-               
-               if (filter) {
+
+               if (filter) { //must be done after a mask
                        runFilter(query);
                        runFilter(bestfit);
                }
                
+                               
                //trim seq
                decalc->trimSeqs(query, bestfit, trimmed);  
                
@@ -244,8 +308,12 @@ int Pintail::getChimeras(Sequence* query) {
 
                //find observed distance
                obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
+               
+               if (m->control_pressed) {  return 0; } 
                                
                Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
+               
+               if (m->control_pressed) {  return 0; } 
 
                //find alpha                    
                seqCoef = decalc->getCoef(obsDistance, Qav);
@@ -253,9 +321,13 @@ int Pintail::getChimeras(Sequence* query) {
                //calculating expected distance
                expectedDistance = decalc->calcExpected(Qav, seqCoef);
                
+               if (m->control_pressed) {  return 0; } 
+               
                //finding de
                DE = decalc->calcDE(obsDistance, expectedDistance);
                
+               if (m->control_pressed) {  return 0; } 
+               
                //find distance between query and closest match
                it = trimmed.begin();
                deviation = decalc->calcDist(query, bestfit, it->first, it->second); 
@@ -265,7 +337,7 @@ int Pintail::getChimeras(Sequence* query) {
                return 0;
        }
        catch(exception& e) {
-               errorOut(e, "Pintail", "getChimeras");
+               m->errorOut(e, "Pintail", "getChimeras");
                exit(1);
        }
 }
@@ -307,7 +379,7 @@ vector<float> Pintail::readFreq() {
                
        }
        catch(exception& e) {
-               errorOut(e, "Pintail", "readFreq");
+               m->errorOut(e, "Pintail", "readFreq");
                exit(1);
        }
 }
@@ -319,14 +391,12 @@ Sequence* Pintail::findPairs(Sequence* q) {
                
                Sequence* seqsMatches;  
                
-               vector<Sequence*> copy = decalc->findClosest(q, templateSeqs, 1);
-               seqsMatches = copy[0];
-               
+               seqsMatches = decalc->findClosest(q, templateSeqs);
                return seqsMatches;
        
        }
        catch(exception& e) {
-               errorOut(e, "Pintail", "findPairs");
+               m->errorOut(e, "Pintail", "findPairs");
                exit(1);
        }
 }
@@ -366,7 +436,7 @@ void Pintail::createProcessesQuan() {
                                out.close();
                                
                                exit(0);
-                       }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+                       }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); }
                }
                
                //force parent to wait until all the processes are done
@@ -421,7 +491,7 @@ void Pintail::createProcessesQuan() {
 #endif         
        }
        catch(exception& e) {
-               errorOut(e, "Pintail", "createProcessesQuan");
+               m->errorOut(e, "Pintail", "createProcessesQuan");
                exit(1);
        }
 }