]> git.donarmstrong.com Git - mothur.git/blobdiff - chimeraseqscommand.cpp
chimera.seqs pintail is working.
[mothur.git] / chimeraseqscommand.cpp
index f19fe11bcb907b86bc67429388fddfaa485601aa..899d6ef33a5ec05ffe12ea24891c9680b33d0733 100644 (file)
@@ -8,7 +8,8 @@
  */
 
 #include "chimeraseqscommand.h"
-#include "eachgapdist.h"
+#include "bellerophon.h"
+#include "pintail.h"
 
 //***************************************************************************************************************
 
@@ -21,7 +22,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                
                else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment" };
+                       string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask" };
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -39,8 +40,32 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                        if (fastafile == "not open") { abort = true; }
                        else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true;  }     
                        
+                       templatefile = validParameter.validFile(parameters, "template", true);
+                       if (templatefile == "not open") { abort = true; }
+                       else if (templatefile == "not found") { templatefile = "";  }   
+                       
+                       consfile = validParameter.validFile(parameters, "conservation", true);
+                       if (consfile == "not open") { abort = true; }
+                       else if (consfile == "not found") { consfile = "";  }   
+                       
+                       quanfile = validParameter.validFile(parameters, "quantile", true);
+                       if (quanfile == "not open") { abort = true; }
+                       else if (quanfile == "not found") { quanfile = "";  }
+                               
+                       maskfile = validParameter.validFile(parameters, "mask", false);
+                       if (maskfile == "not found") { maskfile = "";  }        
+                       else if (maskfile != "default")  { 
+                               ifstream in;
+                               int     ableToOpen = openInputFile(maskfile, in);
+                               if (ableToOpen == 1) { abort = true; }
+                               in.close();
+                       }
+               
+
+                       
+
                        string temp;
-                       temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "T"; }
+                       temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "F"; }
                        filter = isTrue(temp);
                        
                        temp = validParameter.validFile(parameters, "correction", false);               if (temp == "not found") { temp = "T"; }
@@ -52,12 +77,13 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                        temp = validParameter.validFile(parameters, "window", false);                   if (temp == "not found") { temp = "0"; }
                        convert(temp, window);
                                        
-                       temp = validParameter.validFile(parameters, "increment", false);                        if (temp == "not found") { temp = "10"; }
+                       temp = validParameter.validFile(parameters, "increment", false);                        if (temp == "not found") { temp = "25"; }
                        convert(temp, increment);
                                
-                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "bellerophon"; }
+                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "pintail"; }
+                       
+                       if ((method == "pintail") && (templatefile == "")) { mothurOut("You must provide a template file with the pintail method."); mothurOutEndLine(); abort = true;  }
                        
-                       if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine();  abort = true; }
 
                }
        }
@@ -70,12 +96,20 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
 
 void ChimeraSeqsCommand::help(){
        try {
+               mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED.\n");
                mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
-               mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method.  fasta is required.\n");
-               mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter.  The default is false. \n");
-               mothurOut("The correction parameter allows you to .....  The default is true. \n");
+               mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation and quantile.\n");
+               mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter this only applies when the method is bellerphon.  The default is false. \n");
+               mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.   The default is true. This only applies when the method is bellerphon.\n");
                mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
-               mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is bellerophon. \n");
+               mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is pintail. Options include..... \n");
+               mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the pintail and mallard method.  The default is no mask.  If you enter mask=default, then the mask is 236627 EU009184.1 Shigella dysenteriae str. FBD013. \n");
+               mothurOut("The window parameter allows you to specify the window size for searching for chimeras.  The default is 300 is method is pintail unless the sequence length is less than 300, and 1/4 sequence length for bellerphon.\n");
+               mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences.  The default is 25.\n");
+               mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences for use by the pintail algorythm. It is a required parameter if using pintail.\n");
+               mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment for use by the pintail algorythm. It is not required, but will speed up the pintail method.\n");
+               mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences for use by the pintail algorythm. It is not required, but will speed up the pintail method.\n");
+               mothurOut("If you have run chimera.seqs using pintail a .quan and .freq file will be created if you have not provided them for use in future command executions.");
                mothurOut("The chimera.seqs command should be in the following format: \n");
                mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
                mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
@@ -86,11 +120,6 @@ void ChimeraSeqsCommand::help(){
                exit(1);
        }
 }
-//********************************************************************************************************************
-//sorts highest score to lowest
-inline bool comparePref(Preference left, Preference right){
-       return (left.score[0] > right.score[0]);        
-}
 
 //***************************************************************************************************************
 
@@ -103,364 +132,49 @@ int ChimeraSeqsCommand::execute(){
                
                if (abort == true) { return 0; }
                
-               
-               //do soft filter
-               if (filter)  {
-                       string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
-                       filterSeqs = new FilterSeqsCommand(optionString);
-                       filterSeqs->execute();
-                       delete filterSeqs;
+               if (method == "bellerophon")    {               chimera = new Bellerophon(fastafile);                   }
+               else if (method == "pintail")   {               chimera = new Pintail(fastafile, templatefile); 
+                       //saves time to avoid generating it
+                       if (consfile != "")                     {               chimera->setCons(consfile);                                             }
+                       else                                            {               chimera->setCons("");                                                   }
                        
-                       //reset fastafile to filtered file
-                       fastafile = getRootName(fastafile) + "filter.fasta";
-               }
-               
-               distCalculator = new eachGapDist();
-               
-               //read in sequences
-               readSeqs();
-               
-               int numSeqs = seqs.size();
-               
-               if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; }
-               
-               //set default window to 25% of sequence length
-               string seq0 = seqs[0].getAligned();
-               if (window == 0) { window = seq0.length() / 4;  }
-               else if (window > (seq0.length() / 2)) {  
-                       mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
-                       window = (seq0.length() / 2);
-               }
-               
-               if (increment > (seqs[0].getAlignLength() - (2*window))) { 
-                       if (increment != 10) {
+                       //saves time to avoid generating it
+                       if (quanfile != "")                     {               chimera->setQuantiles(quanfile);                                }
+                       else                                            {               chimera->setQuantiles("");                                              }
                        
-                               mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
-                               increment = 10;
-                               if (increment > (seqs[0].getAlignLength() - (2*window))) {  increment = 0;  }
-                               
-                       }else{ increment = 0; }
-               }
-cout << "increment = " << increment << endl;           
-               if (increment == 0) { iters = 1; }
-               else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
+                       if (maskfile == "default") { mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); mothurOutEndLine();  }
+                       chimera->setMask(maskfile);
+                                               
+               }else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0;         }
                
-               //initialize pref
-               pref.resize(numSeqs);  
-               
-               for (int i = 0; i < numSeqs; i++ ) { 
-                       pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2);   pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
-                       pref[i].name = seqs[i].getName();
-                       pref[i].score[0] = 0.0;  pref[i].score[1] = 0.0; 
-                       pref[i].closestLeft[0] = 100000.0;  pref[i].closestLeft[1] = 100000.0;  
-                       pref[i].closestRight[0] = 100000.0;  pref[i].closestRight[1] = 100000.0;  
-               }
-
-               int midpoint = window;
-               int count = 0;
-               while (count < iters) {
-                               
-                               //create 2 vectors of sequences, 1 for left side and one for right side
-                               vector<Sequence> left;  vector<Sequence> right;
-                               
-                               for (int i = 0; i < seqs.size(); i++) {
-//cout << "whole = " << seqs[i].getAligned() << endl;
-                                       //save left side
-                                       string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
-                                       Sequence tempLeft;
-                                       tempLeft.setName(seqs[i].getName());
-                                       tempLeft.setAligned(seqLeft);
-                                       left.push_back(tempLeft);
-//cout << "left = " << tempLeft.getAligned() << endl;                  
-                                       //save right side
-                                       string seqRight = seqs[i].getAligned().substr(midpoint, window);
-                                       Sequence tempRight;
-                                       tempRight.setName(seqs[i].getName());
-                                       tempRight.setAligned(seqRight);
-                                       right.push_back(tempRight);
-//cout << "right = " << seqRight << endl;      
-                               }
-                               
-                               //adjust midpoint by increment
-                               midpoint += increment;
-                               
-                               
-                               //this should be parallelized
-                               //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
-                               //create a matrix containing the distance from left to left and right to right
-                               //calculate distances
-                               SparseMatrix* SparseLeft = new SparseMatrix();
-                               SparseMatrix* SparseRight = new SparseMatrix();
-                               
-                               createSparseMatrix(0, left.size(), SparseLeft, left);
-                               createSparseMatrix(0, right.size(), SparseRight, right);
+               //set user options
+               chimera->setFilter(filter);
+               chimera->setCorrection(correction);
+               chimera->setProcessors(processors);
+               chimera->setWindow(window);
+               chimera->setIncrement(increment);
                                
-                               vector<SeqMap> distMapRight;
-                               vector<SeqMap> distMapLeft;
-                               
-                               // Create a data structure to quickly access the distance information.
-                               // It consists of a vector of distance maps, where each map contains
-                               // all distances of a certain sequence. Vector and maps are accessed
-                               // via the index of a sequence in the distance matrix
-                               distMapRight = vector<SeqMap>(numSeqs); 
-                               distMapLeft = vector<SeqMap>(numSeqs); 
-                               //cout << "left" << endl << endl;
-                               for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
-                                       distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
-                                       //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
-                               }
-                               //cout << "right" << endl << endl;
-                               for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
-                                       distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
-                                       //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
-                               }
-                               
-                               delete SparseLeft;
-                               delete SparseRight;
-                               
-                               
-                               //fill preference structure
-                               generatePreferences(distMapLeft, distMapRight, midpoint);
-                               
-                               count++;
-                               
-               }
+               //find chimeras
+               chimera->getChimeras();
                
-               delete distCalculator;
-               
-               //find average pref score across windows
-               //if (increment != 0) {
-                
-                       //for (int i = 0; i < pref.size(); i++) {
-                               //pref[i].score[0] = pref[i].score[0] / iters;
-                       //}
-               //}
-               
-               //sort Preferences highest to lowest
-               sort(pref.begin(), pref.end(), comparePref);
-               
-               string outputFileName = getRootName(fastafile) + "chimeras";
+               string outputFileName = getRootName(fastafile) + method + maskfile + ".chimeras";
                ofstream out;
                openOutputFile(outputFileName, out);
                
-               int above1 = 0;
-               out << "Name\tScore\tLeft\tRight\t" << endl;
-               //output prefenence structure to .chimeras file
-               for (int i = 0; i < pref.size(); i++) {
-                       out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
-                       
-                       //calc # of seqs with preference above 1.0
-                       if (pref[i].score[0] > 1.0) { 
-                               above1++; 
-                               mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
-                               mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
-                       }
-                       
-                       
-               }
-               
-               //output results to screen
-               mothurOutEndLine();
-               mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
-               int spot;
-               spot = pref.size()-1;
-               mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
-               spot = pref.size() * 0.975;
-               mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
-               spot = pref.size() * 0.75;
-               mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
-               spot = pref.size() * 0.50;
-               mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
-               spot = pref.size() * 0.25;
-               mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
-               spot = pref.size() * 0.025;
-               mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
-               spot = 0;
-               mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
-               
-               return 0;
-       }
-       catch(exception& e) {
-               errorOut(e, "ChimeraSeqsCommand", "execute");
-               exit(1);
-       }
-}
-
-//***************************************************************************************************************
-void ChimeraSeqsCommand::readSeqs(){
-       try {
-               ifstream inFASTA;
-               openInputFile(fastafile, inFASTA);
-               
-               //read in seqs and store in vector
-               while(!inFASTA.eof()){
-                       Sequence current(inFASTA);
-                       
-                       if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
-                       
-                       seqs.push_back(current);
-                       
-                       gobble(inFASTA);
-               }
-               inFASTA.close();
-
-       }
-       catch(exception& e) {
-               errorOut(e, "ChimeraSeqsCommand", "readSeqs");
-               exit(1);
-       }
-}
-
-/***************************************************************************************************************/
-int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
-       try {
-
-               for(int i=startSeq; i<endSeq; i++){
-                       
-                       for(int j=0;j<i;j++){
-                       
-                               distCalculator->calcDist(s[i], s[j]);
-                               float dist = distCalculator->getDist();
-                       
-                               PCell temp(i, j, dist);
-                               sparse->addCell(temp);
-                               
-                       }
-               }
-                       
-       
-               return 1;
-       }
-       catch(exception& e) {
-               errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix");
-               exit(1);
-       }
-}
-/***************************************************************************************************************/
-void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
-       try {
-               
-               float dme = 0.0;
-               SeqMap::iterator itR;
-               SeqMap::iterator itL;
-               
-               //initialize pref[i]
-               for (int i = 0; i < pref.size(); i++) {
-                       pref[i].score[1] = 0.0;
-                       pref[i].closestLeft[1] = 100000.0; 
-                       pref[i].closestRight[1] = 100000.0; 
-                       pref[i].leftParent[1] = "";
-                       pref[i].rightParent[1] = "";
-               }
-
-//cout << "in generate left.size() = " << left.size() << endl;         
-               for (int i = 0; i < left.size(); i++) {
-                       
-                       SeqMap currentLeft = left[i];    //example i = 3;   currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
-                                                                                               //                                                                              1 to the distance of sequence 3 to sequence 1,
-                                                                                               //                                                                              2 to the distance of sequence 3 to sequence 2.
-                       SeqMap currentRight = right[i];         // same as left but with distances on the right side.
-                       
-                       for (int j = 0; j < i; j++) {
-                               
-                               itL = currentLeft.find(j);
-                               itR = currentRight.find(j);
-cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
-cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
-                               
-                               //if you can find this entry update the preferences
-                               if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
-                               
-                                       if (!correction) {
-                                               pref[i].score[1] += abs((itL->second - itR->second));
-                                               pref[j].score[1] += abs((itL->second - itR->second));
-cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
-cout << "abs = " << abs((itL->second - itR->second)) << endl;
-cout << i << " score = " << pref[i].score[1] << endl;
-cout << j << " score = " << pref[j].score[1] << endl;
-                                       }else {
-                                               pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
-                                               pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
-cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
-cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
-cout << i << " score = " << pref[i].score[1] << endl;
-cout << j << " score = " << pref[j].score[1] << endl;
-                                       }
-cout << "pref[" << i << "].closestLeft[1] = "  <<      pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;                        
-                                       //are you the closest left sequence
-                                       if (itL->second < pref[i].closestLeft[1]) {  
-
-                                               pref[i].closestLeft[1] = itL->second;
-                                               pref[i].leftParent[1] = seqs[j].getName();
-cout << "updating closest left to " << pref[i].leftParent[1] << endl;
-                                       }
-cout << "pref[" << j << "].closestLeft[1] = "  <<      pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;        
-                                       if (itL->second < pref[j].closestLeft[1]) { 
-                                               pref[j].closestLeft[1] = itL->second;
-                                               pref[j].leftParent[1] = seqs[i].getName();
-cout << "updating closest left to " << pref[j].leftParent[1] << endl;
-                                       }
-                                       
-                                       //are you the closest right sequence
-                                       if (itR->second < pref[i].closestRight[1]) {   
-                                               pref[i].closestRight[1] = itR->second;
-                                               pref[i].rightParent[1] = seqs[j].getName();
-                                       }
-                                       if (itR->second < pref[j].closestRight[1]) {   
-                                               pref[j].closestRight[1] = itR->second;
-                                               pref[j].rightParent[1] = seqs[i].getName();
-                                       }
-                                       
-                               }
-                       }
+               //print results
+               chimera->print(out);
                
-               }
+               out.close();
                
+               delete chimera;
                
-                 
-               //calculate the dme
-               int count0 = 0;
-               for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) { count0++; }  }
-               
-               float expectedPercent = 1 / (float) (pref.size() - count0);
-cout << endl << "dme = " << dme << endl;
-               //recalculate prefernences based on dme
-               for (int i = 0; i < pref.size(); i++) {
-cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;  
-                       // gives the actual percentage of the dme this seq adds
-                       pref[i].score[1] = pref[i].score[1] / dme;
-                       
-                       //how much higher or lower is this than expected
-                       pref[i].score[1] = pref[i].score[1] / expectedPercent;
-                       
-                       //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
-cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;                                    
-               }
+               return 0;
                
-               //is this score bigger then the last score
-               for (int i = 0; i < pref.size(); i++) {  
-                       
-                       //update biggest score
-                       if (pref[i].score[1] > pref[i].score[0]) {
-                               pref[i].score[0] = pref[i].score[1];
-                               pref[i].leftParent[0] = pref[i].leftParent[1];
-                               pref[i].rightParent[0] = pref[i].rightParent[1];
-                               pref[i].closestLeft[0] = pref[i].closestLeft[1];
-                               pref[i].closestRight[0] = pref[i].closestRight[1];
-                               pref[i].midpoint = mid;
-                       }
-                       
-                       //total of preference scores across windows
-                       //pref[i].score[0] += pref[i].score[1];
-               }
-
        }
        catch(exception& e) {
-               errorOut(e, "ChimeraSeqsCommand", "generatePreferences");
+               errorOut(e, "ChimeraSeqsCommand", "execute");
                exit(1);
        }
 }
 /**************************************************************************************************/
 
-/**************************************************************************************************/
-