]> git.donarmstrong.com Git - mothur.git/blobdiff - chimeraseqscommand.cpp
added list.seqs command
[mothur.git] / chimeraseqscommand.cpp
index 182c25e4b909ba2dd49fc1971beb7abee3b99188..f19fe11bcb907b86bc67429388fddfaa485601aa 100644 (file)
@@ -8,6 +8,7 @@
  */
 
 #include "chimeraseqscommand.h"
+#include "eachgapdist.h"
 
 //***************************************************************************************************************
 
@@ -20,7 +21,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                
                else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta", "filter", "correction", "processors", "method" };
+                       string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment" };
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -39,16 +40,24 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                        else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true;  }     
                        
                        string temp;
-                       temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "F"; }
+                       temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "T"; }
                        filter = isTrue(temp);
                        
                        temp = validParameter.validFile(parameters, "correction", false);               if (temp == "not found") { temp = "T"; }
                        correction = isTrue(temp);
                        
-                       temp = validParameter.validFile(parameters, "processors", true);                if (temp == "not found") { temp = "1"; }
+                       temp = validParameter.validFile(parameters, "processors", false);               if (temp == "not found") { temp = "1"; }
                        convert(temp, processors);
                        
+                       temp = validParameter.validFile(parameters, "window", false);                   if (temp == "not found") { temp = "0"; }
+                       convert(temp, window);
+                                       
+                       temp = validParameter.validFile(parameters, "increment", false);                        if (temp == "not found") { temp = "10"; }
+                       convert(temp, increment);
+                               
                        method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "bellerophon"; }
+                       
+                       if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine();  abort = true; }
 
                }
        }
@@ -77,6 +86,11 @@ void ChimeraSeqsCommand::help(){
                exit(1);
        }
 }
+//********************************************************************************************************************
+//sorts highest score to lowest
+inline bool comparePref(Preference left, Preference right){
+       return (left.score[0] > right.score[0]);        
+}
 
 //***************************************************************************************************************
 
@@ -89,9 +103,10 @@ int ChimeraSeqsCommand::execute(){
                
                if (abort == true) { return 0; }
                
+               
                //do soft filter
                if (filter)  {
-                       string optionString = "fasta=" + fastafile + ", soft=50.0, vertical=F";
+                       string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
                        filterSeqs = new FilterSeqsCommand(optionString);
                        filterSeqs->execute();
                        delete filterSeqs;
@@ -100,76 +115,169 @@ int ChimeraSeqsCommand::execute(){
                        fastafile = getRootName(fastafile) + "filter.fasta";
                }
                
+               distCalculator = new eachGapDist();
+               
                //read in sequences
                readSeqs();
                
-               //int numSeqs = seqs.size();
+               int numSeqs = seqs.size();
                
-               //find average midpoint of seqs
-               midpoint = findAverageMidPoint();
+               if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; }
                
-               //create 2 vectors of sequences, 1 for left side and one for right side
-               vector<Sequence> left;  vector<Sequence> right;
+               //set default window to 25% of sequence length
+               string seq0 = seqs[0].getAligned();
+               if (window == 0) { window = seq0.length() / 4;  }
+               else if (window > (seq0.length() / 2)) {  
+                       mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
+                       window = (seq0.length() / 2);
+               }
                
-               for (int i = 0; i < seqs.size(); i++) {
-                       //save left side
-                       string seqLeft = seqs[i].getAligned();
-                       seqLeft = seqLeft.substr(0, midpoint);
-                       Sequence tempLeft(seqs[i].getName(), seqLeft);
-                       left.push_back(tempLeft);
+               if (increment > (seqs[0].getAlignLength() - (2*window))) { 
+                       if (increment != 10) {
                        
-                       //save right side
-                       string seqRight = seqs[i].getAligned();
-                       seqRight = seqRight.substr(midpoint+1, (seqRight.length()-midpoint-1));
-                       Sequence tempRight(seqs[i].getName(), seqRight);
-                       right.push_back(tempRight);
+                               mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
+                               increment = 10;
+                               if (increment > (seqs[0].getAlignLength() - (2*window))) {  increment = 0;  }
+                               
+                       }else{ increment = 0; }
+               }
+cout << "increment = " << increment << endl;           
+               if (increment == 0) { iters = 1; }
+               else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
+               
+               //initialize pref
+               pref.resize(numSeqs);  
+               
+               for (int i = 0; i < numSeqs; i++ ) { 
+                       pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2);   pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
+                       pref[i].name = seqs[i].getName();
+                       pref[i].score[0] = 0.0;  pref[i].score[1] = 0.0; 
+                       pref[i].closestLeft[0] = 100000.0;  pref[i].closestLeft[1] = 100000.0;  
+                       pref[i].closestRight[0] = 100000.0;  pref[i].closestRight[1] = 100000.0;  
                }
+
+               int midpoint = window;
+               int count = 0;
+               while (count < iters) {
+                               
+                               //create 2 vectors of sequences, 1 for left side and one for right side
+                               vector<Sequence> left;  vector<Sequence> right;
+                               
+                               for (int i = 0; i < seqs.size(); i++) {
+//cout << "whole = " << seqs[i].getAligned() << endl;
+                                       //save left side
+                                       string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
+                                       Sequence tempLeft;
+                                       tempLeft.setName(seqs[i].getName());
+                                       tempLeft.setAligned(seqLeft);
+                                       left.push_back(tempLeft);
+//cout << "left = " << tempLeft.getAligned() << endl;                  
+                                       //save right side
+                                       string seqRight = seqs[i].getAligned().substr(midpoint, window);
+                                       Sequence tempRight;
+                                       tempRight.setName(seqs[i].getName());
+                                       tempRight.setAligned(seqRight);
+                                       right.push_back(tempRight);
+//cout << "right = " << seqRight << endl;      
+                               }
+                               
+                               //adjust midpoint by increment
+                               midpoint += increment;
+                               
+                               
+                               //this should be parallelized
+                               //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
+                               //create a matrix containing the distance from left to left and right to right
+                               //calculate distances
+                               SparseMatrix* SparseLeft = new SparseMatrix();
+                               SparseMatrix* SparseRight = new SparseMatrix();
+                               
+                               createSparseMatrix(0, left.size(), SparseLeft, left);
+                               createSparseMatrix(0, right.size(), SparseRight, right);
+                               
+                               vector<SeqMap> distMapRight;
+                               vector<SeqMap> distMapLeft;
+                               
+                               // Create a data structure to quickly access the distance information.
+                               // It consists of a vector of distance maps, where each map contains
+                               // all distances of a certain sequence. Vector and maps are accessed
+                               // via the index of a sequence in the distance matrix
+                               distMapRight = vector<SeqMap>(numSeqs); 
+                               distMapLeft = vector<SeqMap>(numSeqs); 
+                               //cout << "left" << endl << endl;
+                               for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
+                                       distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
+                                       //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
+                               }
+                               //cout << "right" << endl << endl;
+                               for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
+                                       distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
+                                       //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
+                               }
+                               
+                               delete SparseLeft;
+                               delete SparseRight;
                                
-               //this should be parallelized
-               //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
-               //create a matrix containing the distance from left to left and right to right
-               //calculate distances
-               SparseMatrix* SparseLeft = new SparseMatrix();
-               SparseMatrix* SparseRight = new SparseMatrix();
+                               
+                               //fill preference structure
+                               generatePreferences(distMapLeft, distMapRight, midpoint);
+                               
+                               count++;
+                               
+               }
                
-               createSparseMatrix(0, left.size(), SparseLeft, left);
-               createSparseMatrix(0, right.size(), SparseRight, right);
+               delete distCalculator;
                
+               //find average pref score across windows
+               //if (increment != 0) {
+                
+                       //for (int i = 0; i < pref.size(); i++) {
+                               //pref[i].score[0] = pref[i].score[0] / iters;
+                       //}
+               //}
                
-               //vector<SeqMap> distMapRight;
-               //vector<SeqMap> distMapLeft;
+               //sort Preferences highest to lowest
+               sort(pref.begin(), pref.end(), comparePref);
                
-               // Create a data structure to quickly access the distance information.
-               // It consists of a vector of distance maps, where each map contains
-               // all distances of a certain sequence. Vector and maps are accessed
-               // via the index of a sequence in the distance matrix
-               //distMapRight = vector<SeqMap>(globaldata->gListVector->size()); 
-               //distMapLeft = vector<SeqMap>(globaldata->gListVector->size()); 
-               for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
-                       //distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
-               }
-               for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
-                       //distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
-               }
-
+               string outputFileName = getRootName(fastafile) + "chimeras";
+               ofstream out;
+               openOutputFile(outputFileName, out);
                
-               //fill preference structure
-               //generatePreferences(distMapLeft, distMapRight);
+               int above1 = 0;
+               out << "Name\tScore\tLeft\tRight\t" << endl;
+               //output prefenence structure to .chimeras file
+               for (int i = 0; i < pref.size(); i++) {
+                       out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
+                       
+                       //calc # of seqs with preference above 1.0
+                       if (pref[i].score[0] > 1.0) { 
+                               above1++; 
+                               mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
+                               mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
+                       }
+                       
+                       
+               }
                
-                               
-               //output results to screen                                              
+               //output results to screen
                mothurOutEndLine();
-               mothurOut("\t\t"); mothurOutEndLine();
-               //mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); mothurOutEndLine();
-               //mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25])); mothurOutEndLine();
-               //mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25])); mothurOutEndLine();
-               //mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50])); mothurOutEndLine();
-               //mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75])); mothurOutEndLine();
-               //mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5])); mothurOutEndLine();
-               //mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100])); mothurOutEndLine();
-               //mothurOut("# of Seqs:\t" + toString(numSeqs)); mothurOutEndLine();
+               mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
+               int spot;
+               spot = pref.size()-1;
+               mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+               spot = pref.size() * 0.975;
+               mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+               spot = pref.size() * 0.75;
+               mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+               spot = pref.size() * 0.50;
+               mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
+               spot = pref.size() * 0.25;
+               mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+               spot = pref.size() * 0.025;
+               mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+               spot = 0;
+               mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
                
-               //outSummary.close();
                return 0;
        }
        catch(exception& e) {
@@ -188,6 +296,8 @@ void ChimeraSeqsCommand::readSeqs(){
                while(!inFASTA.eof()){
                        Sequence current(inFASTA);
                        
+                       if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
+                       
                        seqs.push_back(current);
                        
                        gobble(inFASTA);
@@ -201,53 +311,6 @@ void ChimeraSeqsCommand::readSeqs(){
        }
 }
 
-
-//***************************************************************************************************************
-int ChimeraSeqsCommand::findAverageMidPoint(){
-       try {
-               int totalMids = 0;
-               int averageMid = 0;
-               
-               //loop through the seqs and find midpoint
-               for (int i = 0; i < seqs.size(); i++) {
-                       
-                       //get unaligned sequence
-                       seqs[i].setUnaligned(seqs[i].getUnaligned());  //if you read an aligned file the unaligned is really aligned, so we need to make sure its unaligned
-                       
-                       string unaligned = seqs[i].getUnaligned();
-                       string aligned = seqs[i].getAligned();
-                       
-                       //find midpoint of this seq
-                       int count = 0;
-                       int thismid = 0;
-                       for (int j = 0; j < aligned.length(); j++) {
-                               
-                               thismid++;
-                               
-                               //if you are part of the unaligned sequence increment
-                               if (isalpha(aligned[j])) {  count++;  }
-                               
-                               //if you have reached the halfway point stop
-                               if (count >= (unaligned.length() / 2)) { break; }
-                       }
-                       
-                       //add this mid to total
-                       totalMids += thismid;
-               
-               }
-               
-               averageMid = (totalMids / seqs.size());
-               
-               return averageMid; 
-       
-       
-       }
-       catch(exception& e) {
-               errorOut(e, "ChimeraSeqsCommand", "findAverageMidPoint");
-               exit(1);
-       }
-}
-
 /***************************************************************************************************************/
 int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
        try {
@@ -256,9 +319,9 @@ int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatri
                        
                        for(int j=0;j<i;j++){
                        
-                               //distCalculator->calcDist(s.get(i), s.get(j));
+                               distCalculator->calcDist(s[i], s[j]);
                                float dist = distCalculator->getDist();
-                               
+                       
                                PCell temp(i, j, dist);
                                sparse->addCell(temp);
                                
@@ -273,23 +336,123 @@ int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatri
                exit(1);
        }
 }
-/***************************************************************************************************************
-void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right){
+/***************************************************************************************************************/
+void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
        try {
+               
+               float dme = 0.0;
+               SeqMap::iterator itR;
+               SeqMap::iterator itL;
+               
+               //initialize pref[i]
+               for (int i = 0; i < pref.size(); i++) {
+                       pref[i].score[1] = 0.0;
+                       pref[i].closestLeft[1] = 100000.0; 
+                       pref[i].closestRight[1] = 100000.0; 
+                       pref[i].leftParent[1] = "";
+                       pref[i].rightParent[1] = "";
+               }
 
+//cout << "in generate left.size() = " << left.size() << endl;         
                for (int i = 0; i < left.size(); i++) {
                        
-                       int iscore = 0;
-                       float closestLeft = 100000.0;
-                       float closestRight = 100000.0;
+                       SeqMap currentLeft = left[i];    //example i = 3;   currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
+                                                                                               //                                                                              1 to the distance of sequence 3 to sequence 1,
+                                                                                               //                                                                              2 to the distance of sequence 3 to sequence 2.
+                       SeqMap currentRight = right[i];         // same as left but with distances on the right side.
                        
-                       for (int j = 0; j < left.size(); j++) {
+                       for (int j = 0; j < i; j++) {
                                
-                               //iscore += abs(left
-                       
+                               itL = currentLeft.find(j);
+                               itR = currentRight.find(j);
+cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
+cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
+                               
+                               //if you can find this entry update the preferences
+                               if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
+                               
+                                       if (!correction) {
+                                               pref[i].score[1] += abs((itL->second - itR->second));
+                                               pref[j].score[1] += abs((itL->second - itR->second));
+cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
+cout << "abs = " << abs((itL->second - itR->second)) << endl;
+cout << i << " score = " << pref[i].score[1] << endl;
+cout << j << " score = " << pref[j].score[1] << endl;
+                                       }else {
+                                               pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
+                                               pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
+cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
+cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
+cout << i << " score = " << pref[i].score[1] << endl;
+cout << j << " score = " << pref[j].score[1] << endl;
+                                       }
+cout << "pref[" << i << "].closestLeft[1] = "  <<      pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;                        
+                                       //are you the closest left sequence
+                                       if (itL->second < pref[i].closestLeft[1]) {  
+
+                                               pref[i].closestLeft[1] = itL->second;
+                                               pref[i].leftParent[1] = seqs[j].getName();
+cout << "updating closest left to " << pref[i].leftParent[1] << endl;
+                                       }
+cout << "pref[" << j << "].closestLeft[1] = "  <<      pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;        
+                                       if (itL->second < pref[j].closestLeft[1]) { 
+                                               pref[j].closestLeft[1] = itL->second;
+                                               pref[j].leftParent[1] = seqs[i].getName();
+cout << "updating closest left to " << pref[j].leftParent[1] << endl;
+                                       }
+                                       
+                                       //are you the closest right sequence
+                                       if (itR->second < pref[i].closestRight[1]) {   
+                                               pref[i].closestRight[1] = itR->second;
+                                               pref[i].rightParent[1] = seqs[j].getName();
+                                       }
+                                       if (itR->second < pref[j].closestRight[1]) {   
+                                               pref[j].closestRight[1] = itR->second;
+                                               pref[j].rightParent[1] = seqs[i].getName();
+                                       }
+                                       
+                               }
                        }
                
                }
+               
+               
+                 
+               //calculate the dme
+               int count0 = 0;
+               for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) { count0++; }  }
+               
+               float expectedPercent = 1 / (float) (pref.size() - count0);
+cout << endl << "dme = " << dme << endl;
+               //recalculate prefernences based on dme
+               for (int i = 0; i < pref.size(); i++) {
+cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;  
+                       // gives the actual percentage of the dme this seq adds
+                       pref[i].score[1] = pref[i].score[1] / dme;
+                       
+                       //how much higher or lower is this than expected
+                       pref[i].score[1] = pref[i].score[1] / expectedPercent;
+                       
+                       //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
+cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;                                    
+               }
+               
+               //is this score bigger then the last score
+               for (int i = 0; i < pref.size(); i++) {  
+                       
+                       //update biggest score
+                       if (pref[i].score[1] > pref[i].score[0]) {
+                               pref[i].score[0] = pref[i].score[1];
+                               pref[i].leftParent[0] = pref[i].leftParent[1];
+                               pref[i].rightParent[0] = pref[i].rightParent[1];
+                               pref[i].closestLeft[0] = pref[i].closestLeft[1];
+                               pref[i].closestRight[0] = pref[i].closestRight[1];
+                               pref[i].midpoint = mid;
+                       }
+                       
+                       //total of preference scores across windows
+                       //pref[i].score[0] += pref[i].score[1];
+               }
 
        }
        catch(exception& e) {