]> git.donarmstrong.com Git - mothur.git/blob - chimeraseqscommand.cpp
added list.seqs command
[mothur.git] / chimeraseqscommand.cpp
1 /*
2  *  chimeraseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 6/29/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "chimeraseqscommand.h"
11 #include "eachgapdist.h"
12
13 //***************************************************************************************************************
14
15 ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
16         try {
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         //valid paramters for this command
24                         string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment" };
25                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26                         
27                         OptionParser parser(option);
28                         map<string,string> parameters = parser.getParameters();
29                         
30                         ValidParameters validParameter;
31                         
32                         //check to make sure all parameters are valid for command
33                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
34                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
35                         }
36                         
37                         //check for required parameters
38                         fastafile = validParameter.validFile(parameters, "fasta", true);
39                         if (fastafile == "not open") { abort = true; }
40                         else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true;  }     
41                         
42                         string temp;
43                         temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "T"; }
44                         filter = isTrue(temp);
45                         
46                         temp = validParameter.validFile(parameters, "correction", false);               if (temp == "not found") { temp = "T"; }
47                         correction = isTrue(temp);
48                         
49                         temp = validParameter.validFile(parameters, "processors", false);               if (temp == "not found") { temp = "1"; }
50                         convert(temp, processors);
51                         
52                         temp = validParameter.validFile(parameters, "window", false);                   if (temp == "not found") { temp = "0"; }
53                         convert(temp, window);
54                                         
55                         temp = validParameter.validFile(parameters, "increment", false);                        if (temp == "not found") { temp = "10"; }
56                         convert(temp, increment);
57                                 
58                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "bellerophon"; }
59                         
60                         if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine();  abort = true; }
61
62                 }
63         }
64         catch(exception& e) {
65                 errorOut(e, "ChimeraSeqsCommand", "ChimeraSeqsCommand");
66                 exit(1);
67         }
68 }
69 //**********************************************************************************************************************
70
71 void ChimeraSeqsCommand::help(){
72         try {
73                 mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
74                 mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method.  fasta is required.\n");
75                 mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter.  The default is false. \n");
76                 mothurOut("The correction parameter allows you to .....  The default is true. \n");
77                 mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
78                 mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is bellerophon. \n");
79                 mothurOut("The chimera.seqs command should be in the following format: \n");
80                 mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
81                 mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
82                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");        
83         }
84         catch(exception& e) {
85                 errorOut(e, "ChimeraSeqsCommand", "help");
86                 exit(1);
87         }
88 }
89 //********************************************************************************************************************
90 //sorts highest score to lowest
91 inline bool comparePref(Preference left, Preference right){
92         return (left.score[0] > right.score[0]);        
93 }
94
95 //***************************************************************************************************************
96
97 ChimeraSeqsCommand::~ChimeraSeqsCommand(){      /*      do nothing      */      }
98
99 //***************************************************************************************************************
100
101 int ChimeraSeqsCommand::execute(){
102         try{
103                 
104                 if (abort == true) { return 0; }
105                 
106                 
107                 //do soft filter
108                 if (filter)  {
109                         string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
110                         filterSeqs = new FilterSeqsCommand(optionString);
111                         filterSeqs->execute();
112                         delete filterSeqs;
113                         
114                         //reset fastafile to filtered file
115                         fastafile = getRootName(fastafile) + "filter.fasta";
116                 }
117                 
118                 distCalculator = new eachGapDist();
119                 
120                 //read in sequences
121                 readSeqs();
122                 
123                 int numSeqs = seqs.size();
124                 
125                 if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; }
126                 
127                 //set default window to 25% of sequence length
128                 string seq0 = seqs[0].getAligned();
129                 if (window == 0) { window = seq0.length() / 4;  }
130                 else if (window > (seq0.length() / 2)) {  
131                         mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
132                         window = (seq0.length() / 2);
133                 }
134                 
135                 if (increment > (seqs[0].getAlignLength() - (2*window))) { 
136                         if (increment != 10) {
137                         
138                                 mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
139                                 increment = 10;
140                                 if (increment > (seqs[0].getAlignLength() - (2*window))) {  increment = 0;  }
141                                 
142                         }else{ increment = 0; }
143                 }
144 cout << "increment = " << increment << endl;            
145                 if (increment == 0) { iters = 1; }
146                 else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
147                 
148                 //initialize pref
149                 pref.resize(numSeqs);  
150                 
151                 for (int i = 0; i < numSeqs; i++ ) { 
152                         pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2);   pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
153                         pref[i].name = seqs[i].getName();
154                         pref[i].score[0] = 0.0;  pref[i].score[1] = 0.0; 
155                         pref[i].closestLeft[0] = 100000.0;  pref[i].closestLeft[1] = 100000.0;  
156                         pref[i].closestRight[0] = 100000.0;  pref[i].closestRight[1] = 100000.0;  
157                 }
158
159                 int midpoint = window;
160                 int count = 0;
161                 while (count < iters) {
162                                 
163                                 //create 2 vectors of sequences, 1 for left side and one for right side
164                                 vector<Sequence> left;  vector<Sequence> right;
165                                 
166                                 for (int i = 0; i < seqs.size(); i++) {
167 //cout << "whole = " << seqs[i].getAligned() << endl;
168                                         //save left side
169                                         string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
170                                         Sequence tempLeft;
171                                         tempLeft.setName(seqs[i].getName());
172                                         tempLeft.setAligned(seqLeft);
173                                         left.push_back(tempLeft);
174 //cout << "left = " << tempLeft.getAligned() << endl;                   
175                                         //save right side
176                                         string seqRight = seqs[i].getAligned().substr(midpoint, window);
177                                         Sequence tempRight;
178                                         tempRight.setName(seqs[i].getName());
179                                         tempRight.setAligned(seqRight);
180                                         right.push_back(tempRight);
181 //cout << "right = " << seqRight << endl;       
182                                 }
183                                 
184                                 //adjust midpoint by increment
185                                 midpoint += increment;
186                                 
187                                 
188                                 //this should be parallelized
189                                 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
190                                 //create a matrix containing the distance from left to left and right to right
191                                 //calculate distances
192                                 SparseMatrix* SparseLeft = new SparseMatrix();
193                                 SparseMatrix* SparseRight = new SparseMatrix();
194                                 
195                                 createSparseMatrix(0, left.size(), SparseLeft, left);
196                                 createSparseMatrix(0, right.size(), SparseRight, right);
197                                 
198                                 vector<SeqMap> distMapRight;
199                                 vector<SeqMap> distMapLeft;
200                                 
201                                 // Create a data structure to quickly access the distance information.
202                                 // It consists of a vector of distance maps, where each map contains
203                                 // all distances of a certain sequence. Vector and maps are accessed
204                                 // via the index of a sequence in the distance matrix
205                                 distMapRight = vector<SeqMap>(numSeqs); 
206                                 distMapLeft = vector<SeqMap>(numSeqs); 
207                                 //cout << "left" << endl << endl;
208                                 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
209                                         distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
210                                         //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
211                                 }
212                                 //cout << "right" << endl << endl;
213                                 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
214                                         distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
215                                         //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
216                                 }
217                                 
218                                 delete SparseLeft;
219                                 delete SparseRight;
220                                 
221                                 
222                                 //fill preference structure
223                                 generatePreferences(distMapLeft, distMapRight, midpoint);
224                                 
225                                 count++;
226                                 
227                 }
228                 
229                 delete distCalculator;
230                 
231                 //find average pref score across windows
232                 //if (increment != 0) {
233                  
234                         //for (int i = 0; i < pref.size(); i++) {
235                                 //pref[i].score[0] = pref[i].score[0] / iters;
236                         //}
237                 //}
238                 
239                 //sort Preferences highest to lowest
240                 sort(pref.begin(), pref.end(), comparePref);
241                 
242                 string outputFileName = getRootName(fastafile) + "chimeras";
243                 ofstream out;
244                 openOutputFile(outputFileName, out);
245                 
246                 int above1 = 0;
247                 out << "Name\tScore\tLeft\tRight\t" << endl;
248                 //output prefenence structure to .chimeras file
249                 for (int i = 0; i < pref.size(); i++) {
250                         out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
251                         
252                         //calc # of seqs with preference above 1.0
253                         if (pref[i].score[0] > 1.0) { 
254                                 above1++; 
255                                 mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
256                                 mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
257                         }
258                         
259                         
260                 }
261                 
262                 //output results to screen
263                 mothurOutEndLine();
264                 mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
265                 int spot;
266                 spot = pref.size()-1;
267                 mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
268                 spot = pref.size() * 0.975;
269                 mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
270                 spot = pref.size() * 0.75;
271                 mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
272                 spot = pref.size() * 0.50;
273                 mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
274                 spot = pref.size() * 0.25;
275                 mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
276                 spot = pref.size() * 0.025;
277                 mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
278                 spot = 0;
279                 mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
280                 
281                 return 0;
282         }
283         catch(exception& e) {
284                 errorOut(e, "ChimeraSeqsCommand", "execute");
285                 exit(1);
286         }
287 }
288
289 //***************************************************************************************************************
290 void ChimeraSeqsCommand::readSeqs(){
291         try {
292                 ifstream inFASTA;
293                 openInputFile(fastafile, inFASTA);
294                 
295                 //read in seqs and store in vector
296                 while(!inFASTA.eof()){
297                         Sequence current(inFASTA);
298                         
299                         if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
300                         
301                         seqs.push_back(current);
302                         
303                         gobble(inFASTA);
304                 }
305                 inFASTA.close();
306
307         }
308         catch(exception& e) {
309                 errorOut(e, "ChimeraSeqsCommand", "readSeqs");
310                 exit(1);
311         }
312 }
313
314 /***************************************************************************************************************/
315 int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
316         try {
317
318                 for(int i=startSeq; i<endSeq; i++){
319                         
320                         for(int j=0;j<i;j++){
321                         
322                                 distCalculator->calcDist(s[i], s[j]);
323                                 float dist = distCalculator->getDist();
324                         
325                                 PCell temp(i, j, dist);
326                                 sparse->addCell(temp);
327                                 
328                         }
329                 }
330                         
331         
332                 return 1;
333         }
334         catch(exception& e) {
335                 errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix");
336                 exit(1);
337         }
338 }
339 /***************************************************************************************************************/
340 void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
341         try {
342                 
343                 float dme = 0.0;
344                 SeqMap::iterator itR;
345                 SeqMap::iterator itL;
346                 
347                 //initialize pref[i]
348                 for (int i = 0; i < pref.size(); i++) {
349                         pref[i].score[1] = 0.0;
350                         pref[i].closestLeft[1] = 100000.0; 
351                         pref[i].closestRight[1] = 100000.0; 
352                         pref[i].leftParent[1] = "";
353                         pref[i].rightParent[1] = "";
354                 }
355
356 //cout << "in generate left.size() = " << left.size() << endl;          
357                 for (int i = 0; i < left.size(); i++) {
358                         
359                         SeqMap currentLeft = left[i];    //example i = 3;   currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
360                                                                                                 //                                                                              1 to the distance of sequence 3 to sequence 1,
361                                                                                                 //                                                                              2 to the distance of sequence 3 to sequence 2.
362                         SeqMap currentRight = right[i];         // same as left but with distances on the right side.
363                         
364                         for (int j = 0; j < i; j++) {
365                                 
366                                 itL = currentLeft.find(j);
367                                 itR = currentRight.find(j);
368 cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
369 cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
370                                 
371                                 //if you can find this entry update the preferences
372                                 if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
373                                 
374                                         if (!correction) {
375                                                 pref[i].score[1] += abs((itL->second - itR->second));
376                                                 pref[j].score[1] += abs((itL->second - itR->second));
377 cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
378 cout << "abs = " << abs((itL->second - itR->second)) << endl;
379 cout << i << " score = " << pref[i].score[1] << endl;
380 cout << j << " score = " << pref[j].score[1] << endl;
381                                         }else {
382                                                 pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
383                                                 pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
384 cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
385 cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
386 cout << i << " score = " << pref[i].score[1] << endl;
387 cout << j << " score = " << pref[j].score[1] << endl;
388                                         }
389 cout << "pref[" << i << "].closestLeft[1] = "   <<      pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;                        
390                                         //are you the closest left sequence
391                                         if (itL->second < pref[i].closestLeft[1]) {  
392
393                                                 pref[i].closestLeft[1] = itL->second;
394                                                 pref[i].leftParent[1] = seqs[j].getName();
395 cout << "updating closest left to " << pref[i].leftParent[1] << endl;
396                                         }
397 cout << "pref[" << j << "].closestLeft[1] = "   <<      pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;        
398                                         if (itL->second < pref[j].closestLeft[1]) { 
399                                                 pref[j].closestLeft[1] = itL->second;
400                                                 pref[j].leftParent[1] = seqs[i].getName();
401 cout << "updating closest left to " << pref[j].leftParent[1] << endl;
402                                         }
403                                         
404                                         //are you the closest right sequence
405                                         if (itR->second < pref[i].closestRight[1]) {   
406                                                 pref[i].closestRight[1] = itR->second;
407                                                 pref[i].rightParent[1] = seqs[j].getName();
408                                         }
409                                         if (itR->second < pref[j].closestRight[1]) {   
410                                                 pref[j].closestRight[1] = itR->second;
411                                                 pref[j].rightParent[1] = seqs[i].getName();
412                                         }
413                                         
414                                 }
415                         }
416                 
417                 }
418                 
419                 
420                   
421                 //calculate the dme
422                 int count0 = 0;
423                 for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) { count0++; }  }
424                 
425                 float expectedPercent = 1 / (float) (pref.size() - count0);
426 cout << endl << "dme = " << dme << endl;
427                 //recalculate prefernences based on dme
428                 for (int i = 0; i < pref.size(); i++) {
429 cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;   
430                         // gives the actual percentage of the dme this seq adds
431                         pref[i].score[1] = pref[i].score[1] / dme;
432                         
433                         //how much higher or lower is this than expected
434                         pref[i].score[1] = pref[i].score[1] / expectedPercent;
435                         
436                         //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
437 cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;                                     
438                 }
439                 
440                 //is this score bigger then the last score
441                 for (int i = 0; i < pref.size(); i++) {  
442                         
443                         //update biggest score
444                         if (pref[i].score[1] > pref[i].score[0]) {
445                                 pref[i].score[0] = pref[i].score[1];
446                                 pref[i].leftParent[0] = pref[i].leftParent[1];
447                                 pref[i].rightParent[0] = pref[i].rightParent[1];
448                                 pref[i].closestLeft[0] = pref[i].closestLeft[1];
449                                 pref[i].closestRight[0] = pref[i].closestRight[1];
450                                 pref[i].midpoint = mid;
451                         }
452                         
453                         //total of preference scores across windows
454                         //pref[i].score[0] += pref[i].score[1];
455                 }
456
457         }
458         catch(exception& e) {
459                 errorOut(e, "ChimeraSeqsCommand", "generatePreferences");
460                 exit(1);
461         }
462 }
463 /**************************************************************************************************/
464
465 /**************************************************************************************************/
466