]> git.donarmstrong.com Git - mothur.git/blob - chimeraseqscommand.cpp
a334ea31ccc2a7fd0334f687d3f84c16c00d3494
[mothur.git] / chimeraseqscommand.cpp
1 /*
2  *  chimeraseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 6/29/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "chimeraseqscommand.h"
11 #include "eachgapdist.h"
12 #include "ignoregaps.h"
13 #include "onegapdist.h"
14
15 //***************************************************************************************************************
16
17 ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
18         try {
19                 abort = false;
20                 
21                 //allow user to run help
22                 if(option == "help") { help(); abort = true; }
23                 
24                 else {
25                         //valid paramters for this command
26                         string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment" };
27                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28                         
29                         OptionParser parser(option);
30                         map<string,string> parameters = parser.getParameters();
31                         
32                         ValidParameters validParameter;
33                         
34                         //check to make sure all parameters are valid for command
35                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //check for required parameters
40                         fastafile = validParameter.validFile(parameters, "fasta", true);
41                         if (fastafile == "not open") { abort = true; }
42                         else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true;  }     
43                         
44                         string temp;
45                         temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "T"; }
46                         filter = isTrue(temp);
47                         
48                         temp = validParameter.validFile(parameters, "correction", false);               if (temp == "not found") { temp = "T"; }
49                         correction = isTrue(temp);
50                         
51                         temp = validParameter.validFile(parameters, "processors", false);               if (temp == "not found") { temp = "1"; }
52                         convert(temp, processors);
53                         
54                         temp = validParameter.validFile(parameters, "window", false);                   if (temp == "not found") { temp = "0"; }
55                         convert(temp, window);
56                                         
57                         temp = validParameter.validFile(parameters, "increment", false);                        if (temp == "not found") { temp = "10"; }
58                         convert(temp, increment);
59                                 
60                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "bellerophon"; }
61                         
62                         if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine();  abort = true; }
63
64                 }
65         }
66         catch(exception& e) {
67                 errorOut(e, "ChimeraSeqsCommand", "ChimeraSeqsCommand");
68                 exit(1);
69         }
70 }
71 //**********************************************************************************************************************
72
73 void ChimeraSeqsCommand::help(){
74         try {
75                 mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
76                 mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method.  fasta is required.\n");
77                 mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter.  The default is false. \n");
78                 mothurOut("The correction parameter allows you to .....  The default is true. \n");
79                 mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
80                 mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is bellerophon. \n");
81                 mothurOut("The chimera.seqs command should be in the following format: \n");
82                 mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
83                 mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
84                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");        
85         }
86         catch(exception& e) {
87                 errorOut(e, "ChimeraSeqsCommand", "help");
88                 exit(1);
89         }
90 }
91 //********************************************************************************************************************
92 //sorts highest score to lowest
93 inline bool comparePref(Preference left, Preference right){
94         return (left.score[0] > right.score[0]);        
95 }
96
97 //***************************************************************************************************************
98
99 ChimeraSeqsCommand::~ChimeraSeqsCommand(){      /*      do nothing      */      }
100
101 //***************************************************************************************************************
102
103 int ChimeraSeqsCommand::execute(){
104         try{
105                 
106                 if (abort == true) { return 0; }
107                 
108                 
109                 //do soft filter
110                 if (filter)  {
111                         string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
112                         filterSeqs = new FilterSeqsCommand(optionString);
113                         filterSeqs->execute();
114                         delete filterSeqs;
115                         
116                         //reset fastafile to filtered file
117                         fastafile = getRootName(fastafile) + "filter.fasta";
118                 }
119                 
120                 distCalculator = new eachGapDist();
121                 
122                 //read in sequences
123                 readSeqs();
124                 
125                 int numSeqs = seqs.size();
126                 
127                 if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; }
128                 
129                 //set default window to 25% of sequence length
130                 string seq0 = seqs[0].getAligned();
131                 if (window == 0) { window = seq0.length() / 4;  }
132                 else if (window > (seq0.length() / 2)) {  
133                         mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
134                         window = (seq0.length() / 2);
135                 }
136                 
137                 if (increment > (seqs[0].getAlignLength() - (2*window))) { 
138                         if (increment != 10) {
139                         
140                                 mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
141                                 increment = 10;
142                                 if (increment > (seqs[0].getAlignLength() - (2*window))) {  increment = 0;  }
143                                 
144                         }else{ increment = 0; }
145                 }
146 cout << "increment = " << increment << endl;            
147                 if (increment == 0) { iters = 1; }
148                 else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
149                 
150                 //initialize pref
151                 pref.resize(numSeqs);  
152                 
153                 for (int i = 0; i < numSeqs; i++ ) { 
154                         pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2);   pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
155                         pref[i].name = seqs[i].getName();
156                         pref[i].score[0] = 0.0;  pref[i].score[1] = 0.0; 
157                         pref[i].closestLeft[0] = 100000.0;  pref[i].closestLeft[1] = 100000.0;  
158                         pref[i].closestRight[0] = 100000.0;  pref[i].closestRight[1] = 100000.0;  
159                 }
160
161                 int midpoint = window;
162                 int count = 0;
163                 while (count < iters) {
164                                 
165                                 //create 2 vectors of sequences, 1 for left side and one for right side
166                                 vector<Sequence> left;  vector<Sequence> right;
167                                 
168                                 for (int i = 0; i < seqs.size(); i++) {
169 //cout << "whole = " << seqs[i].getAligned() << endl;
170                                         //save left side
171                                         string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
172                                         Sequence tempLeft;
173                                         tempLeft.setName(seqs[i].getName());
174                                         tempLeft.setAligned(seqLeft);
175                                         left.push_back(tempLeft);
176 //cout << "left = " << tempLeft.getAligned() << endl;                   
177                                         //save right side
178                                         string seqRight = seqs[i].getAligned().substr(midpoint, window);
179                                         Sequence tempRight;
180                                         tempRight.setName(seqs[i].getName());
181                                         tempRight.setAligned(seqRight);
182                                         right.push_back(tempRight);
183 //cout << "right = " << seqRight << endl;       
184                                 }
185                                 
186                                 //adjust midpoint by increment
187                                 midpoint += increment;
188                                 
189                                 
190                                 //this should be parallelized
191                                 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
192                                 //create a matrix containing the distance from left to left and right to right
193                                 //calculate distances
194                                 SparseMatrix* SparseLeft = new SparseMatrix();
195                                 SparseMatrix* SparseRight = new SparseMatrix();
196                                 
197                                 createSparseMatrix(0, left.size(), SparseLeft, left);
198                                 createSparseMatrix(0, right.size(), SparseRight, right);
199                                 
200                                 vector<SeqMap> distMapRight;
201                                 vector<SeqMap> distMapLeft;
202                                 
203                                 // Create a data structure to quickly access the distance information.
204                                 // It consists of a vector of distance maps, where each map contains
205                                 // all distances of a certain sequence. Vector and maps are accessed
206                                 // via the index of a sequence in the distance matrix
207                                 distMapRight = vector<SeqMap>(numSeqs); 
208                                 distMapLeft = vector<SeqMap>(numSeqs); 
209                                 //cout << "left" << endl << endl;
210                                 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
211                                         distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
212                                         //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
213                                 }
214                                 //cout << "right" << endl << endl;
215                                 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
216                                         distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
217                                         //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
218                                 }
219                                 
220                                 delete SparseLeft;
221                                 delete SparseRight;
222                                 
223                                 
224                                 //fill preference structure
225                                 generatePreferences(distMapLeft, distMapRight, midpoint);
226                                 
227                                 count++;
228                                 
229                 }
230                 
231                 delete distCalculator;
232                 
233                 //rank preference score to eachother
234                 float dme = 0.0;
235                 float expectedPercent = 1 / (float) (pref.size());
236                 
237                 for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[0];  }
238         
239                 for (int i = 0; i < pref.size(); i++) {
240
241                         //gives the actual percentage of the dme this seq adds
242                         pref[i].score[0] = pref[i].score[0] / dme;
243                         
244                         //how much higher or lower is this than expected
245                         pref[i].score[0] = pref[i].score[0] / expectedPercent;
246                         
247                 }
248                 
249                 
250                 //sort Preferences highest to lowest
251                 sort(pref.begin(), pref.end(), comparePref);
252                 
253                 string outputFileName = getRootName(fastafile) + "chimeras";
254                 ofstream out;
255                 openOutputFile(outputFileName, out);
256                 
257                 int above1 = 0;
258                 out << "Name\tScore\tLeft\tRight\t" << endl;
259                 //output prefenence structure to .chimeras file
260                 for (int i = 0; i < pref.size(); i++) {
261                         out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
262                         
263                         //calc # of seqs with preference above 1.0
264                         if (pref[i].score[0] > 1.0) { 
265                                 above1++; 
266                                 mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
267                                 mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
268                         }
269                         
270                         
271                 }
272                 
273                 //output results to screen
274                 mothurOutEndLine();
275                 mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
276                 int spot;
277                 spot = pref.size()-1;
278                 mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
279                 spot = pref.size() * 0.975;
280                 mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
281                 spot = pref.size() * 0.75;
282                 mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
283                 spot = pref.size() * 0.50;
284                 mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
285                 spot = pref.size() * 0.25;
286                 mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
287                 spot = pref.size() * 0.025;
288                 mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
289                 spot = 0;
290                 mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
291                 
292                 return 0;
293         }
294         catch(exception& e) {
295                 errorOut(e, "ChimeraSeqsCommand", "execute");
296                 exit(1);
297         }
298 }
299
300 //***************************************************************************************************************
301 void ChimeraSeqsCommand::readSeqs(){
302         try {
303                 ifstream inFASTA;
304                 openInputFile(fastafile, inFASTA);
305                 
306                 //read in seqs and store in vector
307                 while(!inFASTA.eof()){
308                         Sequence current(inFASTA);
309                         
310                         if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
311                         
312                         seqs.push_back(current);
313                         
314                         gobble(inFASTA);
315                 }
316                 inFASTA.close();
317
318         }
319         catch(exception& e) {
320                 errorOut(e, "ChimeraSeqsCommand", "readSeqs");
321                 exit(1);
322         }
323 }
324
325 /***************************************************************************************************************/
326 int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
327         try {
328
329                 for(int i=startSeq; i<endSeq; i++){
330                         
331                         for(int j=0;j<i;j++){
332                         
333                                 distCalculator->calcDist(s[i], s[j]);
334                                 float dist = distCalculator->getDist();
335                         
336                                 PCell temp(i, j, dist);
337                                 sparse->addCell(temp);
338                                 
339                         }
340                 }
341                         
342         
343                 return 1;
344         }
345         catch(exception& e) {
346                 errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix");
347                 exit(1);
348         }
349 }
350 /***************************************************************************************************************/
351 void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
352         try {
353                 
354                 float dme = 0.0;
355                 SeqMap::iterator itR;
356                 SeqMap::iterator itL;
357                 
358                 //initialize pref[i]
359                 for (int i = 0; i < pref.size(); i++) {
360                         pref[i].score[1] = 0.0;
361                         pref[i].closestLeft[1] = 100000.0; 
362                         pref[i].closestRight[1] = 100000.0; 
363                         pref[i].leftParent[1] = "";
364                         pref[i].rightParent[1] = "";
365                 }
366         
367                 for (int i = 0; i < left.size(); i++) {
368                         
369                         SeqMap currentLeft = left[i];    //example i = 3;   currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
370                                                                                                 //                                                                              1 to the distance of sequence 3 to sequence 1,
371                                                                                                 //                                                                              2 to the distance of sequence 3 to sequence 2.
372                         SeqMap currentRight = right[i];         // same as left but with distances on the right side.
373                         
374                         for (int j = 0; j < i; j++) {
375                                 
376                                 itL = currentLeft.find(j);
377                                 itR = currentRight.find(j);
378 //cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
379 //cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
380                                 
381                                 //if you can find this entry update the preferences
382                                 if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
383                                 
384                                         if (!correction) {
385                                                 pref[i].score[1] += abs((itL->second - itR->second));
386                                                 pref[j].score[1] += abs((itL->second - itR->second));
387 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
388 //cout << "abs = " << abs((itL->second - itR->second)) << endl;
389 //cout << i << " score = " << pref[i].score[1] << endl;
390 //cout << j << " score = " << pref[j].score[1] << endl;
391                                         }else {
392                                                 pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
393                                                 pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
394 //cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
395 //cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
396 //cout << i << " score = " << pref[i].score[1] << endl;
397 //cout << j << " score = " << pref[j].score[1] << endl;
398                                         }
399 //cout << "pref[" << i << "].closestLeft[1] = " <<      pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;                        
400                                         //are you the closest left sequence
401                                         if (itL->second < pref[i].closestLeft[1]) {  
402
403                                                 pref[i].closestLeft[1] = itL->second;
404                                                 pref[i].leftParent[1] = seqs[j].getName();
405 //cout << "updating closest left to " << pref[i].leftParent[1] << endl;
406                                         }
407 //cout << "pref[" << j << "].closestLeft[1] = " <<      pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;        
408                                         if (itL->second < pref[j].closestLeft[1]) { 
409                                                 pref[j].closestLeft[1] = itL->second;
410                                                 pref[j].leftParent[1] = seqs[i].getName();
411 //cout << "updating closest left to " << pref[j].leftParent[1] << endl;
412                                         }
413                                         
414                                         //are you the closest right sequence
415                                         if (itR->second < pref[i].closestRight[1]) {   
416                                                 pref[i].closestRight[1] = itR->second;
417                                                 pref[i].rightParent[1] = seqs[j].getName();
418                                         }
419                                         if (itR->second < pref[j].closestRight[1]) {   
420                                                 pref[j].closestRight[1] = itR->second;
421                                                 pref[j].rightParent[1] = seqs[i].getName();
422                                         }
423                                         
424                                 }
425                         }
426                 
427                 }
428                 
429                 
430                   
431                 //calculate the dme
432                 
433                 int count0 = 0;
434                 for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) { count0++; }  }
435                 
436                 float expectedPercent = 1 / (float) (pref.size() - count0);
437 //cout << endl << "dme = " << dme << endl;
438                 //recalculate prefernences based on dme
439                 for (int i = 0; i < pref.size(); i++) {
440 //cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl; 
441                         // gives the actual percentage of the dme this seq adds
442                         pref[i].score[1] = pref[i].score[1] / dme;
443                         
444                         //how much higher or lower is this than expected
445                         pref[i].score[1] = pref[i].score[1] / expectedPercent;
446                         
447                         //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
448                         
449                         //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
450 //cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;                                   
451                 }
452                 
453                 //is this score bigger then the last score
454                 for (int i = 0; i < pref.size(); i++) {  
455                         
456                         //update biggest score
457                         if (pref[i].score[1] > pref[i].score[0]) {
458                                 pref[i].score[0] = pref[i].score[1];
459                                 pref[i].leftParent[0] = pref[i].leftParent[1];
460                                 pref[i].rightParent[0] = pref[i].rightParent[1];
461                                 pref[i].closestLeft[0] = pref[i].closestLeft[1];
462                                 pref[i].closestRight[0] = pref[i].closestRight[1];
463                                 pref[i].midpoint = mid;
464                         }
465                         
466                 }
467
468         }
469         catch(exception& e) {
470                 errorOut(e, "ChimeraSeqsCommand", "generatePreferences");
471                 exit(1);
472         }
473 }
474 /**************************************************************************************************/
475
476 /**************************************************************************************************/
477