]> git.donarmstrong.com Git - mothur.git/blob - pintail.cpp
added sorted option to get.rabund command
[mothur.git] / pintail.cpp
1 /*
2  *  pintail.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "pintail.h"
11 #include "ignoregaps.h"
12
13 //***************************************************************************************************************
14
15 Pintail::Pintail(string name) {
16         try {
17                 fastafile = name;
18         }
19         catch(exception& e) {
20                 errorOut(e, "Pintail", "Pintail");
21                 exit(1);
22         }
23 }
24 //***************************************************************************************************************
25
26 Pintail::~Pintail() {
27         try {
28                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
29                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
30         }
31         catch(exception& e) {
32                 errorOut(e, "Pintail", "~Pintail");
33                 exit(1);
34         }
35 }       
36 //***************************************************************************************************************
37 void Pintail::print(ostream& out) {
38         try {
39                 
40                 for (itCoef = DE.begin(); itCoef != DE.end(); itCoef++) {
41                 
42                         out << itCoef->first->getName() << '\t' << itCoef->second << endl;
43                         out << "Observed\t";
44                         
45                         itObsDist = obsDistance.find(itCoef->first);
46                         for (int i = 0; i < itObsDist->second.size(); i++) {  out << itObsDist->second[i] << '\t';  }
47                         out << endl;
48                         
49                         out << "Expected\t";
50                         
51                         itExpDist = expectedDistance.find(itCoef->first);
52                         for (int i = 0; i < itExpDist->second.size(); i++) {  out << itExpDist->second[i] << '\t';  }
53                         out << endl;
54                         
55                 }
56
57         }
58         catch(exception& e) {
59                 errorOut(e, "Pintail", "print");
60                 exit(1);
61         }
62 }
63
64 //***************************************************************************************************************
65 void Pintail::getChimeras() {
66         try {
67                 
68                 distCalculator = new ignoreGaps();
69                 
70                 //read in query sequences and subject sequences
71                 mothurOut("Reading sequences and template file... "); cout.flush();
72                 querySeqs = readSeqs(fastafile);
73                 templateSeqs = readSeqs(templateFile);
74                 mothurOut("Done."); mothurOutEndLine();
75                 
76                 int numSeqs = querySeqs.size();
77                 
78                 //if window is set to default
79                 if (window == 0) {  if (querySeqs[0]->getAligned().length() > 800) {  setWindow(200); }
80                                                         else{  setWindow((querySeqs[0]->getAligned().length() / 4)); }  } 
81         
82                 //calculate number of iters
83                 iters = (querySeqs[0]->getAligned().length() - window + 1) / increment;
84                 
85                 int linesPerProcess = processors / numSeqs;
86                 
87                 //find breakup of sequences for all times we will Parallelize
88                 if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
89                 else {
90                         //fill line pairs
91                         for (int i = 0; i < (processors-1); i++) {                      
92                                 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
93                         }
94                         //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
95                         int i = processors - 1;
96                         lines.push_back(new linePair((i*linesPerProcess), numSeqs));
97                 }
98                 
99                 //map query sequences to their most similiar sequences in the template - Parallelized
100                 mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
101                 if (processors == 1) {   findPairs(lines[0]->start, lines[0]->end);  } 
102                 else {          createProcessesPairs();         }
103                 mothurOut("Done."); mothurOutEndLine();
104         //      itBest = bestfit.begin();
105         //      itBest++; itBest++;
106 //cout << itBest->first->getName() << '\t' << itBest->second->getName() << endl;        
107                 //find Oqs for each sequence - the observed distance in each window - Parallelized
108                 mothurOut("Calculating observed percentage differences for each sequence... "); cout.flush();
109                 if (processors == 1) {   calcObserved(lines[0]->start, lines[0]->end);  } 
110                 else {          createProcessesObserved();              }
111                 mothurOut("Done."); mothurOutEndLine();
112                 
113                 //find P
114                 mothurOut("Calculating expected percentage differences for each sequence... "); cout.flush();
115                 vector<float> probabilityProfile = calcFreq(templateSeqs);
116                 
117                 //make P into Q
118                 for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];      }
119 //cout << "after p into Q" << endl;             
120                 //find Qav
121                 averageProbability = findQav(probabilityProfile);
122 //cout << "after Qav" << endl;          
123                 //find Coefficient - maps a sequence to its coefficient
124                 seqCoef = getCoef(averageProbability);
125 //cout << "after coef" << endl;         
126                 //find Eqs for each sequence - the expected distance in each window - Parallelized
127                 if (processors == 1) {   calcExpected(lines[0]->start, lines[0]->end);  } 
128                 else {          createProcessesExpected();              }
129                 mothurOut("Done."); mothurOutEndLine();
130                 
131                 //find deviation - Parallelized
132                 mothurOut("Finding deviation from expected... "); cout.flush();
133                 if (processors == 1) {   calcDE(lines[0]->start, lines[0]->end);  } 
134                 else {          createProcessesDE();            }
135                 mothurOut("Done."); mothurOutEndLine();
136                 
137                                 
138                 //free memory
139                 for (int i = 0; i < lines.size(); i++)                  {       delete lines[i];                }
140                 delete distCalculator;  
141
142         }
143         catch(exception& e) {
144                 errorOut(e, "Pintail", "getChimeras");
145                 exit(1);
146         }
147 }
148
149 //***************************************************************************************************************
150
151 vector<Sequence*> Pintail::readSeqs(string file) {
152         try {
153         
154                 ifstream in;
155                 openInputFile(file, in);
156                 vector<Sequence*> container;
157                 
158                 //read in seqs and store in vector
159                 while(!in.eof()){
160                         Sequence* current = new Sequence(in);
161                         
162                         if (current->getAligned() == "") { current->setAligned(current->getUnaligned()); }
163                         
164                         container.push_back(current);
165                         
166                         gobble(in);
167                 }
168                 
169                 in.close();
170                 return container;
171                 
172         }
173         catch(exception& e) {
174                 errorOut(e, "Pintail", "readSeqs");
175                 exit(1);
176         }
177 }
178
179 //***************************************************************************************************************
180 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
181 void Pintail::findPairs(int start, int end) {
182         try {
183                 
184                 for(int i = start; i < end; i++){
185                 
186                         float smallest = 10000.0;
187                         Sequence query = *(querySeqs[i]);
188                 
189                         for(int j = 0; j < templateSeqs.size(); j++){
190                                 
191                                 Sequence temp = *(templateSeqs[j]);
192                                 
193                                 distCalculator->calcDist(query, temp);
194                                 float dist = distCalculator->getDist();
195                                 
196                                 if (dist < smallest) { 
197                                 
198                                         bestfit[querySeqs[i]] = templateSeqs[j];
199                                         smallest = dist;
200                                 }
201                         }
202                 }
203                 
204         }
205         catch(exception& e) {
206                 errorOut(e, "Pintail", "findPairs");
207                 exit(1);
208         }
209 }
210 //***************************************************************************************************************
211 void Pintail::calcObserved(int start, int end) {
212         try {
213
214                 for(int i = start; i < end; i++){
215                 
216                         itBest = bestfit.find(querySeqs[i]);
217                         Sequence* query;
218                         Sequence* subject;
219                         
220                         if (itBest != bestfit.end()) {
221                                 query = itBest->first;
222                                 subject = itBest->second;
223                         }else{ mothurOut("Error in calcObserved"); mothurOutEndLine(); }
224                         
225                         vector<Sequence*> queryFragment;
226                         vector<Sequence*> subjectFragment;
227                         
228                         int startpoint = 0; 
229                         for (int m = 0; m < iters; m++) {
230                                 string seqFrag = query->getAligned().substr(startpoint, window);
231                                 Sequence* temp = new Sequence(query->getName(), seqFrag);
232                                 queryFragment.push_back(temp);
233                                 
234                                 string seqFragsub = subject->getAligned().substr(startpoint, window);
235                                 Sequence* tempsub = new Sequence(subject->getName(), seqFragsub);
236                                 subjectFragment.push_back(tempsub);
237                                 
238                                 startpoint += increment;
239                         }
240                         
241                         
242                         for(int j = 0; j < iters; j++){
243                         
244                                 distCalculator->calcDist(*(queryFragment[j]), *(subjectFragment[j]));
245                                 float dist = distCalculator->getDist();
246                                 
247                                 //convert to similiarity
248                                 //dist = 1 - dist;
249                                 
250                                 //save oi
251                                 obsDistance[query].push_back(dist);
252                         }
253                 
254                         //free memory
255                         for (int i = 0; i < queryFragment.size(); i++)  {  delete queryFragment[i];             }
256                         for (int i = 0; i < subjectFragment.size(); i++) {  delete subjectFragment[i];  }
257                 }
258                 
259         }
260         catch(exception& e) {
261                 errorOut(e, "Pintail", "calcObserved");
262                 exit(1);
263         }
264 }
265
266 //***************************************************************************************************************
267 void Pintail::calcExpected(int start, int end) {
268         try {
269                 
270         
271                 //for each sequence
272                 for(int i = start; i < end; i++){
273                         
274                         itCoef = seqCoef.find(querySeqs[i]);
275                         float coef = itCoef->second;
276                         
277                         //for each window
278                         vector<float> queryExpected;
279                         for (int m = 0; m < iters; m++) {               
280                                 float expected = averageProbability[m] * coef;
281                                 queryExpected.push_back(expected);              
282                         }
283                         
284                         expectedDistance[querySeqs[i]] = queryExpected;
285                         
286                 }
287                                 
288         }
289         catch(exception& e) {
290                 errorOut(e, "Pintail", "calcExpected");
291                 exit(1);
292         }
293 }
294 //***************************************************************************************************************
295 void Pintail::calcDE(int start, int end) {
296         try {
297                 
298         
299                 //for each sequence
300                 for(int i = start; i < end; i++){
301                         
302                         itObsDist = obsDistance.find(querySeqs[i]);
303                         vector<float> obs = itObsDist->second;
304                         
305                         itExpDist = expectedDistance.find(querySeqs[i]);
306                         vector<float> exp = itExpDist->second;
307                         
308                         //for each window
309                         float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
310                         for (int m = 0; m < iters; m++) {               sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));         }
311                         
312                         float de = sqrt((sum / (iters - 1)));
313                         
314                         DE[querySeqs[i]] = de;
315                 }
316                                 
317         }
318         catch(exception& e) {
319                 errorOut(e, "Pintail", "calcDE");
320                 exit(1);
321         }
322 }
323
324 //***************************************************************************************************************
325
326 vector<float> Pintail::calcFreq(vector<Sequence*> seqs) {
327         try {
328
329                 vector<float> prob;
330                 
331                 //at each position in the sequence
332                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
333                 
334                         vector<int> freq;   freq.resize(4,0);
335                         
336                         //find the frequency of each nucleotide
337                         for (int j = 0; j < seqs.size(); j++) {
338                                 
339                                 char value = seqs[j]->getAligned()[i];
340                         
341                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
342                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
343                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
344                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
345                         }
346                         
347                         //find base with highest frequency
348                         int highest = 0;
349                         for (int m = 0; m < freq.size(); m++) {    if (freq[m] > highest) {  highest = freq[m];  }              }
350                                 
351                         float highFreq = highest / (float) seqs.size(); 
352                         float Pi;
353                         //if this is not a gap column
354                         if (highFreq != 0) { Pi =  (highFreq - 0.25) / 0.75;  }
355                         else { Pi = 0; }
356                         
357                         prob.push_back(Pi);
358                         
359                 }
360                 
361                 return prob;
362                                 
363         }
364         catch(exception& e) {
365                 errorOut(e, "Pintail", "calcFreq");
366                 exit(1);
367         }
368 }
369 //***************************************************************************************************************
370 vector<float> Pintail::findQav(vector<float> prob) {
371         try {
372                 vector<float> averages;
373                 
374                 //for each window find average
375                 int startpoint = 0;
376                 for (int m = 0; m < iters; m++) {
377                         
378                         float average = 0.0;
379                         for (int i = startpoint; i < (startpoint+window); i++) {   average += prob[i];  }
380                         
381                         average = average / window;
382                         
383                         //save this windows average
384                         averages.push_back(average);
385                 
386                         startpoint += increment;
387                 }
388                 
389                 return averages;
390         }
391         catch(exception& e) {
392                 errorOut(e, "Pintail", "findQav");
393                 exit(1);
394         }
395 }
396 //***************************************************************************************************************
397 map<Sequence*, float> Pintail::getCoef(vector<float> prob) {
398         try {
399                 map<Sequence*, float> coefs;
400                 
401                 //find average prob
402                 float probAverage = 0.0;
403                 for (int i = 0; i < prob.size(); i++) {   probAverage += prob[i];       }
404                 probAverage = probAverage / (float) prob.size();
405                 
406                 //find a coef for each sequence
407                 map<Sequence*, vector<float> >::iterator it;
408                 for (it = obsDistance.begin(); it != obsDistance.end(); it++) {
409                         
410                         vector<float> temp = it->second;
411                         Sequence* tempSeq = it->first;
412                         
413                         //find observed average 
414                         float obsAverage = 0.0;
415                         for (int i = 0; i < temp.size(); i++) {   obsAverage += temp[i];        }
416                         obsAverage = obsAverage / (float) temp.size();
417                         
418                         float coef = obsAverage / probAverage;
419                         
420                         //save this sequences coefficient
421                         coefs[tempSeq] = coef;
422                 }
423                 
424                                                 
425                 return coefs;
426         }
427         catch(exception& e) {
428                 errorOut(e, "Pintail", "getCoef");
429                 exit(1);
430         }
431 }
432
433 /**************************************************************************************************/
434
435 void Pintail::createProcessesPairs() {
436         try {
437 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
438                 int process = 0;
439                 vector<int> processIDS;
440                 
441                 //loop through and create all the processes you want
442                 while (process != processors) {
443                         int pid = fork();
444                         
445                         if (pid > 0) {
446                                 processIDS.push_back(pid);  
447                                 process++;
448                         }else if (pid == 0){
449                                 findPairs(lines[process]->start, lines[process]->end);
450                                 exit(0);
451                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
452                 }
453                 
454                 //force parent to wait until all the processes are done
455                 for (int i=0;i<processors;i++) { 
456                         int temp = processIDS[i];
457                         wait(&temp);
458                 }
459                 
460 #else
461                 findPairs(lines[0]->start, lines[0]->end);
462
463 #endif          
464         }
465         catch(exception& e) {
466                 errorOut(e, "Pintail", "createProcessesPairs");
467                 exit(1);
468         }
469 }
470
471 /**************************************************************************************************/
472
473 void Pintail::createProcessesObserved() {
474         try {
475 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
476                 int process = 0;
477                 vector<int> processIDS;
478                 
479                 //loop through and create all the processes you want
480                 while (process != processors) {
481                         int pid = fork();
482                         
483                         if (pid > 0) {
484                                 processIDS.push_back(pid);  
485                                 process++;
486                         }else if (pid == 0){
487                                 calcObserved(lines[process]->start, lines[process]->end);
488                                 exit(0);
489                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
490                 }
491                 
492                 //force parent to wait until all the processes are done
493                 for (int i=0;i<processors;i++) { 
494                         int temp = processIDS[i];
495                         wait(&temp);
496                 }
497                 
498 #else
499                 calcObserved(lines[0]->start, lines[0]->end);
500
501 #endif          
502         }
503         catch(exception& e) {
504                 errorOut(e, "Pintail", "createProcessesObserved");
505                 exit(1);
506         }
507 }
508
509 //***************************************************************************************************************
510
511 void Pintail::createProcessesExpected() {
512         try {
513 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
514                 int process = 0;
515                 vector<int> processIDS;
516                 
517                 //loop through and create all the processes you want
518                 while (process != processors) {
519                         int pid = fork();
520                         
521                         if (pid > 0) {
522                                 processIDS.push_back(pid);  
523                                 process++;
524                         }else if (pid == 0){
525                                 calcExpected(lines[process]->start, lines[process]->end);
526                                 exit(0);
527                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
528                 }
529                 
530                 //force parent to wait until all the processes are done
531                 for (int i=0;i<processors;i++) { 
532                         int temp = processIDS[i];
533                         wait(&temp);
534                 }
535                 
536 #else
537                 calcExpected(lines[0]->start, lines[0]->end);
538
539 #endif          
540         }
541         catch(exception& e) {
542                 errorOut(e, "Pintail", "createProcessesExpected");
543                 exit(1);
544         }
545 }
546
547 /**************************************************************************************************/
548
549 void Pintail::createProcessesDE() {
550         try {
551 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
552                 int process = 0;
553                 vector<int> processIDS;
554                 
555                 //loop through and create all the processes you want
556                 while (process != processors) {
557                         int pid = fork();
558                         
559                         if (pid > 0) {
560                                 processIDS.push_back(pid);  
561                                 process++;
562                         }else if (pid == 0){
563                                 calcDE(lines[process]->start, lines[process]->end);
564                                 exit(0);
565                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
566                 }
567                 
568                 //force parent to wait until all the processes are done
569                 for (int i=0;i<processors;i++) { 
570                         int temp = processIDS[i];
571                         wait(&temp);
572                 }
573                 
574 #else
575                 calcDE(lines[0]->start, lines[0]->end);
576
577 #endif          
578         }
579         catch(exception& e) {
580                 errorOut(e, "Pintail", "createProcessesDE");
581                 exit(1);
582         }
583 }
584
585 //***************************************************************************************************************
586
587