]> git.donarmstrong.com Git - mothur.git/blob - pintail.cpp
*** empty log message ***
[mothur.git] / pintail.cpp
1 /*
2  *  pintail.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "pintail.h"
11 #include "ignoregaps.h"
12
13 //***************************************************************************************************************
14
15 Pintail::Pintail(string name) {
16         try {
17                 fastafile = name;
18         }
19         catch(exception& e) {
20                 errorOut(e, "Pintail", "Pintail");
21                 exit(1);
22         }
23 }
24 //***************************************************************************************************************
25
26 Pintail::~Pintail() {
27         try {
28                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
29                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
30         }
31         catch(exception& e) {
32                 errorOut(e, "Pintail", "~Pintail");
33                 exit(1);
34         }
35 }       
36 //***************************************************************************************************************
37 void Pintail::print(ostream& out) {
38         try {
39                 
40                 for (itCoef = DE.begin(); itCoef != DE.end(); itCoef++) {
41                         
42                         out << itCoef->first->getName() << '\t' << itCoef->second << endl;
43                         out << "Observed\t";
44                         
45                         itObsDist = obsDistance.find(itCoef->first);
46                         for (int i = 0; i < itObsDist->second.size(); i++) {  out << itObsDist->second[i] << '\t';  }
47                         out << endl;
48                         
49                         out << "Expected\t";
50                         
51                         itExpDist = expectedDistance.find(itCoef->first);
52                         for (int i = 0; i < itExpDist->second.size(); i++) {  out << itExpDist->second[i] << '\t';  }
53                         out << endl;
54                         
55                 }
56                 
57                 
58         }
59         catch(exception& e) {
60                 errorOut(e, "Pintail", "print");
61                 exit(1);
62         }
63 }
64
65 //***************************************************************************************************************
66 void Pintail::getChimeras() {
67         try {
68                 
69                 distCalculator = new ignoreGaps();
70                 
71                 //read in query sequences and subject sequences
72                 mothurOut("Reading sequences and template file... "); cout.flush();
73                 querySeqs = readSeqs(fastafile);
74                 templateSeqs = readSeqs(templateFile);
75                 mothurOut("Done."); mothurOutEndLine();
76                 
77                 int numSeqs = querySeqs.size();
78                 
79                 //if window is set to default
80                 if (window == 0) {  if (querySeqs[0]->getAligned().length() > 800) {  setWindow(200); }
81                                                         else{  setWindow((querySeqs[0]->getAligned().length() / 4)); }  } 
82                 else if (window > (querySeqs[0]->getAligned().length() / 4)) { 
83                                 mothurOut("You have selected to large a window size for you sequences.  I will choose a smaller window."); mothurOutEndLine();
84                                 setWindow((querySeqs[0]->getAligned().length() / 4)); 
85                 }
86         
87                 //calculate number of iters 
88                 iters = (querySeqs[0]->getAligned().length() - window + 1) / increment;
89 cout << "length = " << querySeqs[0]->getAligned().length() << " window = " << window << " increment = " << increment << " iters = " << iters << endl;                   
90                 int linesPerProcess = processors / numSeqs;
91                 
92                 //find breakup of sequences for all times we will Parallelize
93                 if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
94                 else {
95                         //fill line pairs
96                         for (int i = 0; i < (processors-1); i++) {                      
97                                 lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
98                         }
99                         //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
100                         int i = processors - 1;
101                         lines.push_back(new linePair((i*linesPerProcess), numSeqs));
102                 }
103                 
104                 //map query sequences to their most similiar sequences in the template - Parallelized
105                 mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
106                 if (processors == 1) {   findPairs(lines[0]->start, lines[0]->end);  } 
107                 else {          createProcessesPairs();         }
108                 mothurOut("Done."); mothurOutEndLine();
109                 
110                 //find Oqs for each sequence - the observed distance in each window - Parallelized
111                 mothurOut("Calculating observed percentage differences for each sequence... "); cout.flush();
112                 if (processors == 1) {   calcObserved(lines[0]->start, lines[0]->end);  } 
113                 else {          createProcessesObserved();              }
114                 mothurOut("Done."); mothurOutEndLine();
115                 
116                 //find P
117                 mothurOut("Calculating expected percentage differences for each sequence... "); cout.flush();
118                 vector<float> probabilityProfile = calcFreq(templateSeqs);
119                 
120                 //make P into Q
121                 for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];      }
122
123                 //find Qav
124                 averageProbability = findQav(probabilityProfile);
125         
126                 //find Coefficient - maps a sequence to its coefficient
127                 seqCoef = getCoef(averageProbability);
128         
129                 //find Eqs for each sequence - the expected distance in each window - Parallelized
130                 if (processors == 1) {   calcExpected(lines[0]->start, lines[0]->end);  } 
131                 else {          createProcessesExpected();              }
132                 mothurOut("Done."); mothurOutEndLine();
133                 
134                 //find deviation - Parallelized
135                 mothurOut("Finding deviation from expected... "); cout.flush();
136                 if (processors == 1) {   calcDE(lines[0]->start, lines[0]->end);  } 
137                 else {          createProcessesDE();            }
138                 mothurOut("Done."); mothurOutEndLine();
139                 
140                                 
141                 //free memory
142                 for (int i = 0; i < lines.size(); i++)                  {       delete lines[i];                }
143                 delete distCalculator;  
144
145         }
146         catch(exception& e) {
147                 errorOut(e, "Pintail", "getChimeras");
148                 exit(1);
149         }
150 }
151
152 //***************************************************************************************************************
153
154 vector<Sequence*> Pintail::readSeqs(string file) {
155         try {
156         
157                 ifstream in;
158                 openInputFile(file, in);
159                 vector<Sequence*> container;
160                 
161                 //read in seqs and store in vector
162                 while(!in.eof()){
163                         Sequence* current = new Sequence(in);
164                         
165                         if (current->getAligned() == "") { current->setAligned(current->getUnaligned()); }
166                         //takes out stuff is needed
167                         current->setUnaligned(current->getUnaligned());
168                         
169                         container.push_back(current);
170                         
171                         gobble(in);
172                 }
173                 
174                 in.close();
175                 return container;
176                 
177         }
178         catch(exception& e) {
179                 errorOut(e, "Pintail", "readSeqs");
180                 exit(1);
181         }
182 }
183
184 //***************************************************************************************************************
185 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
186 void Pintail::findPairs(int start, int end) {
187         try {
188                 
189                 for(int i = start; i < end; i++){
190                 
191                         float smallest = 10000.0;
192                         Sequence query = *(querySeqs[i]);
193                 
194                         for(int j = 0; j < templateSeqs.size(); j++){
195                                 
196                                 Sequence temp = *(templateSeqs[j]);
197                                 
198                                 distCalculator->calcDist(query, temp);
199                                 float dist = distCalculator->getDist();
200                                 
201                                 if (dist < smallest) { 
202                                 
203                                         bestfit[querySeqs[i]] = templateSeqs[j];
204                                         smallest = dist;
205                                 }
206                         }
207                 }
208                 
209         }
210         catch(exception& e) {
211                 errorOut(e, "Pintail", "findPairs");
212                 exit(1);
213         }
214 }
215 //***************************************************************************************************************
216 void Pintail::calcObserved(int start, int end) {
217         try {
218         
219                                                 
220                 for(int i = start; i < end; i++){
221                 
222                         itBest = bestfit.find(querySeqs[i]);
223                         Sequence* query;
224                         Sequence* subject;
225                 
226                         if (itBest != bestfit.end()) {
227                                 query = itBest->first;
228                                 subject = itBest->second;
229                         }else{ mothurOut("Error in calcObserved"); mothurOutEndLine(); }
230 //cout << query->getName() << '\t' << subject->getName() << endl;                       
231                         
232                         int startpoint = 0; 
233                         for (int m = 0; m < iters; m++) {
234
235                                 string seqFrag = query->getAligned().substr(startpoint, window);
236                                 string seqFragsub = subject->getAligned().substr(startpoint, window);
237                                                                 
238                                 int diff = 0;
239                 for (int b = 0; b < seqFrag.length(); b++) {
240                   
241                     //if this is not a gap
242                     if ((isalpha(seqFrag[b])) && (isalpha(seqFragsub[b]))) {
243                         //and they are different - penalize
244                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
245                     }
246                 }
247                
248                 //percentage of mismatched bases
249                 float dist = diff / (float)seqFrag.length();       
250                                 
251                                 obsDistance[query].push_back(dist);
252                                 
253                                 startpoint += increment;
254                         }
255                 }
256                 
257         }
258         catch(exception& e) {
259                 errorOut(e, "Pintail", "calcObserved");
260                 exit(1);
261         }
262 }
263
264 //***************************************************************************************************************
265 void Pintail::calcExpected(int start, int end) {
266         try {
267                 
268         
269                 //for each sequence
270                 for(int i = start; i < end; i++){
271                         
272                         itCoef = seqCoef.find(querySeqs[i]);
273                         float coef = itCoef->second;
274                         
275                         //for each window
276                         vector<float> queryExpected;
277                         for (int m = 0; m < iters; m++) {               
278                                 float expected = averageProbability[m] * coef;
279                                 queryExpected.push_back(expected);      
280 //cout << "average variabilty over window = " << averageProbability[m] << " coef = " << coef << " ei = "  << expected << '\t' <<  "window = " << m << endl;
281                         }
282                         
283                         expectedDistance[querySeqs[i]] = queryExpected;
284                         
285                 }
286                                 
287         }
288         catch(exception& e) {
289                 errorOut(e, "Pintail", "calcExpected");
290                 exit(1);
291         }
292 }
293 //***************************************************************************************************************
294 void Pintail::calcDE(int start, int end) {
295         try {
296                 
297         
298                 //for each sequence
299                 for(int i = start; i < end; i++){
300                         
301                         itObsDist = obsDistance.find(querySeqs[i]);
302                         vector<float> obs = itObsDist->second;
303                         
304                         itExpDist = expectedDistance.find(querySeqs[i]);
305                         vector<float> exp = itExpDist->second;
306 //      cout << "difference between obs and exp = " << abs(obs[m] - exp[m]) << endl;    
307                         //for each window
308                         float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
309                         for (int m = 0; m < iters; m++) {               sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));         }
310                         
311                         float de = sqrt((sum / (iters - 1)));
312                         
313                         DE[querySeqs[i]] = de;
314                 }
315                                 
316         }
317         catch(exception& e) {
318                 errorOut(e, "Pintail", "calcDE");
319                 exit(1);
320         }
321 }
322
323 //***************************************************************************************************************
324
325 vector<float> Pintail::calcFreq(vector<Sequence*> seqs) {
326         try {
327
328                 vector<float> prob;
329                 
330                 //at each position in the sequence
331                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
332                 
333                         vector<int> freq;   freq.resize(4,0);
334                         int gaps = 0;
335                         
336                         //find the frequency of each nucleotide
337                         for (int j = 0; j < seqs.size(); j++) {
338                                 
339                                 char value = seqs[j]->getAligned()[i];
340                         
341                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
342                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
343                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
344                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
345                                 else { gaps++; }
346                         }
347                         
348                         //find base with highest frequency
349                         int highest = 0;
350                         for (int m = 0; m < freq.size(); m++) {    if (freq[m] > highest) {  highest = freq[m];  }              }
351                         
352                         //add in gaps - so you can effectively "ignore them"
353                         highest += gaps;
354                         
355                         float highFreq = highest / (float) seqs.size(); 
356                         
357                         float Pi;
358                         Pi =  (highFreq - 0.25) / 0.75;  
359                                 
360                         prob.push_back(Pi); 
361                 }
362                 
363                 return prob;
364                                 
365         }
366         catch(exception& e) {
367                 errorOut(e, "Pintail", "calcFreq");
368                 exit(1);
369         }
370 }
371 //***************************************************************************************************************
372 vector<float> Pintail::findQav(vector<float> prob) {
373         try {
374                 vector<float> averages;
375                 
376                 //for each window find average
377                 int startpoint = 0;
378                 for (int m = 0; m < iters; m++) {
379                         
380                         float average = 0.0;
381                         for (int i = startpoint; i < (startpoint+window); i++) {   average += prob[i];  }
382                         
383                         average = average / window;
384 //cout << average << endl;                      
385                         //save this windows average
386                         averages.push_back(average);
387                 
388                         startpoint += increment;
389                 }
390                 
391                 return averages;
392         }
393         catch(exception& e) {
394                 errorOut(e, "Pintail", "findQav");
395                 exit(1);
396         }
397 }
398 //***************************************************************************************************************
399 map<Sequence*, float> Pintail::getCoef(vector<float> prob) {
400         try {
401                 map<Sequence*, float> coefs;
402                 
403                 //find average prob
404                 float probAverage = 0.0;
405                 for (int i = 0; i < prob.size(); i++) {   probAverage += prob[i];       }
406                 probAverage = probAverage / (float) prob.size();
407 cout << "(sum of ai) / m = " << probAverage << endl;            
408                 //find a coef for each sequence
409                 map<Sequence*, vector<float> >::iterator it;
410                 for (it = obsDistance.begin(); it != obsDistance.end(); it++) {
411                         
412                         vector<float> temp = it->second;
413                         Sequence* tempSeq = it->first;
414                         
415                         //find observed average 
416                         float obsAverage = 0.0;
417                         for (int i = 0; i < temp.size(); i++) {   obsAverage += temp[i];        }
418                         obsAverage = obsAverage / (float) temp.size();
419 cout << tempSeq->getName() << '\t' << obsAverage << endl;                       
420                         float coef = obsAverage / probAverage;
421 cout  << tempSeq->getName() << '\t' << "coef = " << coef << endl;                       
422                         //save this sequences coefficient
423                         coefs[tempSeq] = coef;
424                 }
425                 
426                                                 
427                 return coefs;
428         }
429         catch(exception& e) {
430                 errorOut(e, "Pintail", "getCoef");
431                 exit(1);
432         }
433 }
434
435 /**************************************************************************************************/
436
437 void Pintail::createProcessesPairs() {
438         try {
439 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
440                 int process = 0;
441                 vector<int> processIDS;
442                 
443                 //loop through and create all the processes you want
444                 while (process != processors) {
445                         int pid = fork();
446                         
447                         if (pid > 0) {
448                                 processIDS.push_back(pid);  
449                                 process++;
450                         }else if (pid == 0){
451                                 findPairs(lines[process]->start, lines[process]->end);
452                                 exit(0);
453                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
454                 }
455                 
456                 //force parent to wait until all the processes are done
457                 for (int i=0;i<processors;i++) { 
458                         int temp = processIDS[i];
459                         wait(&temp);
460                 }
461                 
462 #else
463                 findPairs(lines[0]->start, lines[0]->end);
464
465 #endif          
466         }
467         catch(exception& e) {
468                 errorOut(e, "Pintail", "createProcessesPairs");
469                 exit(1);
470         }
471 }
472
473 /**************************************************************************************************/
474
475 void Pintail::createProcessesObserved() {
476         try {
477 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
478                 int process = 0;
479                 vector<int> processIDS;
480                 
481                 //loop through and create all the processes you want
482                 while (process != processors) {
483                         int pid = fork();
484                         
485                         if (pid > 0) {
486                                 processIDS.push_back(pid);  
487                                 process++;
488                         }else if (pid == 0){
489                                 calcObserved(lines[process]->start, lines[process]->end);
490                                 exit(0);
491                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
492                 }
493                 
494                 //force parent to wait until all the processes are done
495                 for (int i=0;i<processors;i++) { 
496                         int temp = processIDS[i];
497                         wait(&temp);
498                 }
499                 
500 #else
501                 calcObserved(lines[0]->start, lines[0]->end);
502
503 #endif          
504         }
505         catch(exception& e) {
506                 errorOut(e, "Pintail", "createProcessesObserved");
507                 exit(1);
508         }
509 }
510
511 //***************************************************************************************************************
512
513 void Pintail::createProcessesExpected() {
514         try {
515 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
516                 int process = 0;
517                 vector<int> processIDS;
518                 
519                 //loop through and create all the processes you want
520                 while (process != processors) {
521                         int pid = fork();
522                         
523                         if (pid > 0) {
524                                 processIDS.push_back(pid);  
525                                 process++;
526                         }else if (pid == 0){
527                                 calcExpected(lines[process]->start, lines[process]->end);
528                                 exit(0);
529                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
530                 }
531                 
532                 //force parent to wait until all the processes are done
533                 for (int i=0;i<processors;i++) { 
534                         int temp = processIDS[i];
535                         wait(&temp);
536                 }
537                 
538 #else
539                 calcExpected(lines[0]->start, lines[0]->end);
540
541 #endif          
542         }
543         catch(exception& e) {
544                 errorOut(e, "Pintail", "createProcessesExpected");
545                 exit(1);
546         }
547 }
548
549 /**************************************************************************************************/
550
551 void Pintail::createProcessesDE() {
552         try {
553 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
554                 int process = 0;
555                 vector<int> processIDS;
556                 
557                 //loop through and create all the processes you want
558                 while (process != processors) {
559                         int pid = fork();
560                         
561                         if (pid > 0) {
562                                 processIDS.push_back(pid);  
563                                 process++;
564                         }else if (pid == 0){
565                                 calcDE(lines[process]->start, lines[process]->end);
566                                 exit(0);
567                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
568                 }
569                 
570                 //force parent to wait until all the processes are done
571                 for (int i=0;i<processors;i++) { 
572                         int temp = processIDS[i];
573                         wait(&temp);
574                 }
575                 
576 #else
577                 calcDE(lines[0]->start, lines[0]->end);
578
579 #endif          
580         }
581         catch(exception& e) {
582                 errorOut(e, "Pintail", "createProcessesDE");
583                 exit(1);
584         }
585 }
586
587 //***************************************************************************************************************
588
589