]> git.donarmstrong.com Git - mothur.git/blob - pintail.cpp
worked on chimeras
[mothur.git] / pintail.cpp
1 /*
2  *  pintail.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "pintail.h"
11 #include "ignoregaps.h"
12
13 //***************************************************************************************************************
14
15 Pintail::Pintail(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
16 //***************************************************************************************************************
17
18 Pintail::~Pintail() {
19         try {
20                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
21                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
22         }
23         catch(exception& e) {
24                 errorOut(e, "Pintail", "~Pintail");
25                 exit(1);
26         }
27 }       
28 //***************************************************************************************************************
29 void Pintail::print(ostream& out) {
30         try {
31                 
32                 for (int i = 0; i < querySeqs.size(); i++) {
33                         
34                         int index = ceil(deviation[i]);
35                         
36                         //is your DE value higher than the 95%
37                         string chimera;
38                         if (DE[i] > quantiles[index][4])        {       chimera = "Yes";        }
39                         else                                                            {       chimera = "No";         }
40                         
41                         out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
42                         out << "Observed\t";
43                         
44                         for (int j = 0; j < obsDistance[i].size(); j++) {  out << obsDistance[i][j] << '\t';  }
45                         out << endl;
46                         
47                         out << "Expected\t";
48                         
49                         for (int m = 0; m < expectedDistance[i].size(); m++) {  out << expectedDistance[i][m] << '\t';  }
50                         out << endl;
51                         
52                 }
53         }
54         catch(exception& e) {
55                 errorOut(e, "Pintail", "print");
56                 exit(1);
57         }
58 }
59
60 //***************************************************************************************************************
61 void Pintail::getChimeras() {
62         try {
63                 
64                 //read in query sequences and subject sequences
65                 mothurOut("Reading sequences and template file... "); cout.flush();
66                 querySeqs = readSeqs(fastafile);
67                 templateSeqs = readSeqs(templateFile);
68                 mothurOut("Done."); mothurOutEndLine();
69                 
70                 int numSeqs = querySeqs.size();
71                 
72                 obsDistance.resize(numSeqs);
73                 expectedDistance.resize(numSeqs);
74                 seqCoef.resize(numSeqs);
75                 DE.resize(numSeqs);
76                 Qav.resize(numSeqs);
77                 bestfit.resize(numSeqs);
78                 deviation.resize(numSeqs);
79                 trimmed.resize(numSeqs);
80                 windowSizes.resize(numSeqs, window);
81                 windowSizesTemplate.resize(templateSeqs.size(), window);
82                 windowsForeachQuery.resize(numSeqs);
83                 quantiles.resize(100);  //one for every percent mismatch
84                 
85                 //break up file if needed
86                 int linesPerProcess = processors / numSeqs;
87                 
88                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
89                         //find breakup of sequences for all times we will Parallelize
90                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
91                         else {
92                                 //fill line pairs
93                                 for (int i = 0; i < (processors-1); i++) {                      
94                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
95                                 }
96                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
97                                 int i = processors - 1;
98                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
99                         }
100                         
101                         //find breakup of templatefile for quantiles
102                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
103                         else { 
104                                 for (int i = 0; i < processors; i++) {
105                                         templateLines.push_back(new linePair());
106                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
107                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
108                                 }
109                         }
110                 #else
111                         lines.push_back(new linePair(0, numSeqs));
112                         templateLines.push_back(new linePair(0, templateSeqs.size()));
113                 #endif
114                 
115                 distcalculator = new ignoreGaps();
116
117                                 
118                 if (processors == 1) { 
119                         mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
120                         bestfit = findPairs(lines[0]->start, lines[0]->end);
121                         
122                         //ex.align matches from wigeon
123 /*for (int m = 0; m < templateSeqs.size(); m++)  {
124         if (templateSeqs[m]->getName() == "159481") {  bestfit[17] = *(templateSeqs[m]); }
125         if (templateSeqs[m]->getName() == "100137") {  bestfit[16] = *(templateSeqs[m]); }
126         if (templateSeqs[m]->getName() == "112956") {  bestfit[15] = *(templateSeqs[m]); }
127         if (templateSeqs[m]->getName() == "102326") {  bestfit[14] = *(templateSeqs[m]); }
128         if (templateSeqs[m]->getName() == "66229") {  bestfit[13] = *(templateSeqs[m]); }
129         if (templateSeqs[m]->getName() == "206276") {  bestfit[12] = *(templateSeqs[m]); }
130     if (templateSeqs[m]->getName() == "63607") {  bestfit[11] = *(templateSeqs[m]); }
131         if (templateSeqs[m]->getName() == "7056") {  bestfit[10] = *(templateSeqs[m]); }
132         if (templateSeqs[m]->getName() == "7088") {  bestfit[9] = *(templateSeqs[m]); }
133         if (templateSeqs[m]->getName() == "17553") {  bestfit[8] = *(templateSeqs[m]); }
134         if (templateSeqs[m]->getName() == "131723") {  bestfit[7] = *(templateSeqs[m]); }
135         if (templateSeqs[m]->getName() == "69013") {  bestfit[6] = *(templateSeqs[m]); }
136         if (templateSeqs[m]->getName() == "24543") {  bestfit[5] = *(templateSeqs[m]); }
137         if (templateSeqs[m]->getName() == "27824") {  bestfit[4] = *(templateSeqs[m]); }
138         if (templateSeqs[m]->getName() == "1456") {  bestfit[3] = *(templateSeqs[m]); }
139         if (templateSeqs[m]->getName() == "1456") {  bestfit[2] = *(templateSeqs[m]); }
140         if (templateSeqs[m]->getName() == "141312") {  bestfit[1] = *(templateSeqs[m]); }
141         if (templateSeqs[m]->getName() == "141312") {  bestfit[0] = *(templateSeqs[m]); }
142
143
144 }*/
145                         
146                         for (int j = 0; j < bestfit.size(); j++) { 
147                                 //chops off beginning and end of sequences so they both start and end with a base
148                                 trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);  
149                         }
150                         mothurOut("Done."); mothurOutEndLine();
151                         
152                         mothurOut("Finding window breaks... "); cout.flush();
153                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
154                                 it = trimmed[i].begin();
155 cout << "trimmed = " << it->first << '\t' << it->second << endl;
156                                 vector<int> win = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]);
157                                 windowsForeachQuery[i] = win;
158                         }
159                         mothurOut("Done."); mothurOutEndLine();
160                 
161                 }else {         createProcessesSpots();         }
162
163                 //find P
164                 mothurOut("Getting conservation... "); cout.flush();
165                 if (consfile == "") { 
166                         mothurOut("Calculating probability of conservation for your template sequences.  This can take a while...  I will output the quantiles to a .prob file so that you can input them using the conservation parameter next time you run this command.  Providing the .prob file will dramatically improve speed.    "); cout.flush();
167                         probabilityProfile = calcFreq(templateSeqs); 
168                         mothurOut("Done."); mothurOutEndLine();
169                 }else                           {   probabilityProfile = readFreq();                      }
170                 
171                 //make P into Q
172                 for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];      }
173                 mothurOut("Done."); mothurOutEndLine();
174                 
175                 if (processors == 1) { 
176                                                 
177                         mothurOut("Calculating observed distance... "); cout.flush();
178                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
179         cout << querySeqs[i]->getName() << '\t' << bestfit[i].getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl;
180                                 vector<float> obsi = calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
181                                 obsDistance[i] = obsi;
182                         }
183                         mothurOut("Done."); mothurOutEndLine();
184                         
185                         
186                         mothurOut("Finding variability... "); cout.flush();
187                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
188                                 vector<float> q = findQav(windowsForeachQuery[i], windowSizes[i]);
189                                 Qav[i] = q;
190                         }
191                         mothurOut("Done."); mothurOutEndLine();
192                         
193                         
194                         mothurOut("Calculating alpha... "); cout.flush();
195                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
196                                 float alpha = getCoef(obsDistance[i], Qav[i]);
197                                 seqCoef.push_back(alpha);
198                         }
199                         mothurOut("Done."); mothurOutEndLine();
200                 
201                 
202                         mothurOut("Calculating expected distance... "); cout.flush();
203                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
204                                 vector<float> exp = calcExpected(Qav[i], seqCoef[i]);
205                                 expectedDistance[i] = exp;
206                         }
207                         mothurOut("Done."); mothurOutEndLine();
208                         
209                         
210                         mothurOut("Finding deviation... "); cout.flush();
211                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
212                                 float de = calcDE(obsDistance[i], expectedDistance[i]);
213                                 DE[i] = de;
214                                 
215                                 it = trimmed[i].begin();
216                                 float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
217                                 deviation[i] = dist;
218                         }
219                         mothurOut("Done."); mothurOutEndLine();
220                         
221                 } 
222                 else {          createProcesses();              }
223                 
224                 
225                 //quantiles are used to determine whether the de values found indicate a chimera
226                 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
227                 //combination of sequences in the template
228                 if (quanfile != "") {  quantiles =  readQuantiles();  }
229                 else {
230                         
231                         mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
232                         if (processors == 1) { 
233                                 quantiles = getQuantiles(0, templateSeqs.size());
234                         }else {         createProcessesQuan();          }
235                         
236                         ofstream out4;
237                         string o = getRootName(templateFile) + "quan";
238                         
239                         openOutputFile(o, out4);
240                         
241                         //adjust quantiles
242                         for (int i = 0; i < quantiles.size(); i++) {
243                                 if (quantiles[i].size() == 0) {
244                                         //in case this is not a distance found in your template files
245                                         for (int g = 0; g < 6; g++) {
246                                                 quantiles[i].push_back(0.0);
247                                         }
248                                 }else{
249                                         
250                                         sort(quantiles[i].begin(), quantiles[i].end());
251                                         
252                                         vector<float> temp;
253                                         //save 10%
254                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]);
255                                         //save 25%
256                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)]);
257                                         //save 50%
258                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)]);
259                                         //save 75%
260                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)]);
261                                         //save 95%
262                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)]);
263                                         //save 99%
264                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]);
265                                         
266                                         quantiles[i] = temp;
267                                 }
268                                 
269                                 //output quan value
270                                 out4 << i+1 << '\t';                            
271                                 for (int u = 0; u < quantiles[i].size(); u++) {   out4 << quantiles[i][u] << '\t'; }
272                                 out4 << endl;
273
274                         }
275                         
276                         mothurOut("Done."); mothurOutEndLine();
277                 }
278         
279                 //free memory
280                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
281                 for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
282                         
283                 delete distcalculator;
284         }
285         catch(exception& e) {
286                 errorOut(e, "Pintail", "getChimeras");
287                 exit(1);
288         }
289 }
290 //***************************************************************************************************************
291 //num is query's spot in querySeqs
292 void Pintail::trimSeqs(Sequence* query, Sequence subject, map<int, int>& trim) {
293         try {
294                 
295                 string q = query->getAligned();
296                 string s = subject.getAligned();
297                 
298                 int front = 0;
299                 for (int i = 0; i < q.length(); i++) {
300                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
301                 }
302                 
303                 int back = 0;           
304                 for (int i = q.length(); i >= 0; i--) {
305                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
306                 }
307                 
308                 trim[front] = back;
309                 
310         }
311         catch(exception& e) {
312                 errorOut(e, "Pintail", "trimSeqs");
313                 exit(1);
314         }
315 }
316
317 //***************************************************************************************************************
318
319 vector<float> Pintail::readFreq() {
320         try {
321         
322                 ifstream in;
323                 openInputFile(consfile, in);
324                 
325                 vector<float> prob;
326                 
327                 //read in probabilities and store in vector
328                 int pos; float num; 
329                 
330                 while(!in.eof()){
331                         
332                         in >> pos >> num;
333                         
334                         //do you want this spot
335                         prob.push_back(num);  
336                         
337                         gobble(in);
338                 }
339                 
340                 in.close();
341                 return prob;
342                 
343         }
344         catch(exception& e) {
345                 errorOut(e, "Pintail", "readFreq");
346                 exit(1);
347         }
348 }
349
350 //***************************************************************************************************************
351
352 vector< vector<float> > Pintail::readQuantiles() {
353         try {
354         
355                 ifstream in;
356                 openInputFile(quanfile, in);
357                 
358                 vector< vector<float> > quan;
359         
360                 int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; 
361                 
362                 while(!in.eof()){
363                         
364                         in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; 
365                         
366                         vector <float> temp;
367                         
368                         temp.push_back(ten); 
369                         temp.push_back(twentyfive);
370                         temp.push_back(fifty);
371                         temp.push_back(seventyfive);
372                         temp.push_back(ninetyfive);
373                         temp.push_back(ninetynine);
374                         
375                         quan.push_back(temp);  
376                         
377                         gobble(in);
378                 }
379                 
380                 in.close();
381                 return quan;
382                 
383         }
384         catch(exception& e) {
385                 errorOut(e, "Pintail", "readQuantiles");
386                 exit(1);
387         }
388 }
389 //***************************************************************************************************************
390 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
391 vector<Sequence> Pintail::findPairs(int start, int end) {
392         try {
393                 
394                 vector<Sequence> seqsMatches;  seqsMatches.resize(end-start);
395                 
396                 for(int i = start; i < end; i++){
397                 
398                         float smallest = 10000.0;
399                         Sequence query = *(querySeqs[i]);
400                 
401                         for(int j = 0; j < templateSeqs.size(); j++){
402                                 
403                                 Sequence temp = *(templateSeqs[j]);
404                                 
405                                 distcalculator->calcDist(query, temp);
406                                 float dist = distcalculator->getDist();
407                                 
408                                 if (dist < smallest) { 
409                                         seqsMatches[i] = *(templateSeqs[j]);
410                                         smallest = dist;
411                                 }
412                         }
413                 }
414                 
415                 return seqsMatches;
416         
417         }
418         catch(exception& e) {
419                 errorOut(e, "Pintail", "findPairs");
420                 exit(1);
421         }
422 }
423
424 //***************************************************************************************************************
425 //find the window breaks for each sequence - this is so you can move ahead by bases.
426 vector<int>  Pintail::findWindows(Sequence* query, int front, int back, int& size) {
427         try {
428                 
429                 vector<int> win; 
430                 
431                 int cutoff = back - front;  //back - front 
432                         
433                 //if window is set to default
434                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
435                                                         else{  size = (cutoff / 4); }  } 
436                 else if (size > (cutoff / 4)) { 
437                                 mothurOut("You have selected to large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); mothurOutEndLine();
438                                 size = (cutoff / 4); 
439                 }
440         
441                 string seq = query->getAligned().substr(front, cutoff);
442                         
443                 //count bases
444                 int numBases = 0;
445                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
446                         
447                 //save start of seq
448                 win.push_back(front);
449                 
450                 //move ahead increment bases at a time until all bases are in a window
451                 int countBases = 0;
452                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
453                         
454                 seq = query->getAligned();
455                 for (int m = front; m < (back - size) ; m++) {
456                                 
457                         //count number of bases you see
458                         if (isalpha(seq[m])) { countBases++; totalBases++;  }
459                                 
460                         //if you have seen enough bases to make a new window
461                         if (countBases >= increment) {
462                                 win.push_back(m);  //save spot in alignment
463                                 countBases = 0;                         //reset bases you've seen in this window
464                         }
465                                 
466                         //no need to continue if all your bases are in a window
467                         if (totalBases == numBases) {   break;  }
468                 }
469                         
470                 return win;
471         
472         }
473         catch(exception& e) {
474                 errorOut(e, "Pintail", "findWindows");
475                 exit(1);
476         }
477 }
478
479 //***************************************************************************************************************
480 vector<float> Pintail::calcObserved(Sequence* query, Sequence subject, vector<int> window, int size) {
481         try {
482                 
483                 vector<float> temp;
484 //cout << "query length = " << query->getAligned().length() << '\t' << " subject length = " << subject.getAligned().length() << endl;                           
485                 for (int m = 0; m < window.size(); m++) {
486                                                 
487                         string seqFrag = query->getAligned().substr(window[m], size);
488                         string seqFragsub = subject.getAligned().substr(window[m], size);
489         //cout << "start point = " << window[m] << " end point = " << window[m]+size << endl;                                           
490                         int diff = 0;
491                         for (int b = 0; b < seqFrag.length(); b++) {
492                                 if (seqFrag[b] != seqFragsub[b]) { diff++; }
493                         }
494                
495                         //percentage of mismatched bases
496                         float dist;
497                         dist = diff / (float) seqFrag.length() * 100;       
498                                 
499                         temp.push_back(dist);
500                 }
501                         
502                 return temp;
503         }
504         catch(exception& e) {
505                 errorOut(e, "Pintail", "calcObserved");
506                 exit(1);
507         }
508 }
509 //***************************************************************************************************************
510 float Pintail::calcDist(Sequence* query, Sequence subject, int front, int back) {
511         try {
512                 
513                 //so you only look at the trimmed part of the sequence
514                 int cutoff = back - front;  
515                         
516                 //from first startpoint with length back-front
517                 string seqFrag = query->getAligned().substr(front, cutoff);
518                 string seqFragsub = subject.getAligned().substr(front, cutoff);
519                                                                                                                 
520                 int diff = 0;
521                 for (int b = 0; b < seqFrag.length(); b++) {
522                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
523                 }
524                
525                 //percentage of mismatched bases
526                 float dist = diff / (float) seqFrag.length() * 100;       
527                                 
528                 return dist;
529         }
530         catch(exception& e) {
531                 errorOut(e, "Pintail", "calcDist");
532                 exit(1);
533         }
534 }
535
536 //***************************************************************************************************************
537 vector<float> Pintail::calcExpected(vector<float> qav, float coef) {
538         try {
539                 
540                 //for each window
541                 vector<float> queryExpected;
542                         
543                 for (int m = 0; m < qav.size(); m++) {          
544                                 
545                         float expected = qav[m] * coef;
546                                 
547                         queryExpected.push_back(expected);      
548                 }
549                         
550                 return queryExpected;
551                                 
552         }
553         catch(exception& e) {
554                 errorOut(e, "Pintail", "calcExpected");
555                 exit(1);
556         }
557 }
558 //***************************************************************************************************************
559 float Pintail::calcDE(vector<float> obs, vector<float> exp) {
560         try {
561                 
562                 //for each window
563                 float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
564                 for (int m = 0; m < obs.size(); m++) {          sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));         }
565                         
566                 float de = sqrt((sum / (obs.size() - 1)));
567                         
568                 return de;
569         }
570         catch(exception& e) {
571                 errorOut(e, "Pintail", "calcDE");
572                 exit(1);
573         }
574 }
575
576 //***************************************************************************************************************
577
578 vector<float> Pintail::calcFreq(vector<Sequence*> seqs) {
579         try {
580
581                 vector<float> prob;
582                 string freqfile = getRootName(templateFile) + "prob";
583                 ofstream outFreq;
584                 
585                 openOutputFile(freqfile, outFreq);
586                 
587                 //at each position in the sequence
588                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
589                         
590                         vector<int> freq;   freq.resize(4,0);
591                         int gaps = 0;
592                         
593                         //find the frequency of each nucleotide
594                         for (int j = 0; j < seqs.size(); j++) {
595                                 
596                                 char value = seqs[j]->getAligned()[i];
597                                 
598                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
599                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
600                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
601                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
602                                 else { gaps++; }
603                         }
604                         
605                         //find base with highest frequency
606                         int highest = 0;
607                         for (int m = 0; m < freq.size(); m++) {   if (freq[m] > highest) {  highest = freq[m];  }               }
608                         
609                         float highFreq;
610                         //subtract gaps to "ignore them"
611                         if ( (seqs.size() - gaps) == 0 ) {  highFreq = 1.0;  }                  
612                         else { highFreq = highest / (float) (seqs.size() - gaps);        }
613                                                 
614                         float Pi;
615                         Pi =  (highFreq - 0.25) / 0.75; 
616                         
617                         //cannot have probability less than 0.
618                         if (Pi < 0) { Pi = 0.0; }
619                         
620                         //saves this for later
621                         outFreq << i+1 << '\t' << Pi << endl;
622                         
623                         prob.push_back(Pi); 
624                 }
625                 
626                 outFreq.close();
627                 
628                 return prob;
629                                 
630         }
631         catch(exception& e) {
632                 errorOut(e, "Pintail", "calcFreq");
633                 exit(1);
634         }
635 }
636 //***************************************************************************************************************
637 vector<float>  Pintail::findQav(vector<int> window, int size) {
638         try {
639                 vector<float>  averages; 
640                                 
641                 //for each window find average
642                 for (int m = 0; m < window.size(); m++) {
643                                 
644                         float average = 0.0;
645                                 
646                         //while you are in the window for this sequence
647                         for (int j = window[m]; j < (window[m]+size); j++) {   average += probabilityProfile[j];        }
648                                 
649                         average = average / size;
650         
651                         //save this windows average
652                         averages.push_back(average);
653                 }
654                                 
655                 return averages;
656         }
657         catch(exception& e) {
658                 errorOut(e, "Pintail", "findQav");
659                 exit(1);
660         }
661 }
662
663 //***************************************************************************************************************
664 vector< vector<float> > Pintail::getQuantiles(int start, int end) {
665         try {
666                 vector< vector<float> > quan; 
667                 
668                 //percentage of mismatched pairs 1 to 100
669                 quan.resize(100);
670                 
671                 
672                 //for each sequence
673                 for(int i = start; i < end; i++){
674                 
675                         mothurOut("Processing template sequence " + toString(i)); mothurOutEndLine();
676                         Sequence* query = templateSeqs[i];
677                         
678                         //compare to every other sequence in template
679                         for(int j = 0; j < i; j++){
680                                 
681                                 Sequence subject = *(templateSeqs[j]);
682                                 
683                                 map<int, int> trim;
684                                 trimSeqs(query, subject, trim);
685                                 
686                                 it = trim.begin();
687                                 int front = it->first; int back = it->second;
688                                 
689                                 //reset window for each new comparison
690                                 windowSizesTemplate[i] = window;
691                                 
692                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i]);
693                                 
694                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
695                                 
696                                 vector<float> q = findQav(win, windowSizesTemplate[i]);
697                                                                         
698                                 float alpha = getCoef(obsi, q);
699                                                 
700                                 vector<float> exp = calcExpected(q, alpha);
701                                 
702                                 float de = calcDE(obsi, exp);
703                                                                 
704                                 float dist = calcDist(query, subject, front, back); 
705                                 
706                                 dist = ceil(dist);
707                                 
708                                 //dist-1 because vector indexes start at 0.
709                                 quan[dist-1].push_back(de);
710                                 
711                         }
712                 }
713
714                 return quan;
715                                                 
716         }
717         catch(exception& e) {
718                 errorOut(e, "Pintail", "findQav");
719                 exit(1);
720         }
721 }
722
723 //***************************************************************************************************************
724 float Pintail::getCoef(vector<float> obs, vector<float> qav) {
725         try {
726         
727                 //find average prob for this seqs windows
728                 float probAverage = 0.0;
729                 for (int j = 0; j < qav.size(); j++) {   probAverage += qav[j]; }
730                 probAverage = probAverage / (float) qav.size();
731                 
732                 //find observed average 
733                 float obsAverage = 0.0;
734                 for (int j = 0; j < obs.size(); j++) {   obsAverage += obs[j];  }
735                 obsAverage = obsAverage / (float) obs.size();
736                 
737
738                 float coef = obsAverage / probAverage;
739                                                 
740                 return coef;
741         }
742         catch(exception& e) {
743                 errorOut(e, "Pintail", "getCoef");
744                 exit(1);
745         }
746 }
747 /**************************************************************************************************/
748
749 void Pintail::createProcessesSpots() {
750         try {
751 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
752                 int process = 0;
753                 vector<int> processIDS;
754                 vector< vector<int> > win; win.resize(querySeqs.size());
755                 
756                 //loop through and create all the processes you want
757                 while (process != processors) {
758                         int pid = fork();
759                         
760                         if (pid > 0) {
761                                 processIDS.push_back(pid);  
762                                 process++;
763                         }else if (pid == 0){
764                                 
765                                 mothurOut("Finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
766                                 vector<Sequence> tempbest;
767                                 tempbest = findPairs(lines[process]->start, lines[process]->end);
768                                 int count = 0;
769                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
770                                         bestfit[i] = tempbest[count];
771                                         
772                                         //chops off beginning and end of sequences so they both start and end with a base
773                                         trimSeqs(querySeqs[i], bestfit[i], trimmed[i]);
774                                         count++;
775                                 }
776                                 mothurOut("Done finding pairs for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
777                                 
778                                 mothurOut("Finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
779                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
780                                         vector<int> temp = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]);
781                                         win[i] = temp;
782                                 }
783                                 mothurOut("Done finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
784                                 
785                                 exit(0);
786                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
787                 }
788                 
789                 //force parent to wait until all the processes are done
790                 for (int i=0;i<processors;i++) { 
791                         int temp = processIDS[i];
792                         wait(&temp);
793                 }
794                 
795                 windowsForeachQuery = win;
796 #else
797                 bestfit = findPairs(lines[0]->start, lines[0]->end);
798                 for (int j = 0; j < bestfit.size(); j++) {
799                                 //chops off beginning and end of sequences so they both start and end with a base
800                                 trimSeqs(querySeqs[j], bestfit[j], j);  
801                 }
802
803                 for (int i = lines[0]->start; i < lines[0]->end; i++) {
804                                 it = trimmed[i].begin();
805                                 map<int, int> win = findWindows(querySeqs[i], it->first, it->second, windowSizes[i]);
806                                 windows[i] = win;
807                 }
808
809 #endif          
810         }
811         catch(exception& e) {
812                 errorOut(e, "Pintail", "createProcessesSpots");
813                 exit(1);
814         }
815 }
816
817
818 /**************************************************************************************************/
819
820 void Pintail::createProcesses() {
821         try {
822 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
823                 int process = 0;
824                 vector<int> processIDS;
825                 
826                 vector< vector<float> > exp;  exp.resize(querySeqs.size());
827                 vector<float> de; de.resize(querySeqs.size());
828                 vector< vector<float> > obs; obs.resize(querySeqs.size());
829                 vector<float> dev; dev.resize(querySeqs.size());
830                 
831                 
832                 //loop through and create all the processes you want
833                 while (process != processors) {
834                         int pid = fork();
835                         
836                         if (pid > 0) {
837                                 processIDS.push_back(pid);  
838                                 process++;
839                         }else if (pid == 0){
840                                 
841                                 mothurOut("Calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
842                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
843                                         
844                                         vector<float> obsi = calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
845                                         obs[i] = obsi;
846                                 
847                                         //calc Qav
848                                         vector<float> q = findQav(windowsForeachQuery[i], windowSizes[i]);
849                                         
850                                         //get alpha
851                                         float alpha = getCoef(obsDistance[i], q);
852                                         
853                                         //find expected
854                                         vector<float> exp = calcExpected(q, alpha);
855                                         expectedDistance[i] = exp;
856                                         
857                                         //get de and deviation
858                                         float dei = calcDE(obsi, exp);
859                                         de[i] = dei;
860                                         
861                                         it = trimmed[i].begin();
862                                         float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
863                                         dev[i] = dist;
864                                 }
865                                 mothurOut("Done calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
866                                 
867                                 exit(0);
868                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
869                 }
870                 
871                 //force parent to wait until all the processes are done
872                 for (int i=0;i<processors;i++) { 
873                         int temp = processIDS[i];
874                         wait(&temp);
875                 }
876                 
877                 obsDistance = obs;
878                 expectedDistance = exp;
879                 DE = de;
880                 deviation = dev;
881                 
882 #else
883                         mothurOut("Calculating observed distance... "); cout.flush();
884                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
885                                 vector<float> obsi = calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]);
886                                 obsDistance[i] = obsi;
887                         }
888                         mothurOut("Done."); mothurOutEndLine();
889                         
890                         
891                         
892                         mothurOut("Finding variability... "); cout.flush();
893                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
894                                 vector<float> q = findQav(windows[i], windowSizes[i]);
895                                 Qav[i] = q;
896                         }
897                         mothurOut("Done."); mothurOutEndLine();
898                         
899                         
900                         
901                         mothurOut("Calculating alpha... "); cout.flush();
902                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
903                                 float alpha = getCoef(obsDistance[i], Qav[i]);
904                                 seqCoef.push_back(alpha);
905                         }
906                         mothurOut("Done."); mothurOutEndLine();
907                 
908                 
909                 
910                         mothurOut("Calculating expected distance... "); cout.flush();
911                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
912                                 vector<float> exp = calcExpected(Qav[i], seqCoef[i]);
913                                 expectedDistance[i] = exp;
914                         }
915                         mothurOut("Done."); mothurOutEndLine();
916                         
917                         
918                         
919                         mothurOut("Finding deviation... "); cout.flush();
920                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
921                                 float de = calcDE(obsDistance[i], expectedDistance[i]);
922                                 DE[i] = de;
923                                 
924                                 it = trimmed[i].begin();
925                                 float dist = calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
926                                 deviation[i] = dist;
927                         }
928                         mothurOut("Done."); mothurOutEndLine();
929
930 #endif          
931         }
932         catch(exception& e) {
933                 errorOut(e, "Pintail", "createProcesses");
934                 exit(1);
935         }
936 }
937
938
939 /**************************************************************************************************/
940
941 void Pintail::createProcessesQuan() {
942         try {
943 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
944                 int process = 0;
945                 vector<int> processIDS;
946                 vector< vector<float> > quan; quan.resize(100);
947                                 
948                 //loop through and create all the processes you want
949                 while (process != processors) {
950                         int pid = fork();
951                         
952                         if (pid > 0) {
953                                 processIDS.push_back(pid);  
954                                 process++;
955                         }else if (pid == 0){
956                                 
957                                 vector< vector<float> > q = getQuantiles(templateLines[process]->start, templateLines[process]->end);
958                                 
959                                 for (int i = 0; i < q.size(); i++) {
960                                         //put all values of q[i] into quan[i]
961                                         quan[i].insert(quan[i].begin(), q[i].begin(), q[i].end());
962                                 }
963                                 
964                                 for (int i = 0; i < quan.size(); i++) {
965                                         cout << i+1 << '\t';
966                                         for (int j = 0; j < quan[i].size(); j++) {  cout << quan[i][j] << '\t';  }
967                                         cout << endl;
968                                 }
969
970                                 exit(0);
971                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
972                 }
973                 
974                 //force parent to wait until all the processes are done
975                 for (int i=0;i<processors;i++) { 
976                         int temp = processIDS[i];
977                         wait(&temp);
978                 }
979                 
980                 quantiles = quan;
981 #else
982                 quantiles = getQuantiles(0, templateSeqs.size());
983 #endif          
984         }
985         catch(exception& e) {
986                 errorOut(e, "Pintail", "createProcessesQuan");
987                 exit(1);
988         }
989 }
990
991
992 //***************************************************************************************************************
993
994