]> git.donarmstrong.com Git - mothur.git/blob - pintail.cpp
working on chimeras
[mothur.git] / pintail.cpp
1 /*
2  *  pintail.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "pintail.h"
11 #include "ignoregaps.h"
12
13 //***************************************************************************************************************
14
15 Pintail::Pintail(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
16 //***************************************************************************************************************
17
18 Pintail::~Pintail() {
19         try {
20                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
21                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
22                 
23                 if (processors != 1) {   for (int i = 0; i < bestfit.size(); i++)       {  delete bestfit[i];   }  }
24         }
25         catch(exception& e) {
26                 errorOut(e, "Pintail", "~Pintail");
27                 exit(1);
28         }
29 }       
30 //***************************************************************************************************************
31 void Pintail::print(ostream& out) {
32         try {
33                 
34                 for (int i = 0; i < querySeqs.size(); i++) {
35                         
36                         int index = ceil(deviation[i]);
37                         
38                         //is your DE value higher than the 95%
39                         string chimera;
40                         if (DE[i] > quantiles[index][4])        {       chimera = "Yes";        }
41                         else                                                            {       chimera = "No";         }
42                         
43                         out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
44                         if (chimera == "Yes") {
45                                 mothurOut(querySeqs[i]->getName() + "\tdiv: " + toString(deviation[i]) + "\tstDev: " + toString(DE[i]) + "\tchimera flag: " + chimera); mothurOutEndLine();
46                         }
47                         out << "Observed\t";
48                         
49                         for (int j = 0; j < obsDistance[i].size(); j++) {  out << obsDistance[i][j] << '\t';  }
50                         out << endl;
51                         
52                         out << "Expected\t";
53                         
54                         for (int m = 0; m < expectedDistance[i].size(); m++) {  out << expectedDistance[i][m] << '\t';  }
55                         out << endl;
56                         
57                 }
58         }
59         catch(exception& e) {
60                 errorOut(e, "Pintail", "print");
61                 exit(1);
62         }
63 }
64
65 //***************************************************************************************************************
66 void Pintail::getChimeras() {
67         try {
68                 
69                 //read in query sequences and subject sequences
70                 mothurOut("Reading sequences and template file... "); cout.flush();
71                 querySeqs = readSeqs(fastafile);
72                 templateSeqs = readSeqs(templateFile);
73                 mothurOut("Done."); mothurOutEndLine();
74                 
75                 int numSeqs = querySeqs.size();
76                 
77                 obsDistance.resize(numSeqs);
78                 expectedDistance.resize(numSeqs);
79                 seqCoef.resize(numSeqs);
80                 DE.resize(numSeqs);
81                 Qav.resize(numSeqs);
82                 bestfit.resize(numSeqs);
83                 deviation.resize(numSeqs);
84                 trimmed.resize(numSeqs);
85                 windowSizes.resize(numSeqs, window);
86                 windowSizesTemplate.resize(templateSeqs.size(), window);
87                 windowsForeachQuery.resize(numSeqs);
88                 h.resize(numSeqs);
89                 quantiles.resize(100);  //one for every percent mismatch
90                 
91                 //break up file if needed
92                 int linesPerProcess = numSeqs / processors ;
93                 
94                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
95                         //find breakup of sequences for all times we will Parallelize
96                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
97                         else {
98                                 //fill line pairs
99                                 for (int i = 0; i < (processors-1); i++) {                      
100                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
101                                 }
102                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
103                                 int i = processors - 1;
104                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
105                         }
106                         
107                         //find breakup of templatefile for quantiles
108                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
109                         else { 
110                                 for (int i = 0; i < processors; i++) {
111                                         templateLines.push_back(new linePair());
112                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
113                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
114                                 }
115                         }
116                 #else
117                         lines.push_back(new linePair(0, numSeqs));
118                         templateLines.push_back(new linePair(0, templateSeqs.size()));
119                 #endif
120                 
121                 distcalculator = new ignoreGaps();
122                 decalc = new DeCalculator();
123                 
124                 decalc->setMask(seqMask);
125                 
126                 //find pairs
127                 if (processors == 1) { 
128                         mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
129                         bestfit = findPairs(lines[0]->start, lines[0]->end);
130                         mothurOut("Done."); mothurOutEndLine();
131                 }else {         createProcessesPairs();         }
132                 
133                 
134                 for (int j = 0; j < bestfit.size(); j++) { 
135                                 //chops off beginning and end of sequences so they both start and end with a base
136                                 ofstream out;
137                                 string s = querySeqs[j]->getName();
138                                 
139                                 openOutputFile(s, out);
140                                 out << ">" << querySeqs[j]->getName() << endl;
141                                 out << querySeqs[j]->getAligned() << endl;
142                                 out.close();
143                                 
144                                 string t =querySeqs[j]->getName() + ".ref";
145                                 openOutputFile(t, out);
146                                 out << ">" << bestfit[j]->getName() << endl;
147                                 out << bestfit[j]->getAligned() << endl;
148                                 out.close();
149                 }
150
151                 
152                 //find P
153                 mothurOut("Getting conservation... "); cout.flush();
154                 if (consfile == "") { 
155                         mothurOut("Calculating probability of conservation for your template sequences.  This can take a while...  I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command.  Providing the .freq file will improve speed.    "); cout.flush();
156                         probabilityProfile = decalc->calcFreq(templateSeqs, templateFile); 
157                         mothurOut("Done."); mothurOutEndLine();
158                 }else                           {   probabilityProfile = readFreq();                      }
159
160                 //make P into Q
161                 for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];       cout << i << '\t' << probabilityProfile[i] << endl;  }
162                 mothurOut("Done."); mothurOutEndLine();
163                 
164                 //mask querys
165                 for (int i = 0; i < querySeqs.size(); i++) {
166                 //cout << querySeqs[i]->getName() << " before mask = " << querySeqs[i]->getAligned() << endl << endl;
167                         decalc->runMask(querySeqs[i]);
168                 //cout << querySeqs[i]->getName() << " after mask = " << querySeqs[i]->getAligned() << endl << endl;
169                 }
170                 
171                 //mask templates
172                 for (int i = 0; i < templateSeqs.size(); i++) {
173                         decalc->runMask(templateSeqs[i]);
174                 }
175                 
176 //for (int i = 0; i < lines.size(); i++) { cout << "line pair " << i << " = " << lines[i]->start << '\t' << lines[i]->end << endl;  }
177                                 
178                 if (processors == 1) { 
179         
180                         for (int j = 0; j < bestfit.size(); j++) { 
181                         cout << querySeqs[j]->getName() << " after mask = " << querySeqs[j]->getAligned() << endl << endl;
182                         cout << bestfit[j]->getName() << " after mask = " << bestfit[j]->getAligned() << endl << endl;
183                                 decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);  
184                         }
185                         
186                         mothurOut("Finding window breaks... "); cout.flush();
187                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
188                                 it = trimmed[i].begin();
189 cout << i << '\t' << "trimmed = " << it->first << '\t' << it->second << endl;
190                                 vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
191                                 windowsForeachQuery[i] = win;
192                         }
193                         mothurOut("Done."); mothurOutEndLine();
194                 
195                 }else {         createProcessesSpots();         }
196
197                 
198                 if (processors == 1) { 
199                                                 
200                         mothurOut("Calculating observed distance... "); cout.flush();
201                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
202         cout << querySeqs[i]->getName() << '\t' << bestfit[i]->getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl;
203                                 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
204         
205         for (int j = 0; j < obsi.size(); j++) {
206                 cout << obsi[j] << '\t';
207         }
208         cout << endl;
209                                 obsDistance[i] = obsi;
210                         }
211                         mothurOut("Done."); mothurOutEndLine();
212                         
213                         
214                         mothurOut("Finding variability... "); cout.flush();
215                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
216                                 vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
217
218                                 Qav[i] = q;
219 cout << i+1 << endl;
220 for (int j = 0; j < Qav[i].size(); j++) {
221         cout << Qav[i][j] << '\t';
222 }
223 cout << endl << endl;
224
225                         }
226                         mothurOut("Done."); mothurOutEndLine();
227                         
228                         
229                         mothurOut("Calculating alpha... "); cout.flush();
230                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
231                                 float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
232 cout << i+1 << "\tcoef = " << alpha << endl;
233                                 seqCoef[i] = alpha;
234                         }
235                         mothurOut("Done."); mothurOutEndLine();
236                 
237                 
238                         mothurOut("Calculating expected distance... "); cout.flush();
239                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
240                                 vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
241                                 expectedDistance[i] = exp;
242                         }
243                         mothurOut("Done."); mothurOutEndLine();
244                         
245                         
246                         mothurOut("Finding deviation... "); cout.flush();
247                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
248                                 float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
249                                 DE[i] = de;
250 cout << querySeqs[i]->getName() << '\t' << "de value = " << de << endl;                         
251                                 it = trimmed[i].begin();
252                                 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
253 cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl;
254                                 deviation[i] = dist;
255                         }
256                         mothurOut("Done."); mothurOutEndLine();
257                         
258                 } 
259                 else {          createProcesses();              }
260                 
261                 
262                 //quantiles are used to determine whether the de values found indicate a chimera
263                 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
264                 //combination of sequences in the template
265                 if (quanfile != "") {  quantiles =  readQuantiles();  }
266                 else {
267                         
268                         mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
269                         if (processors == 1) { 
270                                 quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
271                         }else {         createProcessesQuan();          }
272                         
273                         ofstream out4;
274                         string o = getRootName(templateFile) + "quan";
275                         
276                         openOutputFile(o, out4);
277                         
278                         //adjust quantiles
279                         for (int i = 0; i < quantiles.size(); i++) {
280                                 if (quantiles[i].size() == 0) {
281                                         //in case this is not a distance found in your template files
282                                         for (int g = 0; g < 6; g++) {
283                                                 quantiles[i].push_back(0.0);
284                                         }
285                                 }else{
286                                         
287                                         sort(quantiles[i].begin(), quantiles[i].end());
288                                         
289                                         vector<float> temp;
290                                         //save 10%
291                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]);
292                                         //save 25%
293                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)]);
294                                         //save 50%
295                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)]);
296                                         //save 75%
297                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)]);
298                                         //save 95%
299                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)]);
300                                         //save 99%
301                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]);
302                                         
303                                         quantiles[i] = temp;
304                                 }
305                                 
306                                 //output quan value
307                                 out4 << i+1 << '\t';                            
308                                 for (int u = 0; u < quantiles[i].size(); u++) {   out4 << quantiles[i][u] << '\t'; }
309                                 out4 << endl;
310
311                         }
312                         
313                         mothurOut("Done."); mothurOutEndLine();
314                 }
315         
316                 //free memory
317                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
318                 for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
319                         
320                 delete distcalculator;
321                 delete decalc;
322         }
323         catch(exception& e) {
324                 errorOut(e, "Pintail", "getChimeras");
325                 exit(1);
326         }
327 }
328
329 //***************************************************************************************************************
330
331 vector<float> Pintail::readFreq() {
332         try {
333         
334                 ifstream in;
335                 openInputFile(consfile, in);
336                 
337                 vector<float> prob;
338                 set<int> h = decalc->getPos();  //positions of bases in masking sequence
339                 
340                 //read in probabilities and store in vector
341                 int pos; float num; 
342                 
343                 while(!in.eof()){
344                         
345                         in >> pos >> num;
346                         
347                         if (h.count(pos-1) > 0) {
348                                 float Pi;
349                                 Pi =  (num - 0.25) / 0.75; 
350                         
351                                 //cannot have probability less than 0.
352                                 if (Pi < 0) { Pi = 0.0; }
353
354                                 //do you want this spot
355                                 prob.push_back(Pi);  
356                         }
357                         
358                         gobble(in);
359                 }
360                 
361                 in.close();
362                 return prob;
363                 
364         }
365         catch(exception& e) {
366                 errorOut(e, "Pintail", "readFreq");
367                 exit(1);
368         }
369 }
370
371 //***************************************************************************************************************
372
373 vector< vector<float> > Pintail::readQuantiles() {
374         try {
375         
376                 ifstream in;
377                 openInputFile(quanfile, in);
378                 
379                 vector< vector<float> > quan;
380         
381                 int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; 
382                 
383                 while(!in.eof()){
384                         
385                         in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; 
386                         
387                         vector <float> temp;
388                         
389                         temp.push_back(ten); 
390                         temp.push_back(twentyfive);
391                         temp.push_back(fifty);
392                         temp.push_back(seventyfive);
393                         temp.push_back(ninetyfive);
394                         temp.push_back(ninetynine);
395                         
396                         quan.push_back(temp);  
397                         
398                         gobble(in);
399                 }
400                 
401                 in.close();
402                 return quan;
403                 
404         }
405         catch(exception& e) {
406                 errorOut(e, "Pintail", "readQuantiles");
407                 exit(1);
408         }
409 }
410 //***************************************************************************************************************
411 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
412 vector<Sequence*> Pintail::findPairs(int start, int end) {
413         try {
414                 
415                 vector<Sequence*> seqsMatches;  
416                 
417                 for(int i = start; i < end; i++){
418                 
419                         float smallest = 10000.0;
420                         Sequence query = *(querySeqs[i]);
421                         Sequence* match;
422                         
423                         for(int j = 0; j < templateSeqs.size(); j++){
424                                 
425                                 Sequence temp = *(templateSeqs[j]);
426                                 
427                                 distcalculator->calcDist(query, temp);
428                                 float dist = distcalculator->getDist();
429                                 
430                                 if (dist < smallest) { 
431                                         match = templateSeqs[j];
432                                         smallest = dist;
433                                 }
434                         }
435                         
436                         seqsMatches.push_back(match);
437                 }
438                 
439                 return seqsMatches;
440         
441         }
442         catch(exception& e) {
443                 errorOut(e, "Pintail", "findPairs");
444                 exit(1);
445         }
446 }
447
448 /**************************************************************************************************/
449
450 void Pintail::createProcessesSpots() {
451         try {
452 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
453                 int process = 0;
454                 vector<int> processIDS;
455                 
456                 //loop through and create all the processes you want
457                 while (process != processors) {
458                         int pid = fork();
459                         
460                         if (pid > 0) {
461                                 processIDS.push_back(pid);  
462                                 process++;
463                         }else if (pid == 0){
464                                 
465                                 for (int j = lines[process]->start; j < lines[process]->end; j++) {
466                                 
467                                         //chops off beginning and end of sequences so they both start and end with a base
468                                         map<int, int> trim;
469                                         decalc->trimSeqs(querySeqs[j], bestfit[j], trim); 
470                                         trimmed[j] = trim;
471                                         
472                                 }
473
474                                 mothurOut("Finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
475                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
476                                         it = trimmed[i].begin();
477                                         windowsForeachQuery[i] = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
478                                 }
479                                 mothurOut("Done finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
480                                 
481                                 //write out data to file so parent can read it
482                                 ofstream out;
483                                 string s = toString(getpid()) + ".temp";
484                                 openOutputFile(s, out);
485                                 
486                                 //output windowsForeachQuery
487                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
488                                         out << windowsForeachQuery[i].size() << '\t';
489                                         for (int j = 0; j < windowsForeachQuery[i].size(); j++) {
490                                                 out << windowsForeachQuery[i][j] << '\t';
491                                         }
492                                         out << endl;
493                                 }
494                         
495                                 //output windowSizes
496                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
497                                         out << windowSizes[i] << '\t';
498                                 }
499                                 out << endl;
500                                 
501                                 //output trimmed values
502                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
503                                         it = trimmed[i].begin();
504                                         
505                                         out << it->first << '\t' << it->second << endl;
506                                 }
507                                 out.close();
508                                 
509                                 exit(0);
510                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
511                 }
512                 
513                 //force parent to wait until all the processes are done
514                 for (int i=0;i<processors;i++) { 
515                         int temp = processIDS[i];
516                         wait(&temp);
517                 }
518                 
519                 //get data created by processes
520                 for (int i=0;i<processors;i++) { 
521                         ifstream in;
522                         string s = toString(processIDS[i]) + ".temp";
523                         openInputFile(s, in);
524                         
525                         int size = lines[i]->end - lines[i]->start;
526                                         
527                         int count = lines[i]->start;
528                         for (int m = 0; m < size; m++) {
529                                 int num;
530                                 in >> num;
531                         
532                                 vector<int> win;  int w;
533                                 for (int j = 0; j < num; j++) {
534                                         in >> w;
535                                         win.push_back(w);
536                                 }
537                         
538                                 windowsForeachQuery[count] = win;
539                                 count++;
540                                 gobble(in);
541                         }
542                 
543                         gobble(in);
544                         count = lines[i]->start;
545                         for (int m = 0; m < size; m++) {
546                                 int num;
547                                 in >> num;
548                                 
549                                 windowSizes[count] = num;
550                                 count++;
551                         }
552                         
553                         gobble(in);
554                         
555                         count = lines[i]->start;
556                         for (int m = 0; m < size; m++) {
557                                 int front, back;
558                                 in >> front >> back;
559                                 
560                                 map<int, int> t;
561                                 
562                                 t[front] = back;
563                                 
564                                 trimmed[count] = t;
565                                 count++;
566                                 
567                                 gobble(in);
568                         }
569
570                         
571                         in.close();
572                         remove(s.c_str());
573                 }
574                         
575         
576 #else
577                 for (int j = 0; j < bestfit.size(); j++) {
578                         //chops off beginning and end of sequences so they both start and end with a base
579                         decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);  
580                 }
581
582                 for (int i = lines[0]->start; i < lines[0]->end; i++) {
583                                 it = trimmed[i].begin();
584                                 map<int, int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
585                                 windows[i] = win;
586                 }
587
588 #endif          
589         }
590         catch(exception& e) {
591                 errorOut(e, "Pintail", "createProcessesSpots");
592                 exit(1);
593         }
594 }
595 /**************************************************************************************************/
596
597 void Pintail::createProcessesPairs() {
598         try {
599 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
600                 int process = 0;
601                 vector<int> processIDS;
602                 
603                 //loop through and create all the processes you want
604                 while (process != processors) {
605                         int pid = fork();
606                         
607                         if (pid > 0) {
608                                 processIDS.push_back(pid);  
609                                 process++;
610                         }else if (pid == 0){
611                                 
612                                 mothurOut("Finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
613                                 bestfit = findPairs(lines[process]->start, lines[process]->end);
614                                 mothurOut("Done finding pairs for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
615                                 
616                                 //write out data to file so parent can read it
617                                 ofstream out;
618                                 string s = toString(getpid()) + ".temp";
619                                 openOutputFile(s, out);
620                                 
621                                 //output range and size
622                                 out << bestfit.size() << endl;
623                                 
624                                 //output pairs
625                                 for (int i = 0; i < bestfit.size(); i++) {
626                                         out << ">" << bestfit[i]->getName() << endl << bestfit[i]->getAligned() << endl;
627                                 }
628                                 out.close();
629                                 
630                                 exit(0);
631                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
632                 }
633                 
634                 //force parent to wait until all the processes are done
635                 for (int i=0;i<processors;i++) { 
636                         int temp = processIDS[i];
637                         wait(&temp);
638                 }
639                 
640                 //get data created by processes
641                 for (int i=0;i<processors;i++) { 
642                         ifstream in;
643                         string s = toString(processIDS[i]) + ".temp";
644                         openInputFile(s, in);
645                         
646                         int size;
647                         in >> size;  gobble(in);
648                                 
649                         //get pairs
650                         int count = lines[i]->start;
651                         for (int m = 0; m < size; m++) {
652                                 Sequence* temp = new Sequence(in);
653                                 bestfit[count] = temp;
654                         
655                                 count++;
656                                 gobble(in);
657                         }
658                         
659                         in.close();
660                         remove(s.c_str());
661                 }
662                         
663         
664 #else
665                 bestfit = findPairs(lines[0]->start, lines[0]->end);
666 #endif          
667         }
668         catch(exception& e) {
669                 errorOut(e, "Pintail", "createProcessesPairs");
670                 exit(1);
671         }
672 }
673 /**************************************************************************************************/
674
675 void Pintail::createProcesses() {
676         try {
677 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
678                 int process = 0;
679                 vector<int> processIDS;
680                 
681                 //loop through and create all the processes you want
682                 while (process != processors) {
683                         int pid = fork();
684                         
685                         if (pid > 0) {
686                                 processIDS.push_back(pid);  
687                                 process++;
688                         }else if (pid == 0){
689                                 
690                                 mothurOut("Calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
691                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
692                                         
693                                         vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
694                                         obsDistance[i] = obsi;
695                                 
696                                         //calc Qav
697                                         vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
698                                         
699                                         //get alpha
700                                         float alpha = decalc->getCoef(obsDistance[i], q);
701                                         
702                                         //find expected
703                                         vector<float> exp = decalc->calcExpected(q, alpha);
704                                         expectedDistance[i] = exp;
705                                         
706                                         //get de and deviation
707                                         float dei = decalc->calcDE(obsi, exp);
708                                         DE[i] = dei;
709                                         
710                                         it = trimmed[i].begin();
711                                         float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
712                                         deviation[i] = dist;
713                                 }
714                                 mothurOut("Done calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
715                                 
716                                 //write out data to file so parent can read it
717                                 ofstream out;
718                                 string s = toString(getpid()) + ".temp";
719                                 openOutputFile(s, out);
720                                 
721                                 int size = lines[process]->end - lines[process]->start;
722                                 out << size << endl;
723                                                                 
724                                 //output observed distances
725                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
726                                         out << obsDistance[i].size() << '\t';
727                                         for (int j = 0; j < obsDistance[i].size(); j++) {
728                                                 out << obsDistance[i][j] << '\t';
729                                         }
730                                         out << endl;
731                                 }
732                                 
733                                 
734                                 //output expected distances
735                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
736                                         out << expectedDistance[i].size() << '\t';
737                                         for (int j = 0; j < expectedDistance[i].size(); j++) {
738                                                 out << expectedDistance[i][j] << '\t';
739                                         }
740                                         out << endl;
741                                 }
742
743                         
744                                 //output de values
745                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
746                                         out << DE[i] << '\t';
747                                 }
748                                 out << endl;    
749                                 
750                                 //output de values
751                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
752                                         out << deviation[i] << '\t';
753                                 }
754                                 out << endl;
755                                 
756                                 out.close();
757
758                                 exit(0);
759                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
760                 }
761                 
762                 //force parent to wait until all the processes are done
763                 for (int i=0;i<processors;i++) { 
764                         int temp = processIDS[i];
765                         wait(&temp);
766                 }
767                 
768                 //get data created by processes
769                 for (int i=0;i<processors;i++) { 
770                         ifstream in;
771                         string s = toString(processIDS[i]) + ".temp";
772                         openInputFile(s, in);
773                         
774                         int size;
775                         in >> size;  gobble(in);
776                         
777                         //get observed distances
778                         int count = lines[i]->start;
779                         for (int m = 0; m < size; m++) {
780                                 int num;
781                                 in >> num;
782                         
783                                 vector<float> obs;  float w;
784                                 for (int j = 0; j < num; j++) {
785                                         in >> w;
786                                         obs.push_back(w);
787                                 }
788                         
789                                 obsDistance[count] = obs;
790                                 count++;
791                                 gobble(in);
792                         }
793                         
794                         gobble(in);
795                         
796                         //get expected distances
797                         count = lines[i]->start;
798                         for (int m = 0; m < size; m++) {
799                                 int num;
800                                 in >> num;
801                         
802                                 vector<float> exp;  float w;
803                                 for (int j = 0; j < num; j++) {
804                                         in >> w;
805                                         exp.push_back(w);
806                                 }
807                         
808                                 expectedDistance[count] = exp;
809                                 count++;
810                                 gobble(in);
811                         }
812
813                         gobble(in);
814                         
815                         count = lines[i]->start;
816                         for (int m = 0; m < size; m++) {
817                                 float num;
818                                 in >> num;
819                                 
820                                 DE[count] = num;
821                                 count++;
822                         }
823                         
824                         gobble(in);
825                         
826                         count = lines[i]->start;
827                         for (int m = 0; m < size; m++) {
828                                 float num;
829                                 in >> num;
830                                 
831                                 deviation[count] = num;
832                                 count++;
833                         }
834
835                         in.close();
836                         remove(s.c_str());
837                 }
838
839                                 
840 #else
841                         mothurOut("Calculating observed distance... "); cout.flush();
842                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
843                                 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]);
844                                 obsDistance[i] = obsi;
845                         }
846                         mothurOut("Done."); mothurOutEndLine();
847                         
848                         
849                         
850                         mothurOut("Finding variability... "); cout.flush();
851                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
852                                 vector<float> q = decalc->findQav(windows[i], windowSizes[i], probabilityProfile, h[i]);
853                                 Qav[i] = q;
854                         }
855                         mothurOut("Done."); mothurOutEndLine();
856                         
857                         
858                         
859                         mothurOut("Calculating alpha... "); cout.flush();
860                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
861                                 float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
862                                 seqCoef.push_back(alpha);
863                         }
864                         mothurOut("Done."); mothurOutEndLine();
865                 
866                 
867                 
868                         mothurOut("Calculating expected distance... "); cout.flush();
869                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
870                                 vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
871                                 expectedDistance[i] = exp;
872                         }
873                         mothurOut("Done."); mothurOutEndLine();
874                         
875                         
876                         
877                         mothurOut("Finding deviation... "); cout.flush();
878                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
879                                 float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
880                                 DE[i] = de;
881                                 
882                                 it = trimmed[i].begin();
883                                 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
884                                 deviation[i] = dist;
885                         }
886                         mothurOut("Done."); mothurOutEndLine();
887
888 #endif          
889         }
890         catch(exception& e) {
891                 errorOut(e, "Pintail", "createProcesses");
892                 exit(1);
893         }
894 }
895
896
897 /**************************************************************************************************/
898
899 void Pintail::createProcessesQuan() {
900         try {
901 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
902                 int process = 0;
903                 vector<int> processIDS;
904                                 
905                 //loop through and create all the processes you want
906                 while (process != processors) {
907                         int pid = fork();
908                         
909                         if (pid > 0) {
910                                 processIDS.push_back(pid);  
911                                 process++;
912                         }else if (pid == 0){
913                                 
914                                 quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
915                                 
916                                 //write out data to file so parent can read it
917                                 ofstream out;
918                                 string s = toString(getpid()) + ".temp";
919                                 openOutputFile(s, out);
920                                 
921                                                                 
922                                 //output observed distances
923                                 for (int i = 0; i < quantiles.size(); i++) {
924                                         out << quantiles[i].size() << '\t';
925                                         for (int j = 0; j < quantiles[i].size(); j++) {
926                                                 out << quantiles[i][j] << '\t';
927                                         }
928                                         out << endl;
929                                 }
930                                 
931                                 out.close();
932                                 
933                                 exit(0);
934                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
935                 }
936                 
937                 //force parent to wait until all the processes are done
938                 for (int i=0;i<processors;i++) { 
939                         int temp = processIDS[i];
940                         wait(&temp);
941                 }
942                 
943                 //get data created by processes
944                 for (int i=0;i<processors;i++) { 
945                         ifstream in;
946                         string s = toString(processIDS[i]) + ".temp";
947                         openInputFile(s, in);
948                         
949                         vector< vector<float> > quan; quan.resize(100);
950                         
951                         //get quantiles
952                         for (int m = 0; m < quan.size(); m++) {
953                                 int num;
954                                 in >> num;
955                         
956                                 vector<float> q;  float w;
957                                 for (int j = 0; j < num; j++) {
958                                         in >> w;
959                                         q.push_back(w);
960                                 }
961                         
962                                 quan[m] = q;
963                                 gobble(in);
964                         }
965                         
966                         
967                         //save quan in quantiles
968                         for (int i = 0; i < quan.size(); i++) {
969                                 //put all values of q[i] into quan[i]
970                                 quantiles[i].insert(quantiles[i].begin(), quan[i].begin(), quan[i].end());
971                         }
972                                         
973                         in.close();
974                         remove(s.c_str());
975                 }
976                 
977 #else
978                 quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
979 #endif          
980         }
981         catch(exception& e) {
982                 errorOut(e, "Pintail", "createProcessesQuan");
983                 exit(1);
984         }
985 }
986
987
988 //***************************************************************************************************************
989
990