]> git.donarmstrong.com Git - mothur.git/blob - pintail.cpp
working on chimeras
[mothur.git] / pintail.cpp
1 /*
2  *  pintail.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/9/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "pintail.h"
11 #include "ignoregaps.h"
12
13 //***************************************************************************************************************
14
15 Pintail::Pintail(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
16 //***************************************************************************************************************
17
18 Pintail::~Pintail() {
19         try {
20                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
21                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
22                 
23                 if (processors != 1) {   for (int i = 0; i < bestfit.size(); i++)       {  delete bestfit[i];   }  }
24         }
25         catch(exception& e) {
26                 errorOut(e, "Pintail", "~Pintail");
27                 exit(1);
28         }
29 }       
30 //***************************************************************************************************************
31 void Pintail::print(ostream& out) {
32         try {
33                 
34                 for (int i = 0; i < querySeqs.size(); i++) {
35                         
36                         int index = ceil(deviation[i]);
37 float quan = 2.64 * log10(deviation[i]) + 1.46;
38 cout << "dist = " << index << endl;
39 cout << "de = "  << DE[i] << endl;
40 cout << "mallard quantile = " << quan << endl;
41 cout << "my quantile = " << quantiles[index][4] << endl;
42                         
43                         //is your DE value higher than the 95%
44                         string chimera;
45                         if (DE[i] > quantiles[index][4])                {       chimera = "Yes";        }
46                         else                                                                    {       chimera = "No";         }
47                         
48                         out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
49                         if (chimera == "Yes") {
50                                 mothurOut(querySeqs[i]->getName() + "\tdiv: " + toString(deviation[i]) + "\tstDev: " + toString(DE[i]) + "\tchimera flag: " + chimera); mothurOutEndLine();
51                         }
52                         out << "Observed\t";
53                         
54                         for (int j = 0; j < obsDistance[i].size(); j++) {  out << obsDistance[i][j] << '\t';  }
55                         out << endl;
56                         
57                         out << "Expected\t";
58                         
59                         for (int m = 0; m < expectedDistance[i].size(); m++) {  out << expectedDistance[i][m] << '\t';  }
60                         out << endl;
61                         
62                 }
63         }
64         catch(exception& e) {
65                 errorOut(e, "Pintail", "print");
66                 exit(1);
67         }
68 }
69
70 //***************************************************************************************************************
71 void Pintail::getChimeras() {
72         try {
73                 
74                 //read in query sequences and subject sequences
75                 mothurOut("Reading sequences and template file... "); cout.flush();
76                 querySeqs = readSeqs(fastafile);
77                 templateSeqs = readSeqs(templateFile);
78                 mothurOut("Done."); mothurOutEndLine();
79                 
80                 int numSeqs = querySeqs.size();
81                 
82                 obsDistance.resize(numSeqs);
83                 expectedDistance.resize(numSeqs);
84                 seqCoef.resize(numSeqs);
85                 DE.resize(numSeqs);
86                 Qav.resize(numSeqs);
87                 bestfit.resize(numSeqs);
88                 deviation.resize(numSeqs);
89                 trimmed.resize(numSeqs);
90                 windowSizes.resize(numSeqs, window);
91                 windowSizesTemplate.resize(templateSeqs.size(), window);
92                 windowsForeachQuery.resize(numSeqs);
93                 h.resize(numSeqs);
94                 quantiles.resize(100);  //one for every percent mismatch
95                 
96                 //break up file if needed
97                 int linesPerProcess = numSeqs / processors ;
98                 
99                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
100                         //find breakup of sequences for all times we will Parallelize
101                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
102                         else {
103                                 //fill line pairs
104                                 for (int i = 0; i < (processors-1); i++) {                      
105                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
106                                 }
107                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
108                                 int i = processors - 1;
109                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
110                         }
111                         
112                         //find breakup of templatefile for quantiles
113                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
114                         else { 
115                                 for (int i = 0; i < processors; i++) {
116                                         templateLines.push_back(new linePair());
117                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
118                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
119                                 }
120                         }
121                 #else
122                         lines.push_back(new linePair(0, numSeqs));
123                         templateLines.push_back(new linePair(0, templateSeqs.size()));
124                 #endif
125                 
126                 distcalculator = new ignoreGaps();
127                 decalc = new DeCalculator();
128                 
129                 //if the user does enter a mask then you want to keep all the spots in the alignment
130                 if (seqMask.length() == 0)      {       decalc->setAlignmentLength(querySeqs[0]->getAligned().length());        }
131                 else                                            {       decalc->setAlignmentLength(seqMask.length());                                           }
132                 
133                 decalc->setMask(seqMask);
134                 
135                 //find pairs
136                 if (processors == 1) { 
137                         mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
138                         bestfit = findPairs(lines[0]->start, lines[0]->end);
139                         mothurOut("Done."); mothurOutEndLine();
140                 }else {         createProcessesPairs();         }
141                 
142                 
143                 for (int j = 0; j < bestfit.size(); j++) { 
144                                 //chops off beginning and end of sequences so they both start and end with a base
145                                 ofstream out;
146                                 string s = querySeqs[j]->getName();
147                                 
148                                 openOutputFile(s, out);
149                                 out << ">" << querySeqs[j]->getName() << endl;
150                                 out << querySeqs[j]->getAligned() << endl;
151                                 out.close();
152                                 
153                                 string t =querySeqs[j]->getName() + ".ref";
154                                 openOutputFile(t, out);
155                                 out << ">" << bestfit[j]->getName() << endl;
156                                 out << bestfit[j]->getAligned() << endl;
157                                 out.close();
158                 }
159
160                 
161                 //find P
162                 mothurOut("Getting conservation... "); cout.flush();
163                 if (consfile == "") { 
164                         mothurOut("Calculating probability of conservation for your template sequences.  This can take a while...  I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command.  Providing the .freq file will improve speed.    "); cout.flush();
165                         probabilityProfile = decalc->calcFreq(templateSeqs, templateFile); 
166                         mothurOut("Done."); mothurOutEndLine();
167                 }else                           {   probabilityProfile = readFreq();                      }
168
169                 //make P into Q
170                 for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
171                 mothurOut("Done."); mothurOutEndLine();
172                 
173                 //mask querys
174                 for (int i = 0; i < querySeqs.size(); i++) {
175                 //cout << querySeqs[i]->getName() << " before mask = " << querySeqs[i]->getAligned() << endl << endl;
176                         decalc->runMask(querySeqs[i]);
177                 //cout << querySeqs[i]->getName() << " after mask = " << querySeqs[i]->getAligned() << endl << endl;
178                 }
179                 
180                 //mask templates
181                 for (int i = 0; i < templateSeqs.size(); i++) {
182                         decalc->runMask(templateSeqs[i]);
183                 }
184                 
185 //for (int i = 0; i < lines.size(); i++) { cout << "line pair " << i << " = " << lines[i]->start << '\t' << lines[i]->end << endl;  }
186                                 
187                 if (processors == 1) { 
188         
189                         for (int j = 0; j < bestfit.size(); j++) { 
190                         //cout << querySeqs[j]->getName() << " after mask = " << querySeqs[j]->getAligned() << endl << endl;
191                         ///cout << bestfit[j]->getName() << " after mask = " << bestfit[j]->getAligned() << endl << endl;
192                                 decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);  
193                         }
194                         
195                         mothurOut("Finding window breaks... "); cout.flush();
196                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
197                                 it = trimmed[i].begin();
198 //cout << querySeqs[i]->getName() << '\t' << "trimmed = " << it->first << '\t' << it->second << endl;
199                                 vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
200                                 windowsForeachQuery[i] = win;
201                         }
202                         mothurOut("Done."); mothurOutEndLine();
203                 
204                 }else {         createProcessesSpots();         }
205
206                 
207                 if (processors == 1) { 
208                                                 
209                         mothurOut("Calculating observed distance... "); cout.flush();
210                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
211         //cout << querySeqs[i]->getName() << '\t' << bestfit[i]->getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl;
212                                 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
213         
214         for (int j = 0; j < obsi.size(); j++) {
215                 //cout << obsi[j] << '\t';
216         }
217         //cout << endl;
218                                 obsDistance[i] = obsi;
219                         }
220                         mothurOut("Done."); mothurOutEndLine();
221                         
222                         
223                         mothurOut("Finding variability... "); cout.flush();
224                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
225                                 vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
226
227                                 Qav[i] = q;
228 //cout << querySeqs[i]->getName() << endl;
229 for (int j = 0; j < Qav[i].size(); j++) {
230         //cout << Qav[i][j] << '\t';
231 }
232 //cout << endl << endl;
233
234                         }
235                         mothurOut("Done."); mothurOutEndLine();
236                         
237                         
238                         mothurOut("Calculating alpha... "); cout.flush();
239                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
240                                 float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
241 //cout << querySeqs[i]->getName() << "\tcoef = " << alpha << endl;
242                                 seqCoef[i] = alpha;
243                         }
244                         mothurOut("Done."); mothurOutEndLine();
245                 
246                 
247                         mothurOut("Calculating expected distance... "); cout.flush();
248                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
249                                 vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
250                                 expectedDistance[i] = exp;
251                         }
252                         mothurOut("Done."); mothurOutEndLine();
253                         
254                         
255                         mothurOut("Finding deviation... "); cout.flush();
256                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
257                                 float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
258                                 DE[i] = de;
259 //cout << querySeqs[i]->getName() << '\t' << "de value = " << de << endl;                               
260                                 it = trimmed[i].begin();
261                                 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
262 //cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl;
263                                 deviation[i] = dist;
264                         }
265                         mothurOut("Done."); mothurOutEndLine();
266                         
267                 } 
268                 else {          createProcesses();              }
269                 
270                 
271                 //quantiles are used to determine whether the de values found indicate a chimera
272                 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
273                 //combination of sequences in the template
274                 if (quanfile != "") {  quantiles =  readQuantiles();  }
275                 else {
276                         
277                         mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
278                         if (processors == 1) { 
279                                 quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
280                         }else {         createProcessesQuan();          }
281                         
282                         
283                         decalc->removeObviousOutliers(quantiles);
284                         
285                         ofstream out4;
286                         string o = getRootName(templateFile) + "quan";
287                         
288                         openOutputFile(o, out4);
289                         
290                         //adjust quantiles
291                         for (int i = 0; i < quantiles.size(); i++) {
292                                 if (quantiles[i].size() == 0) {
293                                         //in case this is not a distance found in your template files
294                                         for (int g = 0; g < 6; g++) {
295                                                 quantiles[i].push_back(0.0);
296                                         }
297                                 }else{
298                                         
299                                         sort(quantiles[i].begin(), quantiles[i].end());
300                                         
301                                         vector<float> temp;
302                                         //save 10%
303                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]);
304                                         //save 25%
305                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)]);
306                                         //save 50%
307                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)]);
308                                         //save 75%
309                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)]);
310                                         //save 95%
311                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)]);
312                                         //save 99%
313                                         temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]);
314                                         
315                                         quantiles[i] = temp;
316                                 }
317                                 
318                                 //output quan value
319                                 out4 << i+1 << '\t';                            
320                                 for (int u = 0; u < quantiles[i].size(); u++) {   out4 << quantiles[i][u] << '\t'; }
321                                 out4 << endl;
322
323                         }
324                         
325                         mothurOut("Done."); mothurOutEndLine();
326                 }
327         
328                 //free memory
329                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
330                 for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
331                         
332                 delete distcalculator;
333                 delete decalc;
334         }
335         catch(exception& e) {
336                 errorOut(e, "Pintail", "getChimeras");
337                 exit(1);
338         }
339 }
340
341 //***************************************************************************************************************
342
343 vector<float> Pintail::readFreq() {
344         try {
345         
346                 ifstream in;
347                 openInputFile(consfile, in);
348                 
349                 vector<float> prob;
350                 set<int> h = decalc->getPos();  //positions of bases in masking sequence
351                 
352                 //read in probabilities and store in vector
353                 int pos; float num; 
354                 
355                 while(!in.eof()){
356                         
357                         in >> pos >> num;
358                         
359                         if (h.count(pos) > 0) {
360                                 float Pi;
361                                 Pi =  (num - 0.25) / 0.75; 
362                         
363                                 //cannot have probability less than 0.
364                                 if (Pi < 0) { Pi = 0.0; }
365
366                                 //do you want this spot
367                                 prob.push_back(Pi);  
368                         }
369                         
370                         gobble(in);
371                 }
372                 
373                 in.close();
374                 return prob;
375                 
376         }
377         catch(exception& e) {
378                 errorOut(e, "Pintail", "readFreq");
379                 exit(1);
380         }
381 }
382
383 //***************************************************************************************************************
384
385 vector< vector<float> > Pintail::readQuantiles() {
386         try {
387         
388                 ifstream in;
389                 openInputFile(quanfile, in);
390                 
391                 vector< vector<float> > quan;
392         
393                 int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; 
394                 
395                 while(!in.eof()){
396                         
397                         in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; 
398                         
399                         vector <float> temp;
400                         
401                         temp.push_back(ten); 
402                         temp.push_back(twentyfive);
403                         temp.push_back(fifty);
404                         temp.push_back(seventyfive);
405                         temp.push_back(ninetyfive);
406                         temp.push_back(ninetynine);
407                         
408                         quan.push_back(temp);  
409                         
410                         gobble(in);
411                 }
412                 
413                 in.close();
414                 return quan;
415                 
416         }
417         catch(exception& e) {
418                 errorOut(e, "Pintail", "readQuantiles");
419                 exit(1);
420         }
421 }
422 //***************************************************************************************************************
423 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
424 vector<Sequence*> Pintail::findPairs(int start, int end) {
425         try {
426                 
427                 vector<Sequence*> seqsMatches;  
428                 
429                 for(int i = start; i < end; i++){
430                 
431                         float smallest = 10000.0;
432                         Sequence query = *(querySeqs[i]);
433                         Sequence* match;
434                         
435                         for(int j = 0; j < templateSeqs.size(); j++){
436                                 
437                                 Sequence temp = *(templateSeqs[j]);
438                                 
439                                 distcalculator->calcDist(query, temp);
440                                 float dist = distcalculator->getDist();
441                                 
442                                 if (dist < smallest) { 
443                                         match = templateSeqs[j];
444                                         smallest = dist;
445                                 }
446                         }
447                         
448                         seqsMatches.push_back(match);
449                 }
450                 
451                 return seqsMatches;
452         
453         }
454         catch(exception& e) {
455                 errorOut(e, "Pintail", "findPairs");
456                 exit(1);
457         }
458 }
459
460 /**************************************************************************************************/
461
462 void Pintail::createProcessesSpots() {
463         try {
464 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
465                 int process = 0;
466                 vector<int> processIDS;
467                 
468                 //loop through and create all the processes you want
469                 while (process != processors) {
470                         int pid = fork();
471                         
472                         if (pid > 0) {
473                                 processIDS.push_back(pid);  
474                                 process++;
475                         }else if (pid == 0){
476                                 
477                                 for (int j = lines[process]->start; j < lines[process]->end; j++) {
478                                 
479                                         //chops off beginning and end of sequences so they both start and end with a base
480                                         map<int, int> trim;
481                                         decalc->trimSeqs(querySeqs[j], bestfit[j], trim); 
482                                         trimmed[j] = trim;
483                                         
484                                 }
485
486                                 mothurOut("Finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
487                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
488                                         it = trimmed[i].begin();
489                                         windowsForeachQuery[i] = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
490                                 }
491                                 mothurOut("Done finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
492                                 
493                                 //write out data to file so parent can read it
494                                 ofstream out;
495                                 string s = toString(getpid()) + ".temp";
496                                 openOutputFile(s, out);
497                                 
498                                 //output windowsForeachQuery
499                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
500                                         out << windowsForeachQuery[i].size() << '\t';
501                                         for (int j = 0; j < windowsForeachQuery[i].size(); j++) {
502                                                 out << windowsForeachQuery[i][j] << '\t';
503                                         }
504                                         out << endl;
505                                 }
506                         
507                                 //output windowSizes
508                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
509                                         out << windowSizes[i] << '\t';
510                                 }
511                                 out << endl;
512                                 
513                                 //output trimmed values
514                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
515                                         it = trimmed[i].begin();
516                                         
517                                         out << it->first << '\t' << it->second << endl;
518                                 }
519                                 out.close();
520                                 
521                                 exit(0);
522                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
523                 }
524                 
525                 //force parent to wait until all the processes are done
526                 for (int i=0;i<processors;i++) { 
527                         int temp = processIDS[i];
528                         wait(&temp);
529                 }
530                 
531                 //get data created by processes
532                 for (int i=0;i<processors;i++) { 
533                         ifstream in;
534                         string s = toString(processIDS[i]) + ".temp";
535                         openInputFile(s, in);
536                         
537                         int size = lines[i]->end - lines[i]->start;
538                                         
539                         int count = lines[i]->start;
540                         for (int m = 0; m < size; m++) {
541                                 int num;
542                                 in >> num;
543                         
544                                 vector<int> win;  int w;
545                                 for (int j = 0; j < num; j++) {
546                                         in >> w;
547                                         win.push_back(w);
548                                 }
549                         
550                                 windowsForeachQuery[count] = win;
551                                 count++;
552                                 gobble(in);
553                         }
554                 
555                         gobble(in);
556                         count = lines[i]->start;
557                         for (int m = 0; m < size; m++) {
558                                 int num;
559                                 in >> num;
560                                 
561                                 windowSizes[count] = num;
562                                 count++;
563                         }
564                         
565                         gobble(in);
566                         
567                         count = lines[i]->start;
568                         for (int m = 0; m < size; m++) {
569                                 int front, back;
570                                 in >> front >> back;
571                                 
572                                 map<int, int> t;
573                                 
574                                 t[front] = back;
575                                 
576                                 trimmed[count] = t;
577                                 count++;
578                                 
579                                 gobble(in);
580                         }
581
582                         
583                         in.close();
584                         remove(s.c_str());
585                 }
586                         
587         
588 #else
589                 for (int j = 0; j < bestfit.size(); j++) {
590                         //chops off beginning and end of sequences so they both start and end with a base
591                         decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);  
592                 }
593
594                 for (int i = lines[0]->start; i < lines[0]->end; i++) {
595                                 it = trimmed[i].begin();
596                                 map<int, int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
597                                 windows[i] = win;
598                 }
599
600 #endif          
601         }
602         catch(exception& e) {
603                 errorOut(e, "Pintail", "createProcessesSpots");
604                 exit(1);
605         }
606 }
607 /**************************************************************************************************/
608
609 void Pintail::createProcessesPairs() {
610         try {
611 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
612                 int process = 0;
613                 vector<int> processIDS;
614                 
615                 //loop through and create all the processes you want
616                 while (process != processors) {
617                         int pid = fork();
618                         
619                         if (pid > 0) {
620                                 processIDS.push_back(pid);  
621                                 process++;
622                         }else if (pid == 0){
623                                 
624                                 mothurOut("Finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
625                                 bestfit = findPairs(lines[process]->start, lines[process]->end);
626                                 mothurOut("Done finding pairs for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
627                                 
628                                 //write out data to file so parent can read it
629                                 ofstream out;
630                                 string s = toString(getpid()) + ".temp";
631                                 openOutputFile(s, out);
632                                 
633                                 //output range and size
634                                 out << bestfit.size() << endl;
635                                 
636                                 //output pairs
637                                 for (int i = 0; i < bestfit.size(); i++) {
638                                         out << ">" << bestfit[i]->getName() << endl << bestfit[i]->getAligned() << endl;
639                                 }
640                                 out.close();
641                                 
642                                 exit(0);
643                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
644                 }
645                 
646                 //force parent to wait until all the processes are done
647                 for (int i=0;i<processors;i++) { 
648                         int temp = processIDS[i];
649                         wait(&temp);
650                 }
651                 
652                 //get data created by processes
653                 for (int i=0;i<processors;i++) { 
654                         ifstream in;
655                         string s = toString(processIDS[i]) + ".temp";
656                         openInputFile(s, in);
657                         
658                         int size;
659                         in >> size;  gobble(in);
660                                 
661                         //get pairs
662                         int count = lines[i]->start;
663                         for (int m = 0; m < size; m++) {
664                                 Sequence* temp = new Sequence(in);
665                                 bestfit[count] = temp;
666                         
667                                 count++;
668                                 gobble(in);
669                         }
670                         
671                         in.close();
672                         remove(s.c_str());
673                 }
674                         
675         
676 #else
677                 bestfit = findPairs(lines[0]->start, lines[0]->end);
678 #endif          
679         }
680         catch(exception& e) {
681                 errorOut(e, "Pintail", "createProcessesPairs");
682                 exit(1);
683         }
684 }
685 /**************************************************************************************************/
686
687 void Pintail::createProcesses() {
688         try {
689 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
690                 int process = 0;
691                 vector<int> processIDS;
692                 
693                 //loop through and create all the processes you want
694                 while (process != processors) {
695                         int pid = fork();
696                         
697                         if (pid > 0) {
698                                 processIDS.push_back(pid);  
699                                 process++;
700                         }else if (pid == 0){
701                                 
702                                 mothurOut("Calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
703                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
704                                         
705                                         vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
706                                         obsDistance[i] = obsi;
707                                 
708                                         //calc Qav
709                                         vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
710                                         
711                                         //get alpha
712                                         float alpha = decalc->getCoef(obsDistance[i], q);
713                                         
714                                         //find expected
715                                         vector<float> exp = decalc->calcExpected(q, alpha);
716                                         expectedDistance[i] = exp;
717                                         
718                                         //get de and deviation
719                                         float dei = decalc->calcDE(obsi, exp);
720                                         DE[i] = dei;
721                                         
722                                         it = trimmed[i].begin();
723                                         float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
724                                         deviation[i] = dist;
725                                 }
726                                 mothurOut("Done calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
727                                 
728                                 //write out data to file so parent can read it
729                                 ofstream out;
730                                 string s = toString(getpid()) + ".temp";
731                                 openOutputFile(s, out);
732                                 
733                                 int size = lines[process]->end - lines[process]->start;
734                                 out << size << endl;
735                                                                 
736                                 //output observed distances
737                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
738                                         out << obsDistance[i].size() << '\t';
739                                         for (int j = 0; j < obsDistance[i].size(); j++) {
740                                                 out << obsDistance[i][j] << '\t';
741                                         }
742                                         out << endl;
743                                 }
744                                 
745                                 
746                                 //output expected distances
747                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
748                                         out << expectedDistance[i].size() << '\t';
749                                         for (int j = 0; j < expectedDistance[i].size(); j++) {
750                                                 out << expectedDistance[i][j] << '\t';
751                                         }
752                                         out << endl;
753                                 }
754
755                         
756                                 //output de values
757                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
758                                         out << DE[i] << '\t';
759                                 }
760                                 out << endl;    
761                                 
762                                 //output de values
763                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
764                                         out << deviation[i] << '\t';
765                                 }
766                                 out << endl;
767                                 
768                                 out.close();
769
770                                 exit(0);
771                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
772                 }
773                 
774                 //force parent to wait until all the processes are done
775                 for (int i=0;i<processors;i++) { 
776                         int temp = processIDS[i];
777                         wait(&temp);
778                 }
779                 
780                 //get data created by processes
781                 for (int i=0;i<processors;i++) { 
782                         ifstream in;
783                         string s = toString(processIDS[i]) + ".temp";
784                         openInputFile(s, in);
785                         
786                         int size;
787                         in >> size;  gobble(in);
788                         
789                         //get observed distances
790                         int count = lines[i]->start;
791                         for (int m = 0; m < size; m++) {
792                                 int num;
793                                 in >> num;
794                         
795                                 vector<float> obs;  float w;
796                                 for (int j = 0; j < num; j++) {
797                                         in >> w;
798                                         obs.push_back(w);
799                                 }
800                         
801                                 obsDistance[count] = obs;
802                                 count++;
803                                 gobble(in);
804                         }
805                         
806                         gobble(in);
807                         
808                         //get expected distances
809                         count = lines[i]->start;
810                         for (int m = 0; m < size; m++) {
811                                 int num;
812                                 in >> num;
813                         
814                                 vector<float> exp;  float w;
815                                 for (int j = 0; j < num; j++) {
816                                         in >> w;
817                                         exp.push_back(w);
818                                 }
819                         
820                                 expectedDistance[count] = exp;
821                                 count++;
822                                 gobble(in);
823                         }
824
825                         gobble(in);
826                         
827                         count = lines[i]->start;
828                         for (int m = 0; m < size; m++) {
829                                 float num;
830                                 in >> num;
831                                 
832                                 DE[count] = num;
833                                 count++;
834                         }
835                         
836                         gobble(in);
837                         
838                         count = lines[i]->start;
839                         for (int m = 0; m < size; m++) {
840                                 float num;
841                                 in >> num;
842                                 
843                                 deviation[count] = num;
844                                 count++;
845                         }
846
847                         in.close();
848                         remove(s.c_str());
849                 }
850
851                                 
852 #else
853                         mothurOut("Calculating observed distance... "); cout.flush();
854                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
855                                 vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windows[i], windowSizes[i]);
856                                 obsDistance[i] = obsi;
857                         }
858                         mothurOut("Done."); mothurOutEndLine();
859                         
860                         
861                         
862                         mothurOut("Finding variability... "); cout.flush();
863                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
864                                 vector<float> q = decalc->findQav(windows[i], windowSizes[i], probabilityProfile, h[i]);
865                                 Qav[i] = q;
866                         }
867                         mothurOut("Done."); mothurOutEndLine();
868                         
869                         
870                         
871                         mothurOut("Calculating alpha... "); cout.flush();
872                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
873                                 float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
874                                 seqCoef.push_back(alpha);
875                         }
876                         mothurOut("Done."); mothurOutEndLine();
877                 
878                 
879                 
880                         mothurOut("Calculating expected distance... "); cout.flush();
881                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
882                                 vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
883                                 expectedDistance[i] = exp;
884                         }
885                         mothurOut("Done."); mothurOutEndLine();
886                         
887                         
888                         
889                         mothurOut("Finding deviation... "); cout.flush();
890                         for (int i = lines[0]->start; i < lines[0]->end; i++) {
891                                 float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
892                                 DE[i] = de;
893                                 
894                                 it = trimmed[i].begin();
895                                 float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
896                                 deviation[i] = dist;
897                         }
898                         mothurOut("Done."); mothurOutEndLine();
899
900 #endif          
901         }
902         catch(exception& e) {
903                 errorOut(e, "Pintail", "createProcesses");
904                 exit(1);
905         }
906 }
907
908
909 /**************************************************************************************************/
910
911 void Pintail::createProcessesQuan() {
912         try {
913 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
914                 int process = 0;
915                 vector<int> processIDS;
916                                 
917                 //loop through and create all the processes you want
918                 while (process != processors) {
919                         int pid = fork();
920                         
921                         if (pid > 0) {
922                                 processIDS.push_back(pid);  
923                                 process++;
924                         }else if (pid == 0){
925                                 
926                                 quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
927                                 
928                                 //write out data to file so parent can read it
929                                 ofstream out;
930                                 string s = toString(getpid()) + ".temp";
931                                 openOutputFile(s, out);
932                                 
933                                                                 
934                                 //output observed distances
935                                 for (int i = 0; i < quantiles.size(); i++) {
936                                         out << quantiles[i].size() << '\t';
937                                         for (int j = 0; j < quantiles[i].size(); j++) {
938                                                 out << quantiles[i][j] << '\t';
939                                         }
940                                         out << endl;
941                                 }
942                                 
943                                 out.close();
944                                 
945                                 exit(0);
946                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
947                 }
948                 
949                 //force parent to wait until all the processes are done
950                 for (int i=0;i<processors;i++) { 
951                         int temp = processIDS[i];
952                         wait(&temp);
953                 }
954                 
955                 //get data created by processes
956                 for (int i=0;i<processors;i++) { 
957                         ifstream in;
958                         string s = toString(processIDS[i]) + ".temp";
959                         openInputFile(s, in);
960                         
961                         vector< vector<float> > quan; quan.resize(100);
962                         
963                         //get quantiles
964                         for (int m = 0; m < quan.size(); m++) {
965                                 int num;
966                                 in >> num;
967                         
968                                 vector<float> q;  float w;
969                                 for (int j = 0; j < num; j++) {
970                                         in >> w;
971                                         q.push_back(w);
972                                 }
973                         
974                                 quan[m] = q;
975                                 gobble(in);
976                         }
977                         
978                         
979                         //save quan in quantiles
980                         for (int i = 0; i < quan.size(); i++) {
981                                 //put all values of q[i] into quan[i]
982                                 quantiles[i].insert(quantiles[i].begin(), quan[i].begin(), quan[i].end());
983                         }
984                                         
985                         in.close();
986                         remove(s.c_str());
987                 }
988                 
989 #else
990                 quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
991 #endif          
992         }
993         catch(exception& e) {
994                 errorOut(e, "Pintail", "createProcessesQuan");
995                 exit(1);
996         }
997 }
998
999
1000 //***************************************************************************************************************
1001
1002