]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
started adding chimeraslayer method and fixed minor bug in treegroupscommand deconstr...
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
17 //***************************************************************************************************************
18
19 Ccode::~Ccode() {
20         try {
21                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
22                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
23         }
24         catch(exception& e) {
25                 errorOut(e, "Ccode", "~Ccode");
26                 exit(1);
27         }
28 }       
29 //***************************************************************************************************************
30 void Ccode::print(ostream& out) {
31         try {
32                 
33                 mothurOutEndLine();
34                 
35                 string mapInfo = getRootName(fastafile) + "mapinfo";
36                 ofstream out2;
37                 openOutputFile(mapInfo, out2);
38                 
39                 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
40                 
41                 for (int j = 0; j < querySeqs.size(); j++) {
42                         out2 << querySeqs[j]->getName() << endl;
43                         for (it = spotMap[j].begin(); it!= spotMap[j].end(); it++) {
44                                 out2 << it->first << '\t' << it->second << endl;
45                         }
46                 }
47                 out2.close();
48                 
49                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
50                 
51                 out << "For full window mapping info refer to " << mapInfo << endl << endl;
52                 
53                 for (int i = 0; i < querySeqs.size(); i++) {
54                 
55                         out << querySeqs[i]->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
56                         
57                         for (int j = 0; j < closest[i].size(); j++) {
58                                 out << setprecision(3) << closest[i][j].seq->getName() << '\t' << closest[i][j].dist << endl;
59                         }
60                         out << endl << endl;
61                 
62                         //for each window
63                         //window mapping info.
64                         out << "Mapping information: ";
65                         //you mask and did not filter
66                         if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
67                                 
68                         //you filtered and did not mask
69                         if ((seqMask == "") && (filter)) { out << "filter and trim."; }
70                                 
71                         //you masked and filtered
72                         if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
73                         
74                         out << endl << "Window\tStartPos\tEndPos" << endl;
75                         it = trim[i].begin();
76                         
77                         for (int k = 0; k < windows[i].size()-1; k++) {
78                                 out << k+1 << '\t' << spotMap[i][windows[i][k]-it->first] << '\t' << spotMap[i][windows[i][k]-it->first+windowSizes[i]] << endl;
79                         }
80                         
81                         out << windows[i].size() << '\t' << spotMap[i][windows[i][windows[i].size()-1]-it->first] << '\t' << spotMap[i][it->second-it->first-1] << endl;
82                         out << endl;
83                         
84                         out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
85                         for (int k = 0; k < windows[i].size(); k++) {
86                                 float ds = averageQuery[i][k] / averageRef[i][k]; 
87                                 out << k+1 << '\t' << averageQuery[i][k] << '\t' << sdQuery[i][k] << '\t' << averageRef[i][k] << '\t'<< sdRef[i][k] << '\t' << ds << '\t' << anova[i][k] << endl;
88                         }
89                         out << endl;
90                         
91                         //varRef
92                         //varQuery
93                         /* F test for differences among variances.
94                         * varQuery is expected to be higher or similar than varRef */
95                         //float fs = varQuery[query][i] / varRef[query][i];     /* F-Snedecor, test for differences of variances */
96                         
97                         bool results = false;   
98                                                 
99                         //confidence limit, t - Student, anova
100                         out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
101                         
102                         for (int k = 0; k < windows[i].size(); k++) {
103                                 string temp = "";
104                                 if (isChimericConfidence[i][k]) {  temp += "*\t"; }
105                                 else { temp += "\t"; }
106                                 
107                                 if (isChimericTStudent[i][k]) {  temp += "*\t"; }
108                                 else { temp += "\t"; }
109                                 
110                                 if (isChimericANOVA[i][k]) {  temp += "*\t"; }
111                                 else { temp += "\t"; }
112                         
113                                 out << k+1 << '\t' << temp << endl;
114                                 
115                                 if (temp == "*\t*\t*\t") {  results = true;  }
116                         }
117                         out << endl;    
118                         
119                         if (results) {
120                                 mothurOut(querySeqs[i]->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
121                         }
122                 }
123         }
124         catch(exception& e) {
125                 errorOut(e, "Ccode", "print");
126                 exit(1);
127         }
128 }
129
130 //***************************************************************************************************************
131 void Ccode::getChimeras() {
132         try {
133                 
134                 //read in query sequences and subject sequences
135                 mothurOut("Reading sequences and template file... "); cout.flush();
136                 querySeqs = readSeqs(fastafile);
137                 templateSeqs = readSeqs(templateFile);
138                 mothurOut("Done."); mothurOutEndLine();
139                 
140                 int numSeqs = querySeqs.size();
141                 
142                 closest.resize(numSeqs);
143                 
144                 refCombo.resize(numSeqs, 0);
145                 sumRef.resize(numSeqs); 
146                 varRef.resize(numSeqs); 
147                 varQuery.resize(numSeqs); 
148                 sdRef.resize(numSeqs); 
149                 sdQuery.resize(numSeqs);     
150                 sumQuery.resize(numSeqs);
151                 sumSquaredRef.resize(numSeqs); 
152                 sumSquaredQuery.resize(numSeqs); 
153                 averageRef.resize(numSeqs);
154                 averageQuery.resize(numSeqs);
155                 anova.resize(numSeqs);
156                 isChimericConfidence.resize(numSeqs);
157                 isChimericTStudent.resize(numSeqs);
158                 isChimericANOVA.resize(numSeqs);
159                 trim.resize(numSeqs);
160                 spotMap.resize(numSeqs);
161                 windowSizes.resize(numSeqs, window);
162                 windows.resize(numSeqs);
163                 
164                 //break up file if needed
165                 int linesPerProcess = numSeqs / processors ;
166                 
167                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
168                         //find breakup of sequences for all times we will Parallelize
169                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
170                         else {
171                                 //fill line pairs
172                                 for (int i = 0; i < (processors-1); i++) {                      
173                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
174                                 }
175                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
176                                 int i = processors - 1;
177                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
178                         }
179                         
180                 #else
181                         lines.push_back(new linePair(0, numSeqs));
182                 #endif
183         
184                 distCalc = new eachGapDist();
185                 decalc = new DeCalculator();
186                 
187                 //find closest
188                 if (processors == 1) { 
189                         mothurOut("Finding top matches for sequences... "); cout.flush();
190                         closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
191                         mothurOut("Done."); mothurOutEndLine();
192                 }else {         createProcessesClosest();               }
193
194                 //initialize spotMap
195                 for (int j = 0; j < numSeqs; j++) {
196                         for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) {
197                                 spotMap[j][i] = i;
198                         }
199                 }
200                 
201                 //mask sequences if the user wants to 
202                 if (seqMask != "") {
203                         decalc->setMask(seqMask);
204                         
205                         //mask querys
206                         for (int i = 0; i < querySeqs.size(); i++) {
207                                 decalc->runMask(querySeqs[i]);
208                         }
209                 
210                         //mask templates
211                         for (int i = 0; i < templateSeqs.size(); i++) {
212                                 decalc->runMask(templateSeqs[i]);
213                         }
214                         
215                         for (int i = 0; i < numSeqs; i++) {
216                                 spotMap[i] = decalc->getMaskMap();
217                         }
218                 }
219                 
220                 if (filter) {
221                         vector<Sequence*> temp = templateSeqs;
222                         for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
223                         
224                         createFilter(temp);
225                 
226                         runFilter(querySeqs);
227                         runFilter(templateSeqs);
228                         
229                         //update spotMap
230                         map<int, int> newMap;
231                         int spot = 0;
232                         int j = 0;
233                         
234                         for (int i = 0; i < filterString.length(); i++) {
235                                 if (filterString[i] == '1') {
236                                         //add to newMap
237                                         newMap[spot] = spotMap[j][i];
238                                         spot++;  
239                                 }
240                         }
241                         
242                         for (int i = 0; i < numSeqs; i++) {
243                                 spotMap[i] = newMap;
244                         }
245                 }
246
247                 //trim sequences - this follows ccodes remove_extra_gaps 
248                 for (int i = 0; i < querySeqs.size(); i++) {
249                         trimSequences(i);
250                 }
251                 
252                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
253                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
254                 for (int i = 0; i < querySeqs.size(); i++) {
255                         windows[i] = findWindows(i);  
256                 }
257
258                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later 
259                 if (processors == 1) { 
260                         for (int i = 0; i < closest.size(); i++) {
261                                 removeBadReferenceSeqs(closest[i], i);
262                         }
263                 }else {         createProcessesRemoveBad();             }
264
265                 
266                 if (processors == 1) { 
267                         //find the averages for each querys references
268                         for (int i = 0; i < numSeqs; i++) {
269                                 getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
270                         }
271                         
272                         //find the averages for the query 
273                         for (int i = 0; i < numSeqs; i++) {
274                                 getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
275                         }
276                 }else {         createProcessesAverages();              }
277                 
278                 if (processors == 1) { 
279                         //find the averages for each querys references 
280                         for (int i = 0; i < numSeqs; i++) {
281                                 findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
282                         }
283                         
284                         //find the averages for the query 
285                         for (int i = 0; i < numSeqs; i++) {
286                                 findVarianceQuery(i);  //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
287                         }
288                 }else {         createProcessesVariances();             }
289                 
290                 if (processors == 1) { 
291                         for (int i = 0; i < numSeqs; i++) {
292                                 determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. 
293                         }
294                 }else {         createProcessesDetermine();             }
295                 
296                 //free memory
297                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
298                 delete distCalc;
299                 delete decalc;
300                         
301         }
302         catch(exception& e) {
303                 errorOut(e, "Ccode", "getChimeras");
304                 exit(1);
305         }
306 }
307 /***************************************************************************************************************/
308 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
309 void Ccode::trimSequences(int query) {
310         try {
311                 
312                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
313                 int rearPos = querySeqs[query]->getAligned().length();
314                 
315                 //********find first position in closest seqs that is a non gap character***********//
316                 //find first position all query seqs that is a non gap character
317                 for (int i = 0; i < closest[query].size(); i++) {
318                         
319                         string aligned = closest[query][i].seq->getAligned();
320                         int pos = 0;
321                         
322                         //find first spot in this seq
323                         for (int j = 0; j < aligned.length(); j++) {
324                                 if (isalpha(aligned[j])) {
325                                         pos = j;
326                                         break;
327                                 }
328                         }
329                         
330                         //save this spot if it is the farthest
331                         if (pos > frontPos) { frontPos = pos; }
332                 }
333                 
334                 //find first position all querySeq[query] that is a non gap character
335                 string aligned = querySeqs[query]->getAligned();
336                 int pos = 0;
337                         
338                 //find first spot in this seq
339                 for (int j = 0; j < aligned.length(); j++) {
340                         if (isalpha(aligned[j])) {
341                                 pos = j;
342                                 break;
343                         }
344                 }
345                 
346                 //save this spot if it is the farthest
347                 if (pos > frontPos) { frontPos = pos; }
348                 
349                 
350                 //********find last position in closest seqs that is a non gap character***********//
351                 for (int i = 0; i < closest[query].size(); i++) {
352                         
353                         string aligned = closest[query][i].seq->getAligned();
354                         int pos = aligned.length();
355                         
356                         //find first spot in this seq
357                         for (int j = aligned.length()-1; j >= 0; j--) {
358                                 if (isalpha(aligned[j])) {
359                                         pos = j;
360                                         break;
361                                 }
362                         }
363                         
364                         //save this spot if it is the farthest
365                         if (pos < rearPos) { rearPos = pos; }
366                 }
367                 
368                 //find last position all querySeqs[query] that is a non gap character
369                 aligned = querySeqs[query]->getAligned();
370                 pos = aligned.length();
371                 
372                 //find first spot in this seq
373                 for (int j = aligned.length()-1; j >= 0; j--) {
374                         if (isalpha(aligned[j])) {
375                                 pos = j;
376                                 break;
377                         }
378                 }
379                 
380                 //save this spot if it is the farthest
381                 if (pos < rearPos) { rearPos = pos; }
382
383                 
384                 //check to make sure that is not whole seq
385                 if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
386                 
387                 map<int, int> tempTrim;
388                 tempTrim[frontPos] = rearPos;
389                 
390                 //save trimmed locations
391                 trim[query] = tempTrim;
392                                                 
393                 //update spotMask
394                 map<int, int> newMap;
395                 int spot = 0;
396                 
397                 for (int i = frontPos; i < rearPos; i++) {
398                         //add to newMap
399 //cout << query << '\t' << i << '\t' << spotMap[query][i] << endl;
400                         newMap[spot] = spotMap[query][i];
401                         spot++;  
402                 }
403                 
404                 //for (it = newMap.begin(); it!=newMap.end(); it++) {
405                         //cout << query << '\t' << it->first << '\t' << it->second << endl;
406                 //}
407                 
408                 spotMap[query] = newMap;
409
410                 
411         }
412         catch(exception& e) {
413                 errorOut(e, "Ccode", "trimSequences");
414                 exit(1);
415         }
416
417 }
418 /***************************************************************************************************************/
419 vector<int> Ccode::findWindows(int query) {
420         try {
421                 
422                 vector<int> win; 
423                 it = trim[query].begin();
424                 
425                 int length = it->second - it->first;
426                 
427                 //default is wanted = 10% of total length
428                 if (windowSizes[query] > length) { 
429                         mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
430                         windowSizes[query] = length / 10;
431                 }else if (windowSizes[query] == 0) { windowSizes[query] = length / 10;  }
432                 else if (windowSizes[query] > (length / 20)) {
433                         mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
434                 }else if (windowSizes[query] < (length / 5)) {
435                         mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
436                 }
437                 
438                 //save starting points of each window
439                 for (int m = it->first;  m < (it->second-windowSizes[query]); m+=windowSizes[query]) {  win.push_back(m);  }
440                 
441                 //save last window
442                 if (win[win.size()-1] < (it->first+length)) {
443                         win.push_back(win[win.size()-1]+windowSizes[query]); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
444                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
445                 
446
447                 return win;
448         
449         }
450         catch(exception& e) {
451                 errorOut(e, "Ccode", "findWindows");
452                 exit(1);
453         }
454 }
455 //***************************************************************************************************************
456 int Ccode::getDiff(string seqA, string seqB) {
457         try {
458                 
459                 int numDiff = 0;
460                 
461                 for (int i = 0; i < seqA.length(); i++) {
462                         //if you are both not gaps
463                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
464                                 //are you different
465                                 if (seqA[i] != seqB[i]) { 
466                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
467
468                                         /* the char in base_a and base_b have been checked and they are different */
469                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
470                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
471                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
472                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
473                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
474                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
475                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
476                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
477                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
478                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
479                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
480                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
481                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
482                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
483                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
484                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
485                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
486                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
487                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
488                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
489                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
490                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
491                                         else ok = 0;  /* the bases are different and not equivalent */
492                                         
493                                         //check if they are both blanks
494                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
495                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
496                                         
497                                         if (ok == 0) {  numDiff++;  }
498                                 }
499                         //}
500                 }
501                 
502                 return numDiff;
503         
504         }
505         catch(exception& e) {
506                 errorOut(e, "Ccode", "getDiff");
507                 exit(1);
508         }
509 }
510 //***************************************************************************************************************
511 //tried to make this look most like ccode original implementation
512 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs, int query) {
513         try {
514                 
515                 vector< vector<int> > numDiffBases;
516                 numDiffBases.resize(seqs.size());
517                 //initialize to 0
518                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
519                 
520                 it = trim[query].begin();
521                 int length = it->second - it->first;
522                 
523                 //calc differences from each sequence to everyother seq in the set
524                 for (int i = 0; i < seqs.size(); i++) {
525                         
526                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
527                         
528                         //so you don't calc i to j and j to i since they are the same
529                         for (int j = 0; j < i; j++) {
530                                 
531                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
532                                 
533                                 //compare strings
534                                 int numDiff = getDiff(seqA, seqB);
535                                 
536                                 numDiffBases[i][j] = numDiff;
537                                 numDiffBases[j][i] = numDiff;
538                         }
539                 }
540                 
541                 //initailize remove to 0
542                 vector<int> remove;  remove.resize(seqs.size(), 0);
543                 float top = ((20*length) / (float) 100);
544                 float bottom = ((0.5*length) / (float) 100);
545                 
546                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
547                 for (int i = 0; i < numDiffBases.size(); i++) {
548                         for (int j = 0; j < i; j++) {   
549                                 //are you more than 20% different
550                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
551                                 //are you less than 0.5% different
552                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
553                         }
554                 }
555                 
556                 int numSeqsLeft = 0;
557                 
558                 //count seqs that are not going to be removed
559                 for (int i = 0; i < remove.size(); i++) {  
560                         if (remove[i] == 0)  { numSeqsLeft++;  }
561                 }
562                 
563                 //if you have enough then remove bad ones
564                 if (numSeqsLeft >= 3) {
565                         vector<SeqDist> goodSeqs;
566                         //remove bad seqs
567                         for (int i = 0; i < remove.size(); i++) {
568                                 if (remove[i] == 0) { 
569                                         goodSeqs.push_back(seqs[i]);
570                                 }
571                         }
572                         
573                         seqs = goodSeqs;
574                         
575                 }else { //warn, but dont remove any
576                         mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); mothurOutEndLine();  
577                 }
578                         
579
580         }
581         catch(exception& e) {
582                 errorOut(e, "Ccode", "removeBadReferenceSeqs");
583                 exit(1);
584         }
585 }
586 //***************************************************************************************************************
587 vector< vector<SeqDist> > Ccode::findClosest(int start, int end, int numWanted) {
588         try{
589         
590                 vector< vector<SeqDist> > topMatches;  topMatches.resize(querySeqs.size());
591         
592                 float smallestOverall, smallestLeft, smallestRight;
593                 smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
594                 
595                 //for each sequence in querySeqs - find top matches to use as reference
596                 for(int j = start; j < end; j++){
597                         
598                         Sequence query = *(querySeqs[j]);
599                         
600                         vector<SeqDist> distances;
601                         
602                         //calc distance to each sequence in template seqs
603                         for (int i = 0; i < templateSeqs.size(); i++) {
604                         
605                                 Sequence ref = *(templateSeqs[i]); 
606                                         
607                                 //find overall dist
608                                 distCalc->calcDist(query, ref);
609                                 float dist = distCalc->getDist();       
610                                 
611                                 //save distance
612                                 SeqDist temp;
613                                 temp.seq = templateSeqs[i];
614                                 temp.dist = dist;
615
616                                 distances.push_back(temp);
617                         }
618                         
619                         sort(distances.begin(), distances.end(), compareSeqDist);
620                         
621                         //save the number of top matches wanted
622                         for (int h = 0; h < numWanted; h++) {
623                                 topMatches[j].push_back(distances[h]);
624                         }
625                 }
626                         
627                 return topMatches;
628
629         }
630         catch(exception& e) {
631                 errorOut(e, "Ccode", "findClosestSides");
632                 exit(1);
633         }
634 }
635 /**************************************************************************************************/
636 //find the distances from each reference sequence to every other reference sequence for each window for this query
637 void Ccode::getAverageRef(vector<SeqDist> ref, int query) {
638         try {
639                 
640                 vector< vector< vector<int> > > diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
641                 
642                 //initialize diffs vector
643                 diffs.resize(ref.size());
644                 for (int i = 0; i < diffs.size(); i++) {  
645                         diffs[i].resize(ref.size());
646                         for (int j = 0; j < diffs[i].size(); j++) {
647                                 diffs[i][j].resize(windows[query].size(), 0);
648                         }
649                 }
650                 
651                 it = trim[query].begin();
652                                 
653                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
654                 for (int i = 0; i < ref.size(); i++) {
655                         
656                         string refI = ref[i].seq->getAligned();
657                         
658                         //j<i, so you don't find distances from i to j and then j to i.
659                         for (int j = 0; j < i; j++) {
660                         
661                                 string refJ = ref[j].seq->getAligned();
662                         
663                                 for (int k = 0; k < windows[query].size(); k++) {
664                                         
665                                         string refIWindowk, refJWindowk;
666                                         
667                                         if (k < windows[query].size()-1) {
668                                                 //get window strings
669                                                 refIWindowk = refI.substr(windows[query][k], windowSizes[query]);
670                                                 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);
671                                         }else { //last window may be smaller than rest - see findwindows
672                                                 //get window strings
673                                                 refIWindowk = refI.substr(windows[query][k], (it->second-windows[query][k]));
674                                                 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
675                                         }
676                                         
677                                         //find differences
678                                         int diff = getDiff(refIWindowk, refJWindowk);
679 //cout <<  i << '\t' << j << '\t' << k << '\t' << diff << endl;
680                                         //save differences in [i][j][k] and [j][i][k] since they are the same
681                                         diffs[i][j][k] = diff;
682                                         diffs[j][i][k] = diff;
683
684                                 }//k
685                                 
686                         }//j
687                 
688                 }//i
689                 
690                 //initialize sumRef for this query 
691                 sumRef[query].resize(windows[query].size(), 0);
692                 sumSquaredRef[query].resize(windows[query].size(), 0);
693                 averageRef[query].resize(windows[query].size(), 0);
694                 
695                 //find the sum of the differences for hte reference sequences
696                 for (int i = 0; i < diffs.size(); i++) {
697                         for (int j = 0; j < i; j++) {
698                         
699                                 //increment this querys reference sequences combos
700                                 refCombo[query]++;
701                                 
702                                 for (int k = 0; k < diffs[i][j].size(); k++) {
703                                         sumRef[query][k] += diffs[i][j][k];
704                                         sumSquaredRef[query][k] += (diffs[i][j][k]*diffs[i][j][k]);
705                                 }//k
706                                 
707                         }//j
708                 }//i
709
710                 
711                 //find the average of the differences for the references for each window
712                 for (int i = 0; i < windows[query].size(); i++) {
713                         averageRef[query][i] = sumRef[query][i] / (float) refCombo[query];
714                 }
715                 
716         }
717         catch(exception& e) {
718                 errorOut(e, "Ccode", "getAverageRef");
719                 exit(1);
720         }
721 }
722 /**************************************************************************************************/
723 void Ccode::getAverageQuery (vector<SeqDist> ref, int query) {
724         try {
725         
726                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
727                 
728                 //initialize diffs vector
729                 diffs.resize(ref.size());
730                 for (int j = 0; j < diffs.size(); j++) {
731                         diffs[j].resize(windows[query].size(), 0);
732                 }
733                 
734                 it = trim[query].begin();
735                                                         
736                 string refQuery = querySeqs[query]->getAligned();
737                          
738                 //j<i, so you don't find distances from i to j and then j to i.
739                 for (int j = 0; j < ref.size(); j++) {
740                          
741                          string refJ = ref[j].seq->getAligned();
742                          
743                          for (int k = 0; k < windows[query].size(); k++) {
744                                         
745                                         string QueryWindowk, refJWindowk;
746                                         
747                                         if (k < windows[query].size()-1) {
748                                                 //get window strings
749                                                 QueryWindowk = refQuery.substr(windows[query][k], windowSizes[query]);
750                                                 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);                                       
751                                         }else { //last window may be smaller than rest - see findwindows
752                                                 //get window strings
753                                                 QueryWindowk = refQuery.substr(windows[query][k], (it->second-windows[query][k]));
754                                                 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
755                                         }
756                                         
757                                         //find differences
758                                         int diff = getDiff(QueryWindowk, refJWindowk);
759 //cout  << j << '\t' << k << '\t' << diff << endl;                                              
760                                         //save differences 
761                                         diffs[j][k] = diff;
762                          
763                          }//k
764                 }//j
765                          
766                 
767                 //initialize sumRef for this query 
768                 sumQuery[query].resize(windows[query].size(), 0);
769                 sumSquaredQuery[query].resize(windows[query].size(), 0);
770                 averageQuery[query].resize(windows[query].size(), 0);
771                 
772                 //find the sum of the differences 
773                 for (int j = 0; j < diffs.size(); j++) {
774                         for (int k = 0; k < diffs[j].size(); k++) {
775                                 sumQuery[query][k] += diffs[j][k];
776                                 sumSquaredQuery[query][k] += (diffs[j][k]*diffs[j][k]);
777                         }//k
778                 }//j
779                 
780                 
781                 //find the average of the differences for the references for each window
782                 for (int i = 0; i < windows[query].size(); i++) {
783                         averageQuery[query][i] = sumQuery[query][i] / (float) ref.size();
784                 }
785
786         
787         }
788         catch(exception& e) {
789                 errorOut(e, "Ccode", "getAverageQuery");
790                 exit(1);
791         }
792 }
793 /**************************************************************************************************/
794 void Ccode::findVarianceRef (int query) {
795         try {
796                 
797                 varRef[query].resize(windows[query].size(), 0);
798                 sdRef[query].resize(windows[query].size(), 0);
799                 
800                 //for each window
801                 for (int i = 0; i < windows[query].size(); i++) {
802                         varRef[query][i] = (sumSquaredRef[query][i] - ((sumRef[query][i]*sumRef[query][i])/(float)refCombo[query])) / (float)(refCombo[query]-1);
803                         sdRef[query][i] = sqrt(varRef[query][i]);
804                         
805                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
806                         if (averageRef[query][i] < 0.001)                       {       averageRef[query][i] = 0.001;           }
807                         if (sumRef[query][i] < 0.001)                           {       sumRef[query][i] = 0.001;                       }
808                         if (varRef[query][i] < 0.001)                           {       varRef[query][i] = 0.001;                       }
809                         if (sumSquaredRef[query][i] < 0.001)            {       sumSquaredRef[query][i] = 0.001;        }
810                         if (sdRef[query][i] < 0.001)                            {       sdRef[query][i] = 0.001;                        }
811                                 
812                 }
813         }
814         catch(exception& e) {
815                 errorOut(e, "Ccode", "findVarianceRef");
816                 exit(1);
817         }
818 }
819 /**************************************************************************************************/
820 void Ccode::findVarianceQuery (int query) {
821         try {
822                 varQuery[query].resize(windows[query].size(), 0);
823                 sdQuery[query].resize(windows[query].size(), 0);
824                 
825                 //for each window
826                 for (int i = 0; i < windows[query].size(); i++) {
827                         varQuery[query][i] = (sumSquaredQuery[query][i] - ((sumQuery[query][i]*sumQuery[query][i])/(float) closest[query].size())) / (float) (closest[query].size()-1);
828                         sdQuery[query][i] = sqrt(varQuery[query][i]);
829                         
830                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
831                         if (averageQuery[query][i] < 0.001)                     {       averageQuery[query][i] = 0.001;         }
832                         if (sumQuery[query][i] < 0.001)                         {       sumQuery[query][i] = 0.001;                     }
833                         if (varQuery[query][i] < 0.001)                         {       varQuery[query][i] = 0.001;                     }
834                         if (sumSquaredQuery[query][i] < 0.001)          {       sumSquaredQuery[query][i] = 0.001;      }
835                         if (sdQuery[query][i] < 0.001)                          {       sdQuery[query][i] = 0.001;                      }
836                 }
837
838         }
839         catch(exception& e) {
840                 errorOut(e, "Ccode", "findVarianceQuery");
841                 exit(1);
842         }
843 }
844 /**************************************************************************************************/
845 void Ccode::determineChimeras (int query) {
846         try {
847                 
848                 isChimericConfidence[query].resize(windows[query].size(), false);
849                 isChimericTStudent[query].resize(windows[query].size(), false);
850                 isChimericANOVA[query].resize(windows[query].size(), false);
851                 anova[query].resize(windows[query].size());
852
853                 
854                 //for each window
855                 for (int i = 0; i < windows[query].size(); i++) {
856                 
857                         //get confidence limits
858                         float t = getT(closest[query].size()-1);  //how many seqs you are comparing to this querySeq
859                         float dsUpper = (averageQuery[query][i] + (t * sdQuery[query][i])) / averageRef[query][i]; 
860                         float dsLower = (averageQuery[query][i] - (t * sdQuery[query][i])) / averageRef[query][i]; 
861 //cout << t << '\t' << "ds upper = " << dsUpper << " dsLower = " << dsLower << endl;                    
862                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[query][i] > averageRef[query][i])) {  /* range does not include 1 */
863                                         isChimericConfidence[query][i] = true;   /* significantly higher at P<0.05 */ 
864 //cout << i << " made it here" << endl;
865                         }
866                         
867                         //student t test
868                         int degreeOfFreedom = refCombo[query] + closest[query].size() - 2;
869                         float denomForT = (((refCombo[query]-1) * varQuery[query][i] + (closest[query].size() - 1) * varRef[query][i]) / (float) degreeOfFreedom) * ((refCombo[query] + closest[query].size()) / (float) (refCombo[query] * closest[query].size()));    /* denominator, without sqrt(), for ts calculations */
870                                 
871                         float ts = fabs((averageQuery[query][i] - averageRef[query][i]) / (sqrt(denomForT)));  /* value of ts for t-student test */     
872                         t = getT(degreeOfFreedom);
873 //cout << i << '\t' << t << '\t' << ts << endl;                 
874                         if ((ts >= t) && (averageQuery[query][i] > averageRef[query][i])) {   
875                                 isChimericTStudent[query][i] = true;   /* significantly higher at P<0.05 */ 
876                         }
877                 
878                         //anova test
879                         float value1 = sumQuery[query][i] + sumRef[query][i];
880                         float value2 = sumSquaredQuery[query][i] + sumSquaredRef[query][i];
881                         float value3 = ((sumQuery[query][i]*sumQuery[query][i]) / (float) (closest[query].size())) + ((sumRef[query][i] * sumRef[query][i]) / (float) refCombo[query]);
882                         float value4 = (value1 * value1) / ( (float) (closest[query].size() + refCombo[query]) );
883                         float value5 = value2 - value4;
884                         float value6 = value3 - value4;
885                         float value7 = value5 - value6;
886                         float value8 = value7 / ((float) degreeOfFreedom);
887                         float anovaValue = value6 / value8;
888                         
889                         float f = getF(degreeOfFreedom);
890                         
891                          if ((anovaValue >= f) && (averageQuery[query][i] > averageRef[query][i]))  {
892                        isChimericANOVA[query][i] = true;   /* significant P<0.05 */
893                 }
894                         
895                         if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
896                         
897                         anova[query][i] = anovaValue;
898                 }
899                 
900         }
901         catch(exception& e) {
902                 errorOut(e, "Ccode", "determineChimeras");
903                 exit(1);
904         }
905 }
906 /**************************************************************************************************/    
907 float Ccode::getT(int numseq) {
908         try {
909         
910                 float tvalue = 0;
911                 
912                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
913                 if (numseq > 120) tvalue = 1.645;
914                 else if (numseq > 60) tvalue = 1.658;
915         else if (numseq > 40) tvalue = 1.671;
916         else if (numseq > 30) tvalue = 1.684;
917         else if (numseq > 29) tvalue = 1.697;
918         else if (numseq > 28) tvalue = 1.699;
919         else if (numseq > 27) tvalue = 1.701;
920         else if (numseq > 26) tvalue = 1.703;
921         else if (numseq > 25) tvalue = 1.706;
922         else if (numseq > 24) tvalue = 1.708;
923         else if (numseq > 23) tvalue = 1.711;
924         else if (numseq > 22) tvalue = 1.714;
925         else if (numseq > 21) tvalue = 1.717;
926         else if (numseq > 20) tvalue = 1.721;
927         else if (numseq > 19) tvalue = 1.725;
928         else if (numseq > 18) tvalue = 1.729;
929         else if (numseq > 17) tvalue = 1.734;
930         else if (numseq > 16) tvalue = 1.740;
931         else if (numseq > 15) tvalue = 1.746;
932         else if (numseq > 14) tvalue = 1.753;
933         else if (numseq > 13) tvalue = 1.761;
934         else if (numseq > 12) tvalue = 1.771;
935         else if (numseq > 11) tvalue = 1.782;
936         else if (numseq > 10) tvalue = 1.796;
937         else if (numseq > 9) tvalue = 1.812;
938         else if (numseq > 8) tvalue = 1.833;
939         else if (numseq > 7) tvalue = 1.860;
940         else if (numseq > 6) tvalue = 1.895;
941         else if (numseq > 5) tvalue = 1.943;
942         else if (numseq > 4) tvalue = 2.015;
943         else if (numseq > 3) tvalue = 2.132;
944         else if (numseq > 2) tvalue = 2.353;
945         else if (numseq > 1) tvalue = 2.920;
946                 else if (numseq <= 1) {
947                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
948                 }
949                 
950                 return tvalue;
951         }
952         catch(exception& e) {
953                 errorOut(e, "Ccode", "getT");
954                 exit(1);
955         }
956 }
957 /**************************************************************************************************/    
958 float Ccode::getF(int numseq) {
959         try {
960         
961                 float fvalue = 0;
962                 
963                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
964         if (numseq > 120) fvalue = 3.84;
965         else if (numseq > 60) fvalue = 3.92;
966         else if (numseq > 40) fvalue = 4.00;
967         else if (numseq > 30) fvalue = 4.08;
968         else if (numseq > 29) fvalue = 4.17;
969         else if (numseq > 28) fvalue = 4.18;
970         else if (numseq > 27) fvalue = 4.20;
971         else if (numseq > 26) fvalue = 4.21;
972         else if (numseq > 25) fvalue = 4.23;
973         else if (numseq > 24) fvalue = 4.24;
974         else if (numseq > 23) fvalue = 4.26;
975         else if (numseq > 22) fvalue = 4.28;
976         else if (numseq > 21) fvalue = 4.30;
977         else if (numseq > 20) fvalue = 4.32;
978         else if (numseq > 19) fvalue = 4.35;
979         else if (numseq > 18) fvalue = 4.38;
980         else if (numseq > 17) fvalue = 4.41;
981         else if (numseq > 16) fvalue = 4.45;
982         else if (numseq > 15) fvalue = 4.49;
983         else if (numseq > 14) fvalue = 4.54;
984         else if (numseq > 13) fvalue = 4.60;
985         else if (numseq > 12) fvalue = 4.67;
986         else if (numseq > 11) fvalue = 4.75;
987         else if (numseq > 10) fvalue = 4.84;
988         else if (numseq > 9) fvalue = 4.96;
989         else if (numseq > 8) fvalue = 5.12;
990         else if (numseq > 7) fvalue = 5.32;
991         else if (numseq > 6) fvalue = 5.59;
992         else if (numseq > 5) fvalue = 5.99;
993         else if (numseq > 4) fvalue = 6.61;
994         else if (numseq > 3) fvalue = 7.71;
995         else if (numseq > 2) fvalue = 10.1;
996         else if (numseq > 1) fvalue = 18.5;
997         else if (numseq > 0) fvalue = 161;
998                 else if (numseq <= 0) {
999                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
1000         }
1001                 
1002                 return fvalue;
1003         }
1004         catch(exception& e) {
1005                 errorOut(e, "Ccode", "getF");
1006                 exit(1);
1007         }
1008 }
1009
1010 /**************************************************************************************************/
1011 void Ccode::createProcessesClosest() {
1012         try {
1013 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1014                 int process = 0;
1015                 vector<int> processIDS;
1016                 
1017                 //loop through and create all the processes you want
1018                 while (process != processors) {
1019                         int pid = fork();
1020                         
1021                         if (pid > 0) {
1022                                 processIDS.push_back(pid);  
1023                                 process++;
1024                         }else if (pid == 0){
1025                                 
1026                                 mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1027                                 closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
1028                                 mothurOut("Done finding top matches for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1029                                 
1030                                 //write out data to file so parent can read it
1031                                 ofstream out;
1032                                 string s = toString(getpid()) + ".temp";
1033                                 openOutputFile(s, out);
1034                                                         
1035                                 //output pairs
1036                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1037                                          for (int j = 0; j < closest[i].size(); j++) {
1038                                                 closest[i][j].seq->printSequence(out);
1039                                          }
1040                                 }
1041                                 out << ">" << endl; //to stop sequence read
1042                                 
1043                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1044                                          for (int j = 0; j < closest[i].size(); j++) {
1045                                                 out << closest[i][j].dist << '\t';
1046                                          }
1047                                          out << endl;
1048                                 }
1049                                 out.close();
1050                                 exit(0);
1051                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1052                 }
1053                 
1054                 //force parent to wait until all the processes are done
1055                 for (int i=0;i<processors;i++) { 
1056                         int temp = processIDS[i];
1057                         wait(&temp);
1058                 }
1059                 
1060                 //get data created by processes
1061                 for (int i=0;i<processors;i++) { 
1062                         ifstream in;
1063                         string s = toString(processIDS[i]) + ".temp";
1064                         openInputFile(s, in);
1065                         
1066                         vector< vector<Sequence*> > tempClosest;  tempClosest.resize(querySeqs.size());
1067                         //get pairs
1068                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1069                                 vector<Sequence*> tempVector;
1070         
1071                                 for (int j = 0; j < numWanted; j++) {
1072                                 
1073                                         Sequence* temp = new Sequence(in);
1074                                         gobble(in);
1075                                                 
1076                                         tempVector.push_back(temp);
1077                                 }
1078                                 
1079                                 tempClosest[k] = tempVector;
1080                         }
1081                         
1082                         string junk;
1083                         in >> junk;  gobble(in);  // to get ">"
1084                 
1085                         vector< vector<float> > dists; dists.resize(querySeqs.size());
1086                         
1087                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1088                                 dists[k].resize(numWanted);
1089                                 for (int j = 0; j < numWanted; j++) {
1090                                         in >> dists[k][j];
1091                                 }
1092                                 gobble(in);
1093                         
1094                         } 
1095
1096                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1097                                 closest[k].resize(numWanted);
1098                                 for (int j = 0; j < closest[k].size(); j++) {
1099                                         closest[k][j].seq = tempClosest[k][j];
1100                                         closest[k][j].dist = dists[k][j]; 
1101                                 }
1102                         } 
1103
1104                         in.close();
1105                         remove(s.c_str());
1106                 }
1107                         
1108         
1109 #else
1110                 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
1111 #endif  
1112
1113         }
1114         catch(exception& e) {
1115                 errorOut(e, "Ccode", "createProcessesClosest");
1116                 exit(1);
1117         }
1118 }
1119
1120 //***************************************************************************************************************
1121 void Ccode::createProcessesRemoveBad() {
1122         try {
1123 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1124                 int process = 0;
1125                 vector<int> processIDS;
1126                 
1127                 //loop through and create all the processes you want
1128                 while (process != processors) {
1129                         int pid = fork();
1130                         
1131                         if (pid > 0) {
1132                                 processIDS.push_back(pid);  
1133                                 process++;
1134                         }else if (pid == 0){
1135                                                         
1136                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1137                                         removeBadReferenceSeqs(closest[i], i);
1138                                 }
1139                                 
1140                                 //write out data to file so parent can read it
1141                                 ofstream out;
1142                                 string s = toString(getpid()) + ".temp";
1143                                 openOutputFile(s, out);
1144                                 
1145                                 //output pairs
1146                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1147                                         out << closest[i].size() << endl;
1148                                          for (int j = 0; j < closest[i].size(); j++) {
1149                                                 closest[i][j].seq->printSequence(out);
1150                                          }
1151                                          out << ">" << endl; //to stop sequence read
1152                                 }
1153                                 
1154                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1155                                          for (int j = 0; j < closest[i].size(); j++) {
1156                                                 out << closest[i][j].dist << '\t';
1157                                          }
1158                                          out << endl;
1159                                 }
1160
1161                                 out.close();
1162                                 
1163                                 exit(0);
1164                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1165                 }
1166                 
1167                 //force parent to wait until all the processes are done
1168                 for (int i=0;i<processors;i++) { 
1169                         int temp = processIDS[i];
1170                         wait(&temp);
1171                 }
1172         
1173                 //get data created by processes
1174                 for (int i=0;i<processors;i++) { 
1175                         ifstream in;
1176                         string s = toString(processIDS[i]) + ".temp";
1177                         openInputFile(s, in);
1178                 
1179                         vector< vector<Sequence*> > tempClosest;  tempClosest.resize(querySeqs.size());
1180                         vector<int> sizes; 
1181                         //get pairs
1182                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1183                 
1184                                 int num;
1185                                 in >> num; 
1186                                 sizes.push_back(num);
1187                                 gobble(in);
1188                                 
1189                                 vector<Sequence*> tempVector;
1190                                 
1191                                 for (int j = 0; j < num; j++) {
1192                                 
1193                                         Sequence* temp = new Sequence(in);
1194                                         gobble(in);
1195                                                 
1196                                         tempVector.push_back(temp);
1197                                 }
1198                                 string junk;
1199                                 in >> junk;  gobble(in);  // to get ">"
1200
1201                                 tempClosest[k] = tempVector;
1202                         }
1203                         
1204                         vector< vector<float> > dists; dists.resize(querySeqs.size());
1205                         int count = 0;
1206                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1207                                 dists[k].resize(sizes[count]);
1208                                 for (int j = 0; j < sizes[count]; j++) {
1209                                         in >> dists[k][j];
1210                                 }
1211                                 gobble(in);
1212                                 count++;
1213                         } 
1214                         
1215                         count = 0;
1216                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1217                                 for (int j = 0; j < sizes[count]; j++) {
1218                                         closest[k][j].seq = tempClosest[k][j];
1219                                         closest[k][j].dist = dists[k][j]; 
1220                                 }
1221                                 count++;
1222                         } 
1223
1224                         in.close();
1225                         remove(s.c_str());
1226                 }
1227 #else
1228                 for (int i = 0; i < closest.size(); i++) {
1229                         removeBadReferenceSeqs(closest[i], i);
1230                 }
1231 #endif          
1232         
1233         }
1234         catch(exception& e) {
1235                 errorOut(e, "Ccode", "createProcessesRemoveBad");
1236                 exit(1);
1237         }
1238 }
1239 //***************************************************************************************************************
1240 void Ccode::createProcessesAverages() {
1241         try {
1242         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1243                 int process = 0;
1244                 vector<int> processIDS;
1245                 
1246                 //loop through and create all the processes you want
1247                 while (process != processors) {
1248                         int pid = fork();
1249                         
1250                         if (pid > 0) {
1251                                 processIDS.push_back(pid);  
1252                                 process++;
1253                         }else if (pid == 0){
1254                                                         
1255                                 //find the averages for each querys references 
1256                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1257                                         getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
1258                                 }
1259                                 
1260                                 //find the averages for the query 
1261                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1262                                         getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
1263                                 }
1264                                 
1265                                 
1266                                 //write out data to file so parent can read it
1267                                 ofstream out;
1268                                 string s = toString(getpid()) + ".temp";
1269                                 openOutputFile(s, out);
1270                                 
1271                                 //output pairs
1272                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1273                                         for (int j = 0; j < windows[i].size(); j++) {
1274                                                 out << sumRef[i][j] << '\t';
1275                                         }
1276                                         out << endl;
1277                                         for (int j = 0; j < windows[i].size(); j++) {
1278                                                 out << averageRef[i][j] << '\t';
1279                                         }
1280                                         out << endl;
1281                                         for (int j = 0; j < windows[i].size(); j++) {
1282                                                 out << sumSquaredRef[i][j] << '\t';
1283                                         }
1284                                         out << endl;
1285                                         for (int j = 0; j < windows[i].size(); j++) {
1286                                                 out << sumQuery[i][j] << '\t';
1287                                         }
1288                                         out << endl;
1289                                         for (int j = 0; j < windows[i].size(); j++) {
1290                                                 out << averageQuery[i][j] << '\t';
1291                                         }
1292                                         out << endl;
1293                                         for (int j = 0; j < windows[i].size(); j++) {
1294                                                 out << sumSquaredQuery[i][j] << '\t';
1295                                         }
1296                                         out << endl;
1297                                         out << refCombo[i] << endl;
1298                                 }
1299
1300                                 out.close();
1301                                 
1302                                 exit(0);
1303                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1304                 }
1305                 
1306                 //force parent to wait until all the processes are done
1307                 for (int i=0;i<processors;i++) { 
1308                         int temp = processIDS[i];
1309                         wait(&temp);
1310                 }
1311         
1312                 //get data created by processes
1313                 for (int i=0;i<processors;i++) { 
1314                         ifstream in;
1315                         string s = toString(processIDS[i]) + ".temp";
1316                         openInputFile(s, in);
1317                 
1318                         //get pairs
1319                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1320                                 sumRef[k].resize(windows[k].size());
1321                                 averageRef[k].resize(windows[k].size());
1322                                 sumSquaredRef[k].resize(windows[k].size());
1323                                 averageQuery[k].resize(windows[k].size());
1324                                 sumQuery[k].resize(windows[k].size());
1325                                 sumSquaredQuery[k].resize(windows[k].size());
1326                                 
1327                                 for (int j = 0; j < windows[k].size(); j++) {
1328                                         in >> sumRef[k][j];
1329                                 }
1330                                 gobble(in);
1331                                 for (int j = 0; j < windows[k].size(); j++) {
1332                                         in >> averageRef[k][j];
1333                                 }
1334                                 gobble(in);
1335                                 for (int j = 0; j < windows[k].size(); j++) {
1336                                         in >> sumSquaredRef[k][j];
1337                                 }
1338                                 gobble(in);
1339                                 for (int j = 0; j < windows[k].size(); j++) {
1340                                         in >> sumQuery[k][j];
1341                                 }
1342                                 gobble(in);
1343                                 for (int j = 0; j < windows[k].size(); j++) {
1344                                         in >> averageQuery[k][j];
1345                                 }
1346                                 gobble(in);
1347                                 for (int j = 0; j < windows[k].size(); j++) {
1348                                         in >> sumSquaredQuery[k][j];
1349                                 }
1350                                 gobble(in);
1351                                 in >> refCombo[k];
1352                                 gobble(in);
1353                         }
1354                         
1355                         in.close();
1356                         remove(s.c_str());
1357                 }
1358 #else
1359                 //find the averages for each querys references 
1360                 for (int i = 0; i < querySeqs.size(); i++) {
1361                         getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
1362                 }
1363         
1364                 //find the averages for the query 
1365                 for (int i = 0; i < querySeqs.size(); i++) {
1366                         getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
1367                 }
1368
1369 #endif          
1370         
1371         }
1372         catch(exception& e) {
1373                 errorOut(e, "Ccode", "createProcessesAverages");
1374                 exit(1);
1375         }
1376 }
1377 //***************************************************************************************************************
1378 void Ccode::createProcessesVariances() {
1379         try {
1380         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1381                 int process = 0;
1382                 vector<int> processIDS;
1383                 
1384                 //loop through and create all the processes you want
1385                 while (process != processors) {
1386                         int pid = fork();
1387                         
1388                         if (pid > 0) {
1389                                 processIDS.push_back(pid);  
1390                                 process++;
1391                         }else if (pid == 0){
1392                                                         
1393                                 //find the averages for each querys references 
1394                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1395                                         findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1396                                 }
1397                                 
1398                                 //find the averages for the query 
1399                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1400                                         findVarianceQuery(i);  //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1401                                 }
1402                                 
1403                                 
1404                                 //write out data to file so parent can read it
1405                                 ofstream out;
1406                                 string s = toString(getpid()) + ".temp";
1407                                 openOutputFile(s, out);
1408                                 
1409                                 //output pairs
1410                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1411                                         for (int j = 0; j < windows[i].size(); j++) {
1412                                                 out << varRef[i][j] << '\t';
1413                                         }
1414                                         out << endl;
1415                                         for (int j = 0; j < windows[i].size(); j++) {
1416                                                 out << sdRef[i][j] << '\t';
1417                                         }
1418                                         out << endl;
1419                                         for (int j = 0; j < windows[i].size(); j++) {
1420                                                 out << varQuery[i][j] << '\t';
1421                                         }
1422                                         out << endl;
1423                                         for (int j = 0; j < windows[i].size(); j++) {
1424                                                 out << sdQuery[i][j] << '\t';
1425                                         }
1426                                         out << endl;
1427                                 }
1428
1429                                 out.close();
1430                                 
1431                                 exit(0);
1432                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1433                 }
1434                 
1435                 //force parent to wait until all the processes are done
1436                 for (int i=0;i<processors;i++) { 
1437                         int temp = processIDS[i];
1438                         wait(&temp);
1439                 }
1440         
1441                 //get data created by processes
1442                 for (int i=0;i<processors;i++) { 
1443                         ifstream in;
1444                         string s = toString(processIDS[i]) + ".temp";
1445                         openInputFile(s, in);
1446                 
1447                         //get pairs
1448                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1449                                 varRef[k].resize(windows[k].size());
1450                                 sdRef[k].resize(windows[k].size());
1451                                 varQuery[k].resize(windows[k].size());
1452                                 sdQuery[k].resize(windows[k].size());
1453                                                                 
1454                                 for (int j = 0; j < windows[k].size(); j++) {
1455                                         in >> varRef[k][j];
1456                                 }
1457                                 gobble(in);
1458                                 for (int j = 0; j < windows[k].size(); j++) {
1459                                         in >> sdRef[k][j];
1460                                 }
1461                                 gobble(in);
1462                                 for (int j = 0; j < windows[k].size(); j++) {
1463                                         in >> varQuery[k][j];
1464                                 }
1465                                 gobble(in);
1466                                 for (int j = 0; j < windows[k].size(); j++) {
1467                                         in >> sdQuery[k][j];
1468                                 }
1469                                 gobble(in);
1470                         }
1471                         
1472                         in.close();
1473                         remove(s.c_str());
1474                 }
1475 #else
1476                         //find the averages for each querys references 
1477                         for (int i = 0; i < querySeqs.size(); i++) {
1478                                 findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1479                         }
1480                         
1481                         //find the averages for the query 
1482                         for (int i = 0; i < querySeqs.size(); i++) {
1483                                 findVarianceQuery(i);  //fills v arQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1484                         }
1485 #endif          
1486         }
1487         catch(exception& e) {
1488                 errorOut(e, "Ccode", "createProcessesVariances");
1489                 exit(1);
1490         }
1491 }
1492 //***************************************************************************************************************
1493 void Ccode::createProcessesDetermine() {
1494         try {
1495         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1496                 int process = 0;
1497                 vector<int> processIDS;
1498                 
1499                 //loop through and create all the processes you want
1500                 while (process != processors) {
1501                         int pid = fork();
1502                         
1503                         if (pid > 0) {
1504                                 processIDS.push_back(pid);  
1505                                 process++;
1506                         }else if (pid == 0){
1507                                                         
1508                                 //find the averages for each querys references 
1509                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1510                                         determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. 
1511                                 }
1512
1513                                 //write out data to file so parent can read it
1514                                 ofstream out;
1515                                 string s = toString(getpid()) + ".temp";
1516                                 openOutputFile(s, out);
1517                                 
1518                                 //output pairs
1519                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1520                                         for (int j = 0; j < windows[i].size(); j++) {
1521                                                 out << anova[i][j] << '\t';
1522                                         }
1523                                         out << endl;
1524                                         for (int j = 0; j < windows[i].size(); j++) {
1525                                                 out << isChimericConfidence[i][j] << '\t';
1526                                         }
1527                                         out << endl;
1528                                         for (int j = 0; j < windows[i].size(); j++) {
1529                                                 out << isChimericTStudent[i][j] << '\t';
1530                                         }
1531                                         out << endl;
1532                                         for (int j = 0; j < windows[i].size(); j++) {
1533                                                 out << isChimericANOVA[i][j] << '\t';
1534                                         }
1535                                         out << endl;
1536                                 }
1537
1538                                 out.close();
1539                                 
1540                                 exit(0);
1541                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1542                 }
1543                 
1544                 //force parent to wait until all the processes are done
1545                 for (int i=0;i<processors;i++) { 
1546                         int temp = processIDS[i];
1547                         wait(&temp);
1548                 }
1549         
1550                 //get data created by processes
1551                 for (int i=0;i<processors;i++) { 
1552                         ifstream in;
1553                         string s = toString(processIDS[i]) + ".temp";
1554                         openInputFile(s, in);
1555                 
1556                         //get pairs
1557                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1558                                 anova[k].resize(windows[k].size());
1559                                 isChimericConfidence[k].resize(windows[k].size(), false);
1560                                 isChimericTStudent[k].resize(windows[k].size(), false);
1561                                 isChimericANOVA[k].resize(windows[k].size(), false);
1562                                 int tempBool;
1563                                                                 
1564                                 for (int j = 0; j < windows[k].size(); j++) {
1565                                         in >> anova[k][j];
1566                                 }
1567                                 gobble(in);
1568                                 for (int j = 0; j < windows[k].size(); j++) {
1569                                         in >> tempBool;
1570                                         if (tempBool == 1) {  isChimericConfidence[k][j] = true;  }
1571                                 }
1572                                 gobble(in);
1573                                 for (int j = 0; j < windows[k].size(); j++) {
1574                                         in >> tempBool;
1575                                         if (tempBool == 1) {  isChimericTStudent[k][j] = true;  }
1576                                 }
1577                                 gobble(in);
1578                                 for (int j = 0; j < windows[k].size(); j++) {
1579                                         in >> tempBool;
1580                                         if (tempBool == 1) {  isChimericANOVA[k][j] = true;  }
1581                                 }
1582                                 gobble(in);
1583                         }
1584                         
1585                         in.close();
1586                         remove(s.c_str());
1587                 }
1588         #else
1589                         for (int i = 0; i < querySeqs.size(); i++) {
1590                                 determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i].
1591                         }
1592         #endif          
1593
1594         }
1595         catch(exception& e) {
1596                 errorOut(e, "Ccode", "createProcessesDetermine");
1597                 exit(1);
1598         }
1599 }
1600 //***************************************************************************************************************
1601
1602
1603