]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
added sorted parameter to get.oturep, added error checking to chimera classes in...
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
17 //***************************************************************************************************************
18
19 Ccode::~Ccode() {
20         try {
21                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
22                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
23         }
24         catch(exception& e) {
25                 errorOut(e, "Ccode", "~Ccode");
26                 exit(1);
27         }
28 }       
29 //***************************************************************************************************************
30 void Ccode::print(ostream& out) {
31         try {
32                 
33                 mothurOutEndLine();
34                 
35                 string mapInfo = getRootName(fastafile) + "mapinfo";
36                 ofstream out2;
37                 openOutputFile(mapInfo, out2);
38                 
39                 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
40                 
41                 for (int j = 0; j < querySeqs.size(); j++) {
42                         out2 << querySeqs[j]->getName() << endl;
43                         for (it = spotMap[j].begin(); it!= spotMap[j].end(); it++) {
44                                 out2 << it->first << '\t' << it->second << endl;
45                         }
46                 }
47                 out2.close();
48                 
49                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
50                 
51                 out << "For full window mapping info refer to " << mapInfo << endl << endl;
52                 
53                 for (int i = 0; i < querySeqs.size(); i++) {
54                 
55                         out << querySeqs[i]->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
56                         
57                         for (int j = 0; j < closest[i].size(); j++) {
58                                 out << setprecision(3) << closest[i][j].seq->getName() << '\t' << closest[i][j].dist << endl;
59                         }
60                         out << endl << endl;
61                 
62                         //for each window
63                         //window mapping info.
64                         out << "Mapping information: ";
65                         //you mask and did not filter
66                         if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
67                                 
68                         //you filtered and did not mask
69                         if ((seqMask == "") && (filter)) { out << "filter and trim."; }
70                                 
71                         //you masked and filtered
72                         if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
73                         
74                         out << endl << "Window\tStartPos\tEndPos" << endl;
75                         it = trim[i].begin();
76                         
77                         for (int k = 0; k < windows[i].size()-1; k++) {
78                                 out << k+1 << '\t' << spotMap[i][windows[i][k]-it->first] << '\t' << spotMap[i][windows[i][k]-it->first+windowSizes[i]] << endl;
79                         }
80                         
81                         out << windows[i].size() << '\t' << spotMap[i][windows[i][windows[i].size()-1]-it->first] << '\t' << spotMap[i][it->second-it->first-1] << endl;
82                         out << endl;
83                         
84                         out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
85                         for (int k = 0; k < windows[i].size(); k++) {
86                                 float ds = averageQuery[i][k] / averageRef[i][k]; 
87                                 out << k+1 << '\t' << averageQuery[i][k] << '\t' << sdQuery[i][k] << '\t' << averageRef[i][k] << '\t'<< sdRef[i][k] << '\t' << ds << '\t' << anova[i][k] << endl;
88                         }
89                         out << endl;
90                         
91                         //varRef
92                         //varQuery
93                         /* F test for differences among variances.
94                         * varQuery is expected to be higher or similar than varRef */
95                         //float fs = varQuery[query][i] / varRef[query][i];     /* F-Snedecor, test for differences of variances */
96                         
97                         bool results = false;   
98                                                 
99                         //confidence limit, t - Student, anova
100                         out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
101                         
102                         for (int k = 0; k < windows[i].size(); k++) {
103                                 string temp = "";
104                                 if (isChimericConfidence[i][k]) {  temp += "*\t"; }
105                                 else { temp += "\t"; }
106                                 
107                                 if (isChimericTStudent[i][k]) {  temp += "*\t"; }
108                                 else { temp += "\t"; }
109                                 
110                                 if (isChimericANOVA[i][k]) {  temp += "*\t"; }
111                                 else { temp += "\t"; }
112                         
113                                 out << k+1 << '\t' << temp << endl;
114                                 
115                                 if (temp == "*\t*\t*\t") {  results = true;  }
116                         }
117                         out << endl;    
118                         
119                         if (results) {
120                                 mothurOut(querySeqs[i]->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
121                         }
122                 }
123         }
124         catch(exception& e) {
125                 errorOut(e, "Ccode", "print");
126                 exit(1);
127         }
128 }
129
130 //***************************************************************************************************************
131 int Ccode::getChimeras() {
132         try {
133                 
134                 //read in query sequences and subject sequences
135                 mothurOut("Reading sequences and template file... "); cout.flush();
136                 querySeqs = readSeqs(fastafile);
137                 templateSeqs = readSeqs(templateFile);
138                 mothurOut("Done."); mothurOutEndLine();
139                 
140                 int numSeqs = querySeqs.size();
141                 
142                 if (unaligned) { mothurOut("Your sequences need to be aligned when you use the bellerophon ccode."); mothurOutEndLine(); return 1;  }
143                 
144                 closest.resize(numSeqs);
145                 
146                 refCombo.resize(numSeqs, 0);
147                 sumRef.resize(numSeqs); 
148                 varRef.resize(numSeqs); 
149                 varQuery.resize(numSeqs); 
150                 sdRef.resize(numSeqs); 
151                 sdQuery.resize(numSeqs);     
152                 sumQuery.resize(numSeqs);
153                 sumSquaredRef.resize(numSeqs); 
154                 sumSquaredQuery.resize(numSeqs); 
155                 averageRef.resize(numSeqs);
156                 averageQuery.resize(numSeqs);
157                 anova.resize(numSeqs);
158                 isChimericConfidence.resize(numSeqs);
159                 isChimericTStudent.resize(numSeqs);
160                 isChimericANOVA.resize(numSeqs);
161                 trim.resize(numSeqs);
162                 spotMap.resize(numSeqs);
163                 windowSizes.resize(numSeqs, window);
164                 windows.resize(numSeqs);
165                 
166                 //break up file if needed
167                 int linesPerProcess = numSeqs / processors ;
168                 
169                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
170                         //find breakup of sequences for all times we will Parallelize
171                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
172                         else {
173                                 //fill line pairs
174                                 for (int i = 0; i < (processors-1); i++) {                      
175                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
176                                 }
177                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
178                                 int i = processors - 1;
179                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
180                         }
181                         
182                 #else
183                         lines.push_back(new linePair(0, numSeqs));
184                 #endif
185         
186                 distCalc = new eachGapDist();
187                 decalc = new DeCalculator();
188                 
189                 //find closest
190                 if (processors == 1) { 
191                         mothurOut("Finding top matches for sequences... "); cout.flush();
192                         closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
193                         mothurOut("Done."); mothurOutEndLine();
194                 }else {         createProcessesClosest();               }
195
196                 //initialize spotMap
197                 for (int j = 0; j < numSeqs; j++) {
198                         for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) {
199                                 spotMap[j][i] = i;
200                         }
201                 }
202                 
203                 //mask sequences if the user wants to 
204                 if (seqMask != "") {
205                         decalc->setMask(seqMask);
206                         
207                         //mask querys
208                         for (int i = 0; i < querySeqs.size(); i++) {
209                                 decalc->runMask(querySeqs[i]);
210                         }
211                 
212                         //mask templates
213                         for (int i = 0; i < templateSeqs.size(); i++) {
214                                 decalc->runMask(templateSeqs[i]);
215                         }
216                         
217                         for (int i = 0; i < numSeqs; i++) {
218                                 spotMap[i] = decalc->getMaskMap();
219                         }
220                 }
221                 
222                 if (filter) {
223                         vector<Sequence*> temp = templateSeqs;
224                         for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
225                         
226                         createFilter(temp);
227                 
228                         runFilter(querySeqs);
229                         runFilter(templateSeqs);
230                         
231                         //update spotMap
232                         map<int, int> newMap;
233                         int spot = 0;
234                         int j = 0;
235                         
236                         for (int i = 0; i < filterString.length(); i++) {
237                                 if (filterString[i] == '1') {
238                                         //add to newMap
239                                         newMap[spot] = spotMap[j][i];
240                                         spot++;  
241                                 }
242                         }
243                         
244                         for (int i = 0; i < numSeqs; i++) {
245                                 spotMap[i] = newMap;
246                         }
247                 }
248
249                 //trim sequences - this follows ccodes remove_extra_gaps 
250                 for (int i = 0; i < querySeqs.size(); i++) {
251                         trimSequences(i);
252                 }
253                 
254                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
255                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
256                 for (int i = 0; i < querySeqs.size(); i++) {
257                         windows[i] = findWindows(i);  
258                 }
259
260                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later 
261                 if (processors == 1) { 
262                         for (int i = 0; i < closest.size(); i++) {
263                                 removeBadReferenceSeqs(closest[i], i);
264                         }
265                 }else {         createProcessesRemoveBad();             }
266
267                 
268                 if (processors == 1) { 
269                         //find the averages for each querys references
270                         for (int i = 0; i < numSeqs; i++) {
271                                 getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
272                         }
273                         
274                         //find the averages for the query 
275                         for (int i = 0; i < numSeqs; i++) {
276                                 getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
277                         }
278                 }else {         createProcessesAverages();              }
279                 
280                 if (processors == 1) { 
281                         //find the averages for each querys references 
282                         for (int i = 0; i < numSeqs; i++) {
283                                 findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
284                         }
285                         
286                         //find the averages for the query 
287                         for (int i = 0; i < numSeqs; i++) {
288                                 findVarianceQuery(i);  //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
289                         }
290                 }else {         createProcessesVariances();             }
291                 
292                 if (processors == 1) { 
293                         for (int i = 0; i < numSeqs; i++) {
294                                 determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. 
295                         }
296                 }else {         createProcessesDetermine();             }
297                 
298                 //free memory
299                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
300                 delete distCalc;
301                 delete decalc;
302                 
303                 return 0;
304         }
305         catch(exception& e) {
306                 errorOut(e, "Ccode", "getChimeras");
307                 exit(1);
308         }
309 }
310 /***************************************************************************************************************/
311 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
312 void Ccode::trimSequences(int query) {
313         try {
314                 
315                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
316                 int rearPos = querySeqs[query]->getAligned().length();
317                 
318                 //********find first position in closest seqs that is a non gap character***********//
319                 //find first position all query seqs that is a non gap character
320                 for (int i = 0; i < closest[query].size(); i++) {
321                         
322                         string aligned = closest[query][i].seq->getAligned();
323                         int pos = 0;
324                         
325                         //find first spot in this seq
326                         for (int j = 0; j < aligned.length(); j++) {
327                                 if (isalpha(aligned[j])) {
328                                         pos = j;
329                                         break;
330                                 }
331                         }
332                         
333                         //save this spot if it is the farthest
334                         if (pos > frontPos) { frontPos = pos; }
335                 }
336                 
337                 //find first position all querySeq[query] that is a non gap character
338                 string aligned = querySeqs[query]->getAligned();
339                 int pos = 0;
340                         
341                 //find first spot in this seq
342                 for (int j = 0; j < aligned.length(); j++) {
343                         if (isalpha(aligned[j])) {
344                                 pos = j;
345                                 break;
346                         }
347                 }
348                 
349                 //save this spot if it is the farthest
350                 if (pos > frontPos) { frontPos = pos; }
351                 
352                 
353                 //********find last position in closest seqs that is a non gap character***********//
354                 for (int i = 0; i < closest[query].size(); i++) {
355                         
356                         string aligned = closest[query][i].seq->getAligned();
357                         int pos = aligned.length();
358                         
359                         //find first spot in this seq
360                         for (int j = aligned.length()-1; j >= 0; j--) {
361                                 if (isalpha(aligned[j])) {
362                                         pos = j;
363                                         break;
364                                 }
365                         }
366                         
367                         //save this spot if it is the farthest
368                         if (pos < rearPos) { rearPos = pos; }
369                 }
370                 
371                 //find last position all querySeqs[query] that is a non gap character
372                 aligned = querySeqs[query]->getAligned();
373                 pos = aligned.length();
374                 
375                 //find first spot in this seq
376                 for (int j = aligned.length()-1; j >= 0; j--) {
377                         if (isalpha(aligned[j])) {
378                                 pos = j;
379                                 break;
380                         }
381                 }
382                 
383                 //save this spot if it is the farthest
384                 if (pos < rearPos) { rearPos = pos; }
385
386                 
387                 //check to make sure that is not whole seq
388                 if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
389                 
390                 map<int, int> tempTrim;
391                 tempTrim[frontPos] = rearPos;
392                 
393                 //save trimmed locations
394                 trim[query] = tempTrim;
395                                                 
396                 //update spotMask
397                 map<int, int> newMap;
398                 int spot = 0;
399                 
400                 for (int i = frontPos; i < rearPos; i++) {
401                         //add to newMap
402 //cout << query << '\t' << i << '\t' << spotMap[query][i] << endl;
403                         newMap[spot] = spotMap[query][i];
404                         spot++;  
405                 }
406                 
407                 //for (it = newMap.begin(); it!=newMap.end(); it++) {
408                         //cout << query << '\t' << it->first << '\t' << it->second << endl;
409                 //}
410                 
411                 spotMap[query] = newMap;
412
413                 
414         }
415         catch(exception& e) {
416                 errorOut(e, "Ccode", "trimSequences");
417                 exit(1);
418         }
419
420 }
421 /***************************************************************************************************************/
422 vector<int> Ccode::findWindows(int query) {
423         try {
424                 
425                 vector<int> win; 
426                 it = trim[query].begin();
427                 
428                 int length = it->second - it->first;
429         
430                 //default is wanted = 10% of total length
431                 if (windowSizes[query] > length) { 
432                         mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
433                         windowSizes[query] = length / 10;
434                 }else if (windowSizes[query] == 0) { windowSizes[query] = length / 10;  }
435                 else if (windowSizes[query] > (length * 0.20)) {
436                         mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
437                 }else if (windowSizes[query] < (length * 0.05)) {
438                         mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
439                 }
440                 
441                 //save starting points of each window
442                 for (int m = it->first;  m < (it->second-windowSizes[query]); m+=windowSizes[query]) {  win.push_back(m);  }
443                 
444                 //save last window
445                 if (win[win.size()-1] < (it->first+length)) {
446                         win.push_back(win[win.size()-1]+windowSizes[query]); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
447                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
448                 
449
450                 return win;
451         
452         }
453         catch(exception& e) {
454                 errorOut(e, "Ccode", "findWindows");
455                 exit(1);
456         }
457 }
458 //***************************************************************************************************************
459 int Ccode::getDiff(string seqA, string seqB) {
460         try {
461                 
462                 int numDiff = 0;
463                 
464                 for (int i = 0; i < seqA.length(); i++) {
465                         //if you are both not gaps
466                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
467                                 //are you different
468                                 if (seqA[i] != seqB[i]) { 
469                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
470
471                                         /* the char in base_a and base_b have been checked and they are different */
472                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
473                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
474                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
475                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
476                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
477                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
478                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
479                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
480                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
481                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
482                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
483                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
484                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
485                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
486                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
487                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
488                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
489                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
490                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
491                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
492                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
493                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
494                                         else ok = 0;  /* the bases are different and not equivalent */
495                                         
496                                         //check if they are both blanks
497                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
498                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
499                                         
500                                         if (ok == 0) {  numDiff++;  }
501                                 }
502                         //}
503                 }
504                 
505                 return numDiff;
506         
507         }
508         catch(exception& e) {
509                 errorOut(e, "Ccode", "getDiff");
510                 exit(1);
511         }
512 }
513 //***************************************************************************************************************
514 //tried to make this look most like ccode original implementation
515 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs, int query) {
516         try {
517                 
518                 vector< vector<int> > numDiffBases;
519                 numDiffBases.resize(seqs.size());
520                 //initialize to 0
521                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
522                 
523                 it = trim[query].begin();
524                 int length = it->second - it->first;
525                 
526                 //calc differences from each sequence to everyother seq in the set
527                 for (int i = 0; i < seqs.size(); i++) {
528                         
529                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
530                         
531                         //so you don't calc i to j and j to i since they are the same
532                         for (int j = 0; j < i; j++) {
533                                 
534                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
535                                 
536                                 //compare strings
537                                 int numDiff = getDiff(seqA, seqB);
538                                 
539                                 numDiffBases[i][j] = numDiff;
540                                 numDiffBases[j][i] = numDiff;
541                         }
542                 }
543                 
544                 //initailize remove to 0
545                 vector<int> remove;  remove.resize(seqs.size(), 0);
546                 float top = ((20*length) / (float) 100);
547                 float bottom = ((0.5*length) / (float) 100);
548                 
549                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
550                 for (int i = 0; i < numDiffBases.size(); i++) {
551                         for (int j = 0; j < i; j++) {   
552                                 //are you more than 20% different
553                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
554                                 //are you less than 0.5% different
555                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
556                         }
557                 }
558                 
559                 int numSeqsLeft = 0;
560                 
561                 //count seqs that are not going to be removed
562                 for (int i = 0; i < remove.size(); i++) {  
563                         if (remove[i] == 0)  { numSeqsLeft++;  }
564                 }
565                 
566                 //if you have enough then remove bad ones
567                 if (numSeqsLeft >= 3) {
568                         vector<SeqDist> goodSeqs;
569                         //remove bad seqs
570                         for (int i = 0; i < remove.size(); i++) {
571                                 if (remove[i] == 0) { 
572                                         goodSeqs.push_back(seqs[i]);
573                                 }
574                         }
575                         
576                         seqs = goodSeqs;
577                         
578                 }else { //warn, but dont remove any
579                         mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); mothurOutEndLine();  
580                 }
581                         
582
583         }
584         catch(exception& e) {
585                 errorOut(e, "Ccode", "removeBadReferenceSeqs");
586                 exit(1);
587         }
588 }
589 //***************************************************************************************************************
590 vector< vector<SeqDist> > Ccode::findClosest(int start, int end, int numWanted) {
591         try{
592         
593                 vector< vector<SeqDist> > topMatches;  topMatches.resize(querySeqs.size());
594         
595                 float smallestOverall, smallestLeft, smallestRight;
596                 smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
597                 
598                 //for each sequence in querySeqs - find top matches to use as reference
599                 for(int j = start; j < end; j++){
600                         
601                         Sequence query = *(querySeqs[j]);
602                         
603                         vector<SeqDist> distances;
604                         
605                         //calc distance to each sequence in template seqs
606                         for (int i = 0; i < templateSeqs.size(); i++) {
607                         
608                                 Sequence ref = *(templateSeqs[i]); 
609                                         
610                                 //find overall dist
611                                 distCalc->calcDist(query, ref);
612                                 float dist = distCalc->getDist();       
613                                 
614                                 //save distance
615                                 SeqDist temp;
616                                 temp.seq = templateSeqs[i];
617                                 temp.dist = dist;
618
619                                 distances.push_back(temp);
620                         }
621                         
622                         sort(distances.begin(), distances.end(), compareSeqDist);
623                         
624                         //save the number of top matches wanted
625                         for (int h = 0; h < numWanted; h++) {
626                                 topMatches[j].push_back(distances[h]);
627                         }
628                 }
629                         
630                 return topMatches;
631
632         }
633         catch(exception& e) {
634                 errorOut(e, "Ccode", "findClosestSides");
635                 exit(1);
636         }
637 }
638 /**************************************************************************************************/
639 //find the distances from each reference sequence to every other reference sequence for each window for this query
640 void Ccode::getAverageRef(vector<SeqDist> ref, int query) {
641         try {
642                 
643                 vector< vector< vector<int> > > diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
644                 
645                 //initialize diffs vector
646                 diffs.resize(ref.size());
647                 for (int i = 0; i < diffs.size(); i++) {  
648                         diffs[i].resize(ref.size());
649                         for (int j = 0; j < diffs[i].size(); j++) {
650                                 diffs[i][j].resize(windows[query].size(), 0);
651                         }
652                 }
653                 
654                 it = trim[query].begin();
655                                 
656                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
657                 for (int i = 0; i < ref.size(); i++) {
658                         
659                         string refI = ref[i].seq->getAligned();
660                         
661                         //j<i, so you don't find distances from i to j and then j to i.
662                         for (int j = 0; j < i; j++) {
663                         
664                                 string refJ = ref[j].seq->getAligned();
665                         
666                                 for (int k = 0; k < windows[query].size(); k++) {
667                                         
668                                         string refIWindowk, refJWindowk;
669                                         
670                                         if (k < windows[query].size()-1) {
671                                                 //get window strings
672                                                 refIWindowk = refI.substr(windows[query][k], windowSizes[query]);
673                                                 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);
674                                         }else { //last window may be smaller than rest - see findwindows
675                                                 //get window strings
676                                                 refIWindowk = refI.substr(windows[query][k], (it->second-windows[query][k]));
677                                                 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
678                                         }
679                                         
680                                         //find differences
681                                         int diff = getDiff(refIWindowk, refJWindowk);
682 //cout <<  i << '\t' << j << '\t' << k << '\t' << diff << endl;
683                                         //save differences in [i][j][k] and [j][i][k] since they are the same
684                                         diffs[i][j][k] = diff;
685                                         diffs[j][i][k] = diff;
686
687                                 }//k
688                                 
689                         }//j
690                 
691                 }//i
692                 
693                 //initialize sumRef for this query 
694                 sumRef[query].resize(windows[query].size(), 0);
695                 sumSquaredRef[query].resize(windows[query].size(), 0);
696                 averageRef[query].resize(windows[query].size(), 0);
697                 
698                 //find the sum of the differences for hte reference sequences
699                 for (int i = 0; i < diffs.size(); i++) {
700                         for (int j = 0; j < i; j++) {
701                         
702                                 //increment this querys reference sequences combos
703                                 refCombo[query]++;
704                                 
705                                 for (int k = 0; k < diffs[i][j].size(); k++) {
706                                         sumRef[query][k] += diffs[i][j][k];
707                                         sumSquaredRef[query][k] += (diffs[i][j][k]*diffs[i][j][k]);
708                                 }//k
709                                 
710                         }//j
711                 }//i
712
713                 
714                 //find the average of the differences for the references for each window
715                 for (int i = 0; i < windows[query].size(); i++) {
716                         averageRef[query][i] = sumRef[query][i] / (float) refCombo[query];
717                 }
718                 
719         }
720         catch(exception& e) {
721                 errorOut(e, "Ccode", "getAverageRef");
722                 exit(1);
723         }
724 }
725 /**************************************************************************************************/
726 void Ccode::getAverageQuery (vector<SeqDist> ref, int query) {
727         try {
728         
729                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
730                 
731                 //initialize diffs vector
732                 diffs.resize(ref.size());
733                 for (int j = 0; j < diffs.size(); j++) {
734                         diffs[j].resize(windows[query].size(), 0);
735                 }
736                 
737                 it = trim[query].begin();
738                                                         
739                 string refQuery = querySeqs[query]->getAligned();
740                          
741                 //j<i, so you don't find distances from i to j and then j to i.
742                 for (int j = 0; j < ref.size(); j++) {
743                          
744                          string refJ = ref[j].seq->getAligned();
745                          
746                          for (int k = 0; k < windows[query].size(); k++) {
747                                         
748                                         string QueryWindowk, refJWindowk;
749                                         
750                                         if (k < windows[query].size()-1) {
751                                                 //get window strings
752                                                 QueryWindowk = refQuery.substr(windows[query][k], windowSizes[query]);
753                                                 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);                                       
754                                         }else { //last window may be smaller than rest - see findwindows
755                                                 //get window strings
756                                                 QueryWindowk = refQuery.substr(windows[query][k], (it->second-windows[query][k]));
757                                                 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
758                                         }
759                                         
760                                         //find differences
761                                         int diff = getDiff(QueryWindowk, refJWindowk);
762 //cout  << j << '\t' << k << '\t' << diff << endl;                                              
763                                         //save differences 
764                                         diffs[j][k] = diff;
765                          
766                          }//k
767                 }//j
768                          
769                 
770                 //initialize sumRef for this query 
771                 sumQuery[query].resize(windows[query].size(), 0);
772                 sumSquaredQuery[query].resize(windows[query].size(), 0);
773                 averageQuery[query].resize(windows[query].size(), 0);
774                 
775                 //find the sum of the differences 
776                 for (int j = 0; j < diffs.size(); j++) {
777                         for (int k = 0; k < diffs[j].size(); k++) {
778                                 sumQuery[query][k] += diffs[j][k];
779                                 sumSquaredQuery[query][k] += (diffs[j][k]*diffs[j][k]);
780                         }//k
781                 }//j
782                 
783                 
784                 //find the average of the differences for the references for each window
785                 for (int i = 0; i < windows[query].size(); i++) {
786                         averageQuery[query][i] = sumQuery[query][i] / (float) ref.size();
787                 }
788
789         
790         }
791         catch(exception& e) {
792                 errorOut(e, "Ccode", "getAverageQuery");
793                 exit(1);
794         }
795 }
796 /**************************************************************************************************/
797 void Ccode::findVarianceRef (int query) {
798         try {
799                 
800                 varRef[query].resize(windows[query].size(), 0);
801                 sdRef[query].resize(windows[query].size(), 0);
802                 
803                 //for each window
804                 for (int i = 0; i < windows[query].size(); i++) {
805                         varRef[query][i] = (sumSquaredRef[query][i] - ((sumRef[query][i]*sumRef[query][i])/(float)refCombo[query])) / (float)(refCombo[query]-1);
806                         sdRef[query][i] = sqrt(varRef[query][i]);
807                         
808                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
809                         if (averageRef[query][i] < 0.001)                       {       averageRef[query][i] = 0.001;           }
810                         if (sumRef[query][i] < 0.001)                           {       sumRef[query][i] = 0.001;                       }
811                         if (varRef[query][i] < 0.001)                           {       varRef[query][i] = 0.001;                       }
812                         if (sumSquaredRef[query][i] < 0.001)            {       sumSquaredRef[query][i] = 0.001;        }
813                         if (sdRef[query][i] < 0.001)                            {       sdRef[query][i] = 0.001;                        }
814                                 
815                 }
816         }
817         catch(exception& e) {
818                 errorOut(e, "Ccode", "findVarianceRef");
819                 exit(1);
820         }
821 }
822 /**************************************************************************************************/
823 void Ccode::findVarianceQuery (int query) {
824         try {
825                 varQuery[query].resize(windows[query].size(), 0);
826                 sdQuery[query].resize(windows[query].size(), 0);
827                 
828                 //for each window
829                 for (int i = 0; i < windows[query].size(); i++) {
830                         varQuery[query][i] = (sumSquaredQuery[query][i] - ((sumQuery[query][i]*sumQuery[query][i])/(float) closest[query].size())) / (float) (closest[query].size()-1);
831                         sdQuery[query][i] = sqrt(varQuery[query][i]);
832                         
833                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
834                         if (averageQuery[query][i] < 0.001)                     {       averageQuery[query][i] = 0.001;         }
835                         if (sumQuery[query][i] < 0.001)                         {       sumQuery[query][i] = 0.001;                     }
836                         if (varQuery[query][i] < 0.001)                         {       varQuery[query][i] = 0.001;                     }
837                         if (sumSquaredQuery[query][i] < 0.001)          {       sumSquaredQuery[query][i] = 0.001;      }
838                         if (sdQuery[query][i] < 0.001)                          {       sdQuery[query][i] = 0.001;                      }
839                 }
840
841         }
842         catch(exception& e) {
843                 errorOut(e, "Ccode", "findVarianceQuery");
844                 exit(1);
845         }
846 }
847 /**************************************************************************************************/
848 void Ccode::determineChimeras (int query) {
849         try {
850                 
851                 isChimericConfidence[query].resize(windows[query].size(), false);
852                 isChimericTStudent[query].resize(windows[query].size(), false);
853                 isChimericANOVA[query].resize(windows[query].size(), false);
854                 anova[query].resize(windows[query].size());
855
856                 
857                 //for each window
858                 for (int i = 0; i < windows[query].size(); i++) {
859                 
860                         //get confidence limits
861                         float t = getT(closest[query].size()-1);  //how many seqs you are comparing to this querySeq
862                         float dsUpper = (averageQuery[query][i] + (t * sdQuery[query][i])) / averageRef[query][i]; 
863                         float dsLower = (averageQuery[query][i] - (t * sdQuery[query][i])) / averageRef[query][i]; 
864 //cout << t << '\t' << "ds upper = " << dsUpper << " dsLower = " << dsLower << endl;                    
865                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[query][i] > averageRef[query][i])) {  /* range does not include 1 */
866                                         isChimericConfidence[query][i] = true;   /* significantly higher at P<0.05 */ 
867 //cout << i << " made it here" << endl;
868                         }
869                         
870                         //student t test
871                         int degreeOfFreedom = refCombo[query] + closest[query].size() - 2;
872                         float denomForT = (((refCombo[query]-1) * varQuery[query][i] + (closest[query].size() - 1) * varRef[query][i]) / (float) degreeOfFreedom) * ((refCombo[query] + closest[query].size()) / (float) (refCombo[query] * closest[query].size()));    /* denominator, without sqrt(), for ts calculations */
873                                 
874                         float ts = fabs((averageQuery[query][i] - averageRef[query][i]) / (sqrt(denomForT)));  /* value of ts for t-student test */     
875                         t = getT(degreeOfFreedom);
876 //cout << i << '\t' << t << '\t' << ts << endl;                 
877                         if ((ts >= t) && (averageQuery[query][i] > averageRef[query][i])) {   
878                                 isChimericTStudent[query][i] = true;   /* significantly higher at P<0.05 */ 
879                         }
880                 
881                         //anova test
882                         float value1 = sumQuery[query][i] + sumRef[query][i];
883                         float value2 = sumSquaredQuery[query][i] + sumSquaredRef[query][i];
884                         float value3 = ((sumQuery[query][i]*sumQuery[query][i]) / (float) (closest[query].size())) + ((sumRef[query][i] * sumRef[query][i]) / (float) refCombo[query]);
885                         float value4 = (value1 * value1) / ( (float) (closest[query].size() + refCombo[query]) );
886                         float value5 = value2 - value4;
887                         float value6 = value3 - value4;
888                         float value7 = value5 - value6;
889                         float value8 = value7 / ((float) degreeOfFreedom);
890                         float anovaValue = value6 / value8;
891                         
892                         float f = getF(degreeOfFreedom);
893                         
894                          if ((anovaValue >= f) && (averageQuery[query][i] > averageRef[query][i]))  {
895                        isChimericANOVA[query][i] = true;   /* significant P<0.05 */
896                 }
897                         
898                         if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
899                         
900                         anova[query][i] = anovaValue;
901                 }
902                 
903         }
904         catch(exception& e) {
905                 errorOut(e, "Ccode", "determineChimeras");
906                 exit(1);
907         }
908 }
909 /**************************************************************************************************/    
910 float Ccode::getT(int numseq) {
911         try {
912         
913                 float tvalue = 0;
914                 
915                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
916                 if (numseq > 120) tvalue = 1.645;
917                 else if (numseq > 60) tvalue = 1.658;
918         else if (numseq > 40) tvalue = 1.671;
919         else if (numseq > 30) tvalue = 1.684;
920         else if (numseq > 29) tvalue = 1.697;
921         else if (numseq > 28) tvalue = 1.699;
922         else if (numseq > 27) tvalue = 1.701;
923         else if (numseq > 26) tvalue = 1.703;
924         else if (numseq > 25) tvalue = 1.706;
925         else if (numseq > 24) tvalue = 1.708;
926         else if (numseq > 23) tvalue = 1.711;
927         else if (numseq > 22) tvalue = 1.714;
928         else if (numseq > 21) tvalue = 1.717;
929         else if (numseq > 20) tvalue = 1.721;
930         else if (numseq > 19) tvalue = 1.725;
931         else if (numseq > 18) tvalue = 1.729;
932         else if (numseq > 17) tvalue = 1.734;
933         else if (numseq > 16) tvalue = 1.740;
934         else if (numseq > 15) tvalue = 1.746;
935         else if (numseq > 14) tvalue = 1.753;
936         else if (numseq > 13) tvalue = 1.761;
937         else if (numseq > 12) tvalue = 1.771;
938         else if (numseq > 11) tvalue = 1.782;
939         else if (numseq > 10) tvalue = 1.796;
940         else if (numseq > 9) tvalue = 1.812;
941         else if (numseq > 8) tvalue = 1.833;
942         else if (numseq > 7) tvalue = 1.860;
943         else if (numseq > 6) tvalue = 1.895;
944         else if (numseq > 5) tvalue = 1.943;
945         else if (numseq > 4) tvalue = 2.015;
946         else if (numseq > 3) tvalue = 2.132;
947         else if (numseq > 2) tvalue = 2.353;
948         else if (numseq > 1) tvalue = 2.920;
949                 else if (numseq <= 1) {
950                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
951                 }
952                 
953                 return tvalue;
954         }
955         catch(exception& e) {
956                 errorOut(e, "Ccode", "getT");
957                 exit(1);
958         }
959 }
960 /**************************************************************************************************/    
961 float Ccode::getF(int numseq) {
962         try {
963         
964                 float fvalue = 0;
965                 
966                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
967         if (numseq > 120) fvalue = 3.84;
968         else if (numseq > 60) fvalue = 3.92;
969         else if (numseq > 40) fvalue = 4.00;
970         else if (numseq > 30) fvalue = 4.08;
971         else if (numseq > 29) fvalue = 4.17;
972         else if (numseq > 28) fvalue = 4.18;
973         else if (numseq > 27) fvalue = 4.20;
974         else if (numseq > 26) fvalue = 4.21;
975         else if (numseq > 25) fvalue = 4.23;
976         else if (numseq > 24) fvalue = 4.24;
977         else if (numseq > 23) fvalue = 4.26;
978         else if (numseq > 22) fvalue = 4.28;
979         else if (numseq > 21) fvalue = 4.30;
980         else if (numseq > 20) fvalue = 4.32;
981         else if (numseq > 19) fvalue = 4.35;
982         else if (numseq > 18) fvalue = 4.38;
983         else if (numseq > 17) fvalue = 4.41;
984         else if (numseq > 16) fvalue = 4.45;
985         else if (numseq > 15) fvalue = 4.49;
986         else if (numseq > 14) fvalue = 4.54;
987         else if (numseq > 13) fvalue = 4.60;
988         else if (numseq > 12) fvalue = 4.67;
989         else if (numseq > 11) fvalue = 4.75;
990         else if (numseq > 10) fvalue = 4.84;
991         else if (numseq > 9) fvalue = 4.96;
992         else if (numseq > 8) fvalue = 5.12;
993         else if (numseq > 7) fvalue = 5.32;
994         else if (numseq > 6) fvalue = 5.59;
995         else if (numseq > 5) fvalue = 5.99;
996         else if (numseq > 4) fvalue = 6.61;
997         else if (numseq > 3) fvalue = 7.71;
998         else if (numseq > 2) fvalue = 10.1;
999         else if (numseq > 1) fvalue = 18.5;
1000         else if (numseq > 0) fvalue = 161;
1001                 else if (numseq <= 0) {
1002                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
1003         }
1004                 
1005                 return fvalue;
1006         }
1007         catch(exception& e) {
1008                 errorOut(e, "Ccode", "getF");
1009                 exit(1);
1010         }
1011 }
1012
1013 /**************************************************************************************************/
1014 void Ccode::createProcessesClosest() {
1015         try {
1016 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1017                 int process = 0;
1018                 vector<int> processIDS;
1019                 
1020                 //loop through and create all the processes you want
1021                 while (process != processors) {
1022                         int pid = fork();
1023                         
1024                         if (pid > 0) {
1025                                 processIDS.push_back(pid);  
1026                                 process++;
1027                         }else if (pid == 0){
1028                                 
1029                                 mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1030                                 closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
1031                                 mothurOut("Done finding top matches for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1032                                 
1033                                 //write out data to file so parent can read it
1034                                 ofstream out;
1035                                 string s = toString(getpid()) + ".temp";
1036                                 openOutputFile(s, out);
1037                                                         
1038                                 //output pairs
1039                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1040                                          for (int j = 0; j < closest[i].size(); j++) {
1041                                                 closest[i][j].seq->printSequence(out);
1042                                          }
1043                                 }
1044                                 out << ">" << endl; //to stop sequence read
1045                                 
1046                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1047                                          for (int j = 0; j < closest[i].size(); j++) {
1048                                                 out << closest[i][j].dist << '\t';
1049                                          }
1050                                          out << endl;
1051                                 }
1052                                 out.close();
1053                                 exit(0);
1054                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1055                 }
1056                 
1057                 //force parent to wait until all the processes are done
1058                 for (int i=0;i<processors;i++) { 
1059                         int temp = processIDS[i];
1060                         wait(&temp);
1061                 }
1062                 
1063                 //get data created by processes
1064                 for (int i=0;i<processors;i++) { 
1065                         ifstream in;
1066                         string s = toString(processIDS[i]) + ".temp";
1067                         openInputFile(s, in);
1068                         
1069                         vector< vector<Sequence*> > tempClosest;  tempClosest.resize(querySeqs.size());
1070                         //get pairs
1071                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1072                                 vector<Sequence*> tempVector;
1073         
1074                                 for (int j = 0; j < numWanted; j++) {
1075                                 
1076                                         Sequence* temp = new Sequence(in);
1077                                         gobble(in);
1078                                                 
1079                                         tempVector.push_back(temp);
1080                                 }
1081                                 
1082                                 tempClosest[k] = tempVector;
1083                         }
1084                         
1085                         string junk;
1086                         in >> junk;  gobble(in);  // to get ">"
1087                 
1088                         vector< vector<float> > dists; dists.resize(querySeqs.size());
1089                         
1090                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1091                                 dists[k].resize(numWanted);
1092                                 for (int j = 0; j < numWanted; j++) {
1093                                         in >> dists[k][j];
1094                                 }
1095                                 gobble(in);
1096                         
1097                         } 
1098
1099                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1100                                 closest[k].resize(numWanted);
1101                                 for (int j = 0; j < closest[k].size(); j++) {
1102                                         closest[k][j].seq = tempClosest[k][j];
1103                                         closest[k][j].dist = dists[k][j]; 
1104                                 }
1105                         } 
1106
1107                         in.close();
1108                         remove(s.c_str());
1109                 }
1110                         
1111         
1112 #else
1113                 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
1114 #endif  
1115
1116         }
1117         catch(exception& e) {
1118                 errorOut(e, "Ccode", "createProcessesClosest");
1119                 exit(1);
1120         }
1121 }
1122
1123 //***************************************************************************************************************
1124 void Ccode::createProcessesRemoveBad() {
1125         try {
1126 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1127                 int process = 0;
1128                 vector<int> processIDS;
1129                 
1130                 //loop through and create all the processes you want
1131                 while (process != processors) {
1132                         int pid = fork();
1133                         
1134                         if (pid > 0) {
1135                                 processIDS.push_back(pid);  
1136                                 process++;
1137                         }else if (pid == 0){
1138                                                         
1139                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1140                                         removeBadReferenceSeqs(closest[i], i);
1141                                 }
1142                                 
1143                                 //write out data to file so parent can read it
1144                                 ofstream out;
1145                                 string s = toString(getpid()) + ".temp";
1146                                 openOutputFile(s, out);
1147                                 
1148                                 //output pairs
1149                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1150                                         out << closest[i].size() << endl;
1151                                          for (int j = 0; j < closest[i].size(); j++) {
1152                                                 closest[i][j].seq->printSequence(out);
1153                                          }
1154                                          out << ">" << endl; //to stop sequence read
1155                                 }
1156                                 
1157                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1158                                          for (int j = 0; j < closest[i].size(); j++) {
1159                                                 out << closest[i][j].dist << '\t';
1160                                          }
1161                                          out << endl;
1162                                 }
1163
1164                                 out.close();
1165                                 
1166                                 exit(0);
1167                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1168                 }
1169                 
1170                 //force parent to wait until all the processes are done
1171                 for (int i=0;i<processors;i++) { 
1172                         int temp = processIDS[i];
1173                         wait(&temp);
1174                 }
1175         
1176                 //get data created by processes
1177                 for (int i=0;i<processors;i++) { 
1178                         ifstream in;
1179                         string s = toString(processIDS[i]) + ".temp";
1180                         openInputFile(s, in);
1181                 
1182                         vector< vector<Sequence*> > tempClosest;  tempClosest.resize(querySeqs.size());
1183                         vector<int> sizes; 
1184                         //get pairs
1185                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1186                 
1187                                 int num;
1188                                 in >> num; 
1189                                 sizes.push_back(num);
1190                                 gobble(in);
1191                                 
1192                                 vector<Sequence*> tempVector;
1193                                 
1194                                 for (int j = 0; j < num; j++) {
1195                                 
1196                                         Sequence* temp = new Sequence(in);
1197                                         gobble(in);
1198                                                 
1199                                         tempVector.push_back(temp);
1200                                 }
1201                                 string junk;
1202                                 in >> junk;  gobble(in);  // to get ">"
1203
1204                                 tempClosest[k] = tempVector;
1205                         }
1206                         
1207                         vector< vector<float> > dists; dists.resize(querySeqs.size());
1208                         int count = 0;
1209                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1210                                 dists[k].resize(sizes[count]);
1211                                 for (int j = 0; j < sizes[count]; j++) {
1212                                         in >> dists[k][j];
1213                                 }
1214                                 gobble(in);
1215                                 count++;
1216                         } 
1217                         
1218                         count = 0;
1219                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1220                                 for (int j = 0; j < sizes[count]; j++) {
1221                                         closest[k][j].seq = tempClosest[k][j];
1222                                         closest[k][j].dist = dists[k][j]; 
1223                                 }
1224                                 count++;
1225                         } 
1226
1227                         in.close();
1228                         remove(s.c_str());
1229                 }
1230 #else
1231                 for (int i = 0; i < closest.size(); i++) {
1232                         removeBadReferenceSeqs(closest[i], i);
1233                 }
1234 #endif          
1235         
1236         }
1237         catch(exception& e) {
1238                 errorOut(e, "Ccode", "createProcessesRemoveBad");
1239                 exit(1);
1240         }
1241 }
1242 //***************************************************************************************************************
1243 void Ccode::createProcessesAverages() {
1244         try {
1245         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1246                 int process = 0;
1247                 vector<int> processIDS;
1248                 
1249                 //loop through and create all the processes you want
1250                 while (process != processors) {
1251                         int pid = fork();
1252                         
1253                         if (pid > 0) {
1254                                 processIDS.push_back(pid);  
1255                                 process++;
1256                         }else if (pid == 0){
1257                                                         
1258                                 //find the averages for each querys references 
1259                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1260                                         getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
1261                                 }
1262                                 
1263                                 //find the averages for the query 
1264                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1265                                         getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
1266                                 }
1267                                 
1268                                 
1269                                 //write out data to file so parent can read it
1270                                 ofstream out;
1271                                 string s = toString(getpid()) + ".temp";
1272                                 openOutputFile(s, out);
1273                                 
1274                                 //output pairs
1275                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1276                                         for (int j = 0; j < windows[i].size(); j++) {
1277                                                 out << sumRef[i][j] << '\t';
1278                                         }
1279                                         out << endl;
1280                                         for (int j = 0; j < windows[i].size(); j++) {
1281                                                 out << averageRef[i][j] << '\t';
1282                                         }
1283                                         out << endl;
1284                                         for (int j = 0; j < windows[i].size(); j++) {
1285                                                 out << sumSquaredRef[i][j] << '\t';
1286                                         }
1287                                         out << endl;
1288                                         for (int j = 0; j < windows[i].size(); j++) {
1289                                                 out << sumQuery[i][j] << '\t';
1290                                         }
1291                                         out << endl;
1292                                         for (int j = 0; j < windows[i].size(); j++) {
1293                                                 out << averageQuery[i][j] << '\t';
1294                                         }
1295                                         out << endl;
1296                                         for (int j = 0; j < windows[i].size(); j++) {
1297                                                 out << sumSquaredQuery[i][j] << '\t';
1298                                         }
1299                                         out << endl;
1300                                         out << refCombo[i] << endl;
1301                                 }
1302
1303                                 out.close();
1304                                 
1305                                 exit(0);
1306                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1307                 }
1308                 
1309                 //force parent to wait until all the processes are done
1310                 for (int i=0;i<processors;i++) { 
1311                         int temp = processIDS[i];
1312                         wait(&temp);
1313                 }
1314         
1315                 //get data created by processes
1316                 for (int i=0;i<processors;i++) { 
1317                         ifstream in;
1318                         string s = toString(processIDS[i]) + ".temp";
1319                         openInputFile(s, in);
1320                 
1321                         //get pairs
1322                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1323                                 sumRef[k].resize(windows[k].size());
1324                                 averageRef[k].resize(windows[k].size());
1325                                 sumSquaredRef[k].resize(windows[k].size());
1326                                 averageQuery[k].resize(windows[k].size());
1327                                 sumQuery[k].resize(windows[k].size());
1328                                 sumSquaredQuery[k].resize(windows[k].size());
1329                                 
1330                                 for (int j = 0; j < windows[k].size(); j++) {
1331                                         in >> sumRef[k][j];
1332                                 }
1333                                 gobble(in);
1334                                 for (int j = 0; j < windows[k].size(); j++) {
1335                                         in >> averageRef[k][j];
1336                                 }
1337                                 gobble(in);
1338                                 for (int j = 0; j < windows[k].size(); j++) {
1339                                         in >> sumSquaredRef[k][j];
1340                                 }
1341                                 gobble(in);
1342                                 for (int j = 0; j < windows[k].size(); j++) {
1343                                         in >> sumQuery[k][j];
1344                                 }
1345                                 gobble(in);
1346                                 for (int j = 0; j < windows[k].size(); j++) {
1347                                         in >> averageQuery[k][j];
1348                                 }
1349                                 gobble(in);
1350                                 for (int j = 0; j < windows[k].size(); j++) {
1351                                         in >> sumSquaredQuery[k][j];
1352                                 }
1353                                 gobble(in);
1354                                 in >> refCombo[k];
1355                                 gobble(in);
1356                         }
1357                         
1358                         in.close();
1359                         remove(s.c_str());
1360                 }
1361 #else
1362                 //find the averages for each querys references 
1363                 for (int i = 0; i < querySeqs.size(); i++) {
1364                         getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
1365                 }
1366         
1367                 //find the averages for the query 
1368                 for (int i = 0; i < querySeqs.size(); i++) {
1369                         getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
1370                 }
1371
1372 #endif          
1373         
1374         }
1375         catch(exception& e) {
1376                 errorOut(e, "Ccode", "createProcessesAverages");
1377                 exit(1);
1378         }
1379 }
1380 //***************************************************************************************************************
1381 void Ccode::createProcessesVariances() {
1382         try {
1383         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1384                 int process = 0;
1385                 vector<int> processIDS;
1386                 
1387                 //loop through and create all the processes you want
1388                 while (process != processors) {
1389                         int pid = fork();
1390                         
1391                         if (pid > 0) {
1392                                 processIDS.push_back(pid);  
1393                                 process++;
1394                         }else if (pid == 0){
1395                                                         
1396                                 //find the averages for each querys references 
1397                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1398                                         findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1399                                 }
1400                                 
1401                                 //find the averages for the query 
1402                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1403                                         findVarianceQuery(i);  //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1404                                 }
1405                                 
1406                                 
1407                                 //write out data to file so parent can read it
1408                                 ofstream out;
1409                                 string s = toString(getpid()) + ".temp";
1410                                 openOutputFile(s, out);
1411                                 
1412                                 //output pairs
1413                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1414                                         for (int j = 0; j < windows[i].size(); j++) {
1415                                                 out << varRef[i][j] << '\t';
1416                                         }
1417                                         out << endl;
1418                                         for (int j = 0; j < windows[i].size(); j++) {
1419                                                 out << sdRef[i][j] << '\t';
1420                                         }
1421                                         out << endl;
1422                                         for (int j = 0; j < windows[i].size(); j++) {
1423                                                 out << varQuery[i][j] << '\t';
1424                                         }
1425                                         out << endl;
1426                                         for (int j = 0; j < windows[i].size(); j++) {
1427                                                 out << sdQuery[i][j] << '\t';
1428                                         }
1429                                         out << endl;
1430                                 }
1431
1432                                 out.close();
1433                                 
1434                                 exit(0);
1435                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1436                 }
1437                 
1438                 //force parent to wait until all the processes are done
1439                 for (int i=0;i<processors;i++) { 
1440                         int temp = processIDS[i];
1441                         wait(&temp);
1442                 }
1443         
1444                 //get data created by processes
1445                 for (int i=0;i<processors;i++) { 
1446                         ifstream in;
1447                         string s = toString(processIDS[i]) + ".temp";
1448                         openInputFile(s, in);
1449                 
1450                         //get pairs
1451                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1452                                 varRef[k].resize(windows[k].size());
1453                                 sdRef[k].resize(windows[k].size());
1454                                 varQuery[k].resize(windows[k].size());
1455                                 sdQuery[k].resize(windows[k].size());
1456                                                                 
1457                                 for (int j = 0; j < windows[k].size(); j++) {
1458                                         in >> varRef[k][j];
1459                                 }
1460                                 gobble(in);
1461                                 for (int j = 0; j < windows[k].size(); j++) {
1462                                         in >> sdRef[k][j];
1463                                 }
1464                                 gobble(in);
1465                                 for (int j = 0; j < windows[k].size(); j++) {
1466                                         in >> varQuery[k][j];
1467                                 }
1468                                 gobble(in);
1469                                 for (int j = 0; j < windows[k].size(); j++) {
1470                                         in >> sdQuery[k][j];
1471                                 }
1472                                 gobble(in);
1473                         }
1474                         
1475                         in.close();
1476                         remove(s.c_str());
1477                 }
1478 #else
1479                         //find the averages for each querys references 
1480                         for (int i = 0; i < querySeqs.size(); i++) {
1481                                 findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1482                         }
1483                         
1484                         //find the averages for the query 
1485                         for (int i = 0; i < querySeqs.size(); i++) {
1486                                 findVarianceQuery(i);  //fills v arQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1487                         }
1488 #endif          
1489         }
1490         catch(exception& e) {
1491                 errorOut(e, "Ccode", "createProcessesVariances");
1492                 exit(1);
1493         }
1494 }
1495 //***************************************************************************************************************
1496 void Ccode::createProcessesDetermine() {
1497         try {
1498         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1499                 int process = 0;
1500                 vector<int> processIDS;
1501                 
1502                 //loop through and create all the processes you want
1503                 while (process != processors) {
1504                         int pid = fork();
1505                         
1506                         if (pid > 0) {
1507                                 processIDS.push_back(pid);  
1508                                 process++;
1509                         }else if (pid == 0){
1510                                                         
1511                                 //find the averages for each querys references 
1512                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1513                                         determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. 
1514                                 }
1515
1516                                 //write out data to file so parent can read it
1517                                 ofstream out;
1518                                 string s = toString(getpid()) + ".temp";
1519                                 openOutputFile(s, out);
1520                                 
1521                                 //output pairs
1522                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1523                                         for (int j = 0; j < windows[i].size(); j++) {
1524                                                 out << anova[i][j] << '\t';
1525                                         }
1526                                         out << endl;
1527                                         for (int j = 0; j < windows[i].size(); j++) {
1528                                                 out << isChimericConfidence[i][j] << '\t';
1529                                         }
1530                                         out << endl;
1531                                         for (int j = 0; j < windows[i].size(); j++) {
1532                                                 out << isChimericTStudent[i][j] << '\t';
1533                                         }
1534                                         out << endl;
1535                                         for (int j = 0; j < windows[i].size(); j++) {
1536                                                 out << isChimericANOVA[i][j] << '\t';
1537                                         }
1538                                         out << endl;
1539                                 }
1540
1541                                 out.close();
1542                                 
1543                                 exit(0);
1544                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1545                 }
1546                 
1547                 //force parent to wait until all the processes are done
1548                 for (int i=0;i<processors;i++) { 
1549                         int temp = processIDS[i];
1550                         wait(&temp);
1551                 }
1552         
1553                 //get data created by processes
1554                 for (int i=0;i<processors;i++) { 
1555                         ifstream in;
1556                         string s = toString(processIDS[i]) + ".temp";
1557                         openInputFile(s, in);
1558                 
1559                         //get pairs
1560                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1561                                 anova[k].resize(windows[k].size());
1562                                 isChimericConfidence[k].resize(windows[k].size(), false);
1563                                 isChimericTStudent[k].resize(windows[k].size(), false);
1564                                 isChimericANOVA[k].resize(windows[k].size(), false);
1565                                 int tempBool;
1566                                                                 
1567                                 for (int j = 0; j < windows[k].size(); j++) {
1568                                         in >> anova[k][j];
1569                                 }
1570                                 gobble(in);
1571                                 for (int j = 0; j < windows[k].size(); j++) {
1572                                         in >> tempBool;
1573                                         if (tempBool == 1) {  isChimericConfidence[k][j] = true;  }
1574                                 }
1575                                 gobble(in);
1576                                 for (int j = 0; j < windows[k].size(); j++) {
1577                                         in >> tempBool;
1578                                         if (tempBool == 1) {  isChimericTStudent[k][j] = true;  }
1579                                 }
1580                                 gobble(in);
1581                                 for (int j = 0; j < windows[k].size(); j++) {
1582                                         in >> tempBool;
1583                                         if (tempBool == 1) {  isChimericANOVA[k][j] = true;  }
1584                                 }
1585                                 gobble(in);
1586                         }
1587                         
1588                         in.close();
1589                         remove(s.c_str());
1590                 }
1591         #else
1592                         for (int i = 0; i < querySeqs.size(); i++) {
1593                                 determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i].
1594                         }
1595         #endif          
1596
1597         }
1598         catch(exception& e) {
1599                 errorOut(e, "Ccode", "createProcessesDetermine");
1600                 exit(1);
1601         }
1602 }
1603 //***************************************************************************************************************
1604
1605
1606