]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
finished with ccode, returned bellerophon to last save before move, cleaned up pintai...
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
17 //***************************************************************************************************************
18
19 Ccode::~Ccode() {
20         try {
21                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
22                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
23         }
24         catch(exception& e) {
25                 errorOut(e, "Ccode", "~Ccode");
26                 exit(1);
27         }
28 }       
29 //***************************************************************************************************************
30 void Ccode::print(ostream& out) {
31         try {
32                 
33                 mothurOutEndLine();
34                 
35                 string mapInfo = getRootName(fastafile) + "mapinfo";
36                 ofstream out2;
37                 openOutputFile(mapInfo, out2);
38                 
39                 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
40                 
41                 for (int j = 0; j < querySeqs.size(); j++) {
42                         out2 << querySeqs[j]->getName() << endl;
43                         for (it = spotMap[j].begin(); it!= spotMap[j].end(); it++) {
44                                 out2 << it->first << '\t' << it->second << endl;
45                         }
46                 }
47                 out2.close();
48                 
49                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
50                 
51                 out << "For full window mapping info refer to " << mapInfo << endl << endl;
52                 
53                 for (int i = 0; i < querySeqs.size(); i++) {
54                 
55                         out << querySeqs[i]->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
56                         
57                         for (int j = 0; j < closest[i].size(); j++) {
58                                 out << setprecision(3) << closest[i][j].seq->getName() << '\t' << closest[i][j].dist << endl;
59                         }
60                         out << endl << endl;
61                 
62                         //for each window
63                         //window mapping info.
64                         out << "Mapping information: ";
65                         //you mask and did not filter
66                         if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
67                                 
68                         //you filtered and did not mask
69                         if ((seqMask == "") && (filter)) { out << "filter and trim."; }
70                                 
71                         //you masked and filtered
72                         if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
73                         
74                         out << endl << "Window\tStartPos\tEndPos" << endl;
75                         it = trim[i].begin();
76                         
77                         for (int k = 0; k < windows[i].size()-1; k++) {
78                                 out << k+1 << '\t' << spotMap[i][windows[i][k]-it->first] << '\t' << spotMap[i][windows[i][k]-it->first+windowSizes[i]] << endl;
79                         }
80                         
81                         out << windows[i].size() << '\t' << spotMap[i][windows[i][windows[i].size()-1]-it->first] << '\t' << spotMap[i][it->second-it->first-1] << endl;
82                         out << endl;
83                         
84                         out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
85                         for (int k = 0; k < windows[i].size(); k++) {
86                                 float ds = averageQuery[i][k] / averageRef[i][k]; 
87                                 out << k+1 << '\t' << averageQuery[i][k] << '\t' << sdQuery[i][k] << '\t' << averageRef[i][k] << '\t'<< sdRef[i][k] << '\t' << ds << '\t' << anova[i][k] << endl;
88                         }
89                         out << endl;
90                         
91                         //varRef
92                         //varQuery
93                         /* F test for differences among variances.
94                         * varQuery is expected to be higher or similar than varRef */
95                         //float fs = varQuery[query][i] / varRef[query][i];     /* F-Snedecor, test for differences of variances */
96                         
97                         bool results = false;   
98                                                 
99                         //confidence limit, t - Student, anova
100                         out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
101                         
102                         for (int k = 0; k < windows[i].size(); k++) {
103                                 string temp = "";
104                                 if (isChimericConfidence[i][k]) {  temp += "*\t"; }
105                                 else { temp += "\t"; }
106                                 
107                                 if (isChimericTStudent[i][k]) {  temp += "*\t"; }
108                                 else { temp += "\t"; }
109                                 
110                                 if (isChimericANOVA[i][k]) {  temp += "*\t"; }
111                                 else { temp += "\t"; }
112                         
113                                 out << k+1 << '\t' << temp << endl;
114                                 
115                                 if (temp == "*\t*\t*\t") {  results = true;  }
116                         }
117                         out << endl;    
118                         
119                         if (results) {
120                                 mothurOut(querySeqs[i]->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
121                         }
122                 }
123         }
124         catch(exception& e) {
125                 errorOut(e, "Ccode", "print");
126                 exit(1);
127         }
128 }
129
130 //***************************************************************************************************************
131 void Ccode::getChimeras() {
132         try {
133                 
134                 //read in query sequences and subject sequences
135                 mothurOut("Reading sequences and template file... "); cout.flush();
136                 querySeqs = readSeqs(fastafile);
137                 templateSeqs = readSeqs(templateFile);
138                 mothurOut("Done."); mothurOutEndLine();
139                 
140                 int numSeqs = querySeqs.size();
141                 
142                 closest.resize(numSeqs);
143                 
144                 refCombo.resize(numSeqs, 0);
145                 sumRef.resize(numSeqs); 
146                 varRef.resize(numSeqs); 
147                 varQuery.resize(numSeqs); 
148                 sdRef.resize(numSeqs); 
149                 sdQuery.resize(numSeqs);     
150                 sumQuery.resize(numSeqs);
151                 sumSquaredRef.resize(numSeqs); 
152                 sumSquaredQuery.resize(numSeqs); 
153                 averageRef.resize(numSeqs);
154                 averageQuery.resize(numSeqs);
155                 anova.resize(numSeqs);
156                 isChimericConfidence.resize(numSeqs);
157                 isChimericTStudent.resize(numSeqs);
158                 isChimericANOVA.resize(numSeqs);
159                 trim.resize(numSeqs);
160                 spotMap.resize(numSeqs);
161                 windowSizes.resize(numSeqs, window);
162                 windows.resize(numSeqs);
163                 
164                 //break up file if needed
165                 int linesPerProcess = numSeqs / processors ;
166                 
167                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
168                         //find breakup of sequences for all times we will Parallelize
169                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
170                         else {
171                                 //fill line pairs
172                                 for (int i = 0; i < (processors-1); i++) {                      
173                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
174                                 }
175                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
176                                 int i = processors - 1;
177                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
178                         }
179                         
180                         //find breakup of templatefile for quantiles
181                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
182                         else { 
183                                 for (int i = 0; i < processors; i++) {
184                                         templateLines.push_back(new linePair());
185                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
186                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
187                                 }
188                         }
189                 #else
190                         lines.push_back(new linePair(0, numSeqs));
191                         templateLines.push_back(new linePair(0, templateSeqs.size()));
192                 #endif
193         
194                 distCalc = new eachGapDist();
195                 decalc = new DeCalculator();
196                 
197                 //find closest
198                 if (processors == 1) { 
199                         mothurOut("Finding top matches for sequences... "); cout.flush();
200                         closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
201                         mothurOut("Done."); mothurOutEndLine();
202                 }else {         createProcessesClosest();               }
203
204                 //initialize spotMap
205                 for (int j = 0; j < numSeqs; j++) {
206                         for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) {
207                                 spotMap[j][i] = i;
208                         }
209                 }
210                 
211                 //mask sequences if the user wants to 
212                 if (seqMask != "") {
213                         decalc->setMask(seqMask);
214                         
215                         //mask querys
216                         for (int i = 0; i < querySeqs.size(); i++) {
217                                 decalc->runMask(querySeqs[i]);
218                         }
219                 
220                         //mask templates
221                         for (int i = 0; i < templateSeqs.size(); i++) {
222                                 decalc->runMask(templateSeqs[i]);
223                         }
224                         
225                         for (int i = 0; i < numSeqs; i++) {
226                                 spotMap[i] = decalc->getMaskMap();
227                         }
228                 }
229                 
230                 if (filter) {
231                         vector<Sequence*> temp = templateSeqs;
232                         for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
233                         
234                         createFilter(temp);
235                 
236                         runFilter(querySeqs);
237                         runFilter(templateSeqs);
238                         
239                         //update spotMap
240                         map<int, int> newMap;
241                         int spot = 0;
242                         int j = 0;
243                         
244                         for (int i = 0; i < filterString.length(); i++) {
245                                 if (filterString[i] == '1') {
246                                         //add to newMap
247                                         newMap[spot] = spotMap[j][i];
248                                         spot++;  
249                                 }
250                         }
251                         
252                         for (int i = 0; i < numSeqs; i++) {
253                                 spotMap[i] = newMap;
254                         }
255                 }
256
257                 //trim sequences - this follows ccodes remove_extra_gaps 
258                 for (int i = 0; i < querySeqs.size(); i++) {
259                         trimSequences(i);
260                 }
261                 
262                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
263                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
264                 for (int i = 0; i < querySeqs.size(); i++) {
265                         windows[i] = findWindows(i);  
266                 }
267
268                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later 
269                 if (processors == 1) { 
270                         for (int i = 0; i < closest.size(); i++) {
271                                 removeBadReferenceSeqs(closest[i], i);
272                         }
273                 }else {         createProcessesRemoveBad();             }
274
275                 
276                 if (processors == 1) { 
277                         //find the averages for each querys references
278                         for (int i = 0; i < numSeqs; i++) {
279                                 getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
280                         }
281                         
282                         //find the averages for the query 
283                         for (int i = 0; i < numSeqs; i++) {
284                                 getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
285                         }
286                 }else {         createProcessesAverages();              }
287                 
288                 if (processors == 1) { 
289                         //find the averages for each querys references 
290                         for (int i = 0; i < numSeqs; i++) {
291                                 findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
292                         }
293                         
294                         //find the averages for the query 
295                         for (int i = 0; i < numSeqs; i++) {
296                                 findVarianceQuery(i);  //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
297                         }
298                 }else {         createProcessesVariances();             }
299                 
300                 if (processors == 1) { 
301                         for (int i = 0; i < numSeqs; i++) {
302                                 determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. 
303                         }
304                 }else {         createProcessesDetermine();             }
305                 
306                 //free memory
307                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
308                 for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
309                 delete distCalc;
310                 delete decalc;
311                         
312         }
313         catch(exception& e) {
314                 errorOut(e, "Ccode", "getChimeras");
315                 exit(1);
316         }
317 }
318 /***************************************************************************************************************/
319 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
320 void Ccode::trimSequences(int query) {
321         try {
322                 
323                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
324                 int rearPos = querySeqs[query]->getAligned().length();
325                 
326                 //********find first position in closest seqs that is a non gap character***********//
327                 //find first position all query seqs that is a non gap character
328                 for (int i = 0; i < closest[query].size(); i++) {
329                         
330                         string aligned = closest[query][i].seq->getAligned();
331                         int pos = 0;
332                         
333                         //find first spot in this seq
334                         for (int j = 0; j < aligned.length(); j++) {
335                                 if (isalpha(aligned[j])) {
336                                         pos = j;
337                                         break;
338                                 }
339                         }
340                         
341                         //save this spot if it is the farthest
342                         if (pos > frontPos) { frontPos = pos; }
343                 }
344                 
345                 //find first position all querySeq[query] that is a non gap character
346                 string aligned = querySeqs[query]->getAligned();
347                 int pos = 0;
348                         
349                 //find first spot in this seq
350                 for (int j = 0; j < aligned.length(); j++) {
351                         if (isalpha(aligned[j])) {
352                                 pos = j;
353                                 break;
354                         }
355                 }
356                 
357                 //save this spot if it is the farthest
358                 if (pos > frontPos) { frontPos = pos; }
359                 
360                 
361                 //********find last position in closest seqs that is a non gap character***********//
362                 for (int i = 0; i < closest[query].size(); i++) {
363                         
364                         string aligned = closest[query][i].seq->getAligned();
365                         int pos = aligned.length();
366                         
367                         //find first spot in this seq
368                         for (int j = aligned.length()-1; j >= 0; j--) {
369                                 if (isalpha(aligned[j])) {
370                                         pos = j;
371                                         break;
372                                 }
373                         }
374                         
375                         //save this spot if it is the farthest
376                         if (pos < rearPos) { rearPos = pos; }
377                 }
378                 
379                 //find last position all querySeqs[query] that is a non gap character
380                 aligned = querySeqs[query]->getAligned();
381                 pos = aligned.length();
382                 
383                 //find first spot in this seq
384                 for (int j = aligned.length()-1; j >= 0; j--) {
385                         if (isalpha(aligned[j])) {
386                                 pos = j;
387                                 break;
388                         }
389                 }
390                 
391                 //save this spot if it is the farthest
392                 if (pos < rearPos) { rearPos = pos; }
393
394                 
395                 //check to make sure that is not whole seq
396                 if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
397                 
398                 map<int, int> tempTrim;
399                 tempTrim[frontPos] = rearPos;
400                 
401                 //save trimmed locations
402                 trim[query] = tempTrim;
403                                                 
404                 //update spotMask
405                 map<int, int> newMap;
406                 int spot = 0;
407                 
408                 for (int i = frontPos; i < rearPos; i++) {
409                         //add to newMap
410 //cout << query << '\t' << i << '\t' << spotMap[query][i] << endl;
411                         newMap[spot] = spotMap[query][i];
412                         spot++;  
413                 }
414                 
415                 //for (it = newMap.begin(); it!=newMap.end(); it++) {
416                         //cout << query << '\t' << it->first << '\t' << it->second << endl;
417                 //}
418                 
419                 spotMap[query] = newMap;
420
421                 
422         }
423         catch(exception& e) {
424                 errorOut(e, "Ccode", "trimSequences");
425                 exit(1);
426         }
427
428 }
429 /***************************************************************************************************************/
430 vector<int> Ccode::findWindows(int query) {
431         try {
432                 
433                 vector<int> win; 
434                 it = trim[query].begin();
435                 
436                 int length = it->second - it->first;
437                 
438                 //default is wanted = 10% of total length
439                 if (windowSizes[query] > length) { 
440                         mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
441                         windowSizes[query] = length / 10;
442                 }else if (windowSizes[query] == 0) { windowSizes[query] = length / 10;  }
443                 else if (windowSizes[query] > (length / 20)) {
444                         mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
445                 }else if (windowSizes[query] < (length / 5)) {
446                         mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
447                 }
448                 
449                 //save starting points of each window
450                 for (int m = it->first;  m < (it->second-windowSizes[query]); m+=windowSizes[query]) {  win.push_back(m);  }
451                 
452                 //save last window
453                 if (win[win.size()-1] < (it->first+length)) {
454                         win.push_back(win[win.size()-1]+windowSizes[query]); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
455                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
456                 
457
458                 return win;
459         
460         }
461         catch(exception& e) {
462                 errorOut(e, "Ccode", "findWindows");
463                 exit(1);
464         }
465 }
466 //***************************************************************************************************************
467 int Ccode::getDiff(string seqA, string seqB) {
468         try {
469                 
470                 int numDiff = 0;
471                 
472                 for (int i = 0; i < seqA.length(); i++) {
473                         //if you are both not gaps
474                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
475                                 //are you different
476                                 if (seqA[i] != seqB[i]) { 
477                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
478
479                                         /* the char in base_a and base_b have been checked and they are different */
480                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
481                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
482                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
483                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
484                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
485                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
486                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
487                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
488                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
489                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
490                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
491                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
492                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
493                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
494                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
495                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
496                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
497                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
498                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
499                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
500                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
501                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
502                                         else ok = 0;  /* the bases are different and not equivalent */
503                                         
504                                         //check if they are both blanks
505                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
506                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
507                                         
508                                         if (ok == 0) {  numDiff++;  }
509                                 }
510                         //}
511                 }
512                 
513                 return numDiff;
514         
515         }
516         catch(exception& e) {
517                 errorOut(e, "Ccode", "getDiff");
518                 exit(1);
519         }
520 }
521 //***************************************************************************************************************
522 //tried to make this look most like ccode original implementation
523 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs, int query) {
524         try {
525                 
526                 vector< vector<int> > numDiffBases;
527                 numDiffBases.resize(seqs.size());
528                 //initialize to 0
529                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
530                 
531                 it = trim[query].begin();
532                 int length = it->second - it->first;
533                 
534                 //calc differences from each sequence to everyother seq in the set
535                 for (int i = 0; i < seqs.size(); i++) {
536                         
537                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
538                         
539                         //so you don't calc i to j and j to i since they are the same
540                         for (int j = 0; j < i; j++) {
541                                 
542                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
543                                 
544                                 //compare strings
545                                 int numDiff = getDiff(seqA, seqB);
546                                 
547                                 numDiffBases[i][j] = numDiff;
548                                 numDiffBases[j][i] = numDiff;
549                         }
550                 }
551                 
552                 //initailize remove to 0
553                 vector<int> remove;  remove.resize(seqs.size(), 0);
554                 float top = ((20*length) / (float) 100);
555                 float bottom = ((0.5*length) / (float) 100);
556                 
557                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
558                 for (int i = 0; i < numDiffBases.size(); i++) {
559                         for (int j = 0; j < i; j++) {   
560                                 //are you more than 20% different
561                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
562                                 //are you less than 0.5% different
563                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
564                         }
565                 }
566                 
567                 int numSeqsLeft = 0;
568                 
569                 //count seqs that are not going to be removed
570                 for (int i = 0; i < remove.size(); i++) {  
571                         if (remove[i] == 0)  { numSeqsLeft++;  }
572                 }
573                 
574                 //if you have enough then remove bad ones
575                 if (numSeqsLeft >= 3) {
576                         vector<SeqDist> goodSeqs;
577                         //remove bad seqs
578                         for (int i = 0; i < remove.size(); i++) {
579                                 if (remove[i] == 0) { 
580                                         goodSeqs.push_back(seqs[i]);
581                                 }
582                         }
583                         
584                         seqs = goodSeqs;
585                         
586                 }else { //warn, but dont remove any
587                         mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); mothurOutEndLine();  
588                 }
589                         
590
591         }
592         catch(exception& e) {
593                 errorOut(e, "Ccode", "removeBadReferenceSeqs");
594                 exit(1);
595         }
596 }
597 //***************************************************************************************************************
598 vector< vector<SeqDist> > Ccode::findClosest(int start, int end, int numWanted) {
599         try{
600         
601                 vector< vector<SeqDist> > topMatches;  topMatches.resize(querySeqs.size());
602         
603                 float smallestOverall, smallestLeft, smallestRight;
604                 smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
605                 
606                 //for each sequence in querySeqs - find top matches to use as reference
607                 for(int j = start; j < end; j++){
608                         
609                         Sequence query = *(querySeqs[j]);
610                         
611                         vector<SeqDist> distances;
612                         
613                         //calc distance to each sequence in template seqs
614                         for (int i = 0; i < templateSeqs.size(); i++) {
615                         
616                                 Sequence ref = *(templateSeqs[i]); 
617                                         
618                                 //find overall dist
619                                 distCalc->calcDist(query, ref);
620                                 float dist = distCalc->getDist();       
621                                 
622                                 //save distance
623                                 SeqDist temp;
624                                 temp.seq = templateSeqs[i];
625                                 temp.dist = dist;
626
627                                 distances.push_back(temp);
628                         }
629                         
630                         sort(distances.begin(), distances.end(), compareSeqDist);
631                         
632                         //save the number of top matches wanted
633                         for (int h = 0; h < numWanted; h++) {
634                                 topMatches[j].push_back(distances[h]);
635                         }
636                 }
637                         
638                 return topMatches;
639
640         }
641         catch(exception& e) {
642                 errorOut(e, "Ccode", "findClosestSides");
643                 exit(1);
644         }
645 }
646 /**************************************************************************************************/
647 //find the distances from each reference sequence to every other reference sequence for each window for this query
648 void Ccode::getAverageRef(vector<SeqDist> ref, int query) {
649         try {
650                 
651                 vector< vector< vector<int> > > diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
652                 
653                 //initialize diffs vector
654                 diffs.resize(ref.size());
655                 for (int i = 0; i < diffs.size(); i++) {  
656                         diffs[i].resize(ref.size());
657                         for (int j = 0; j < diffs[i].size(); j++) {
658                                 diffs[i][j].resize(windows[query].size(), 0);
659                         }
660                 }
661                 
662                 it = trim[query].begin();
663                                 
664                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
665                 for (int i = 0; i < ref.size(); i++) {
666                         
667                         string refI = ref[i].seq->getAligned();
668                         
669                         //j<i, so you don't find distances from i to j and then j to i.
670                         for (int j = 0; j < i; j++) {
671                         
672                                 string refJ = ref[j].seq->getAligned();
673                         
674                                 for (int k = 0; k < windows[query].size(); k++) {
675                                         
676                                         string refIWindowk, refJWindowk;
677                                         
678                                         if (k < windows[query].size()-1) {
679                                                 //get window strings
680                                                 refIWindowk = refI.substr(windows[query][k], windowSizes[query]);
681                                                 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);
682                                         }else { //last window may be smaller than rest - see findwindows
683                                                 //get window strings
684                                                 refIWindowk = refI.substr(windows[query][k], (it->second-windows[query][k]));
685                                                 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
686                                         }
687                                         
688                                         //find differences
689                                         int diff = getDiff(refIWindowk, refJWindowk);
690 //cout <<  i << '\t' << j << '\t' << k << '\t' << diff << endl;
691                                         //save differences in [i][j][k] and [j][i][k] since they are the same
692                                         diffs[i][j][k] = diff;
693                                         diffs[j][i][k] = diff;
694
695                                 }//k
696                                 
697                         }//j
698                 
699                 }//i
700                 
701                 //initialize sumRef for this query 
702                 sumRef[query].resize(windows[query].size(), 0);
703                 sumSquaredRef[query].resize(windows[query].size(), 0);
704                 averageRef[query].resize(windows[query].size(), 0);
705                 
706                 //find the sum of the differences for hte reference sequences
707                 for (int i = 0; i < diffs.size(); i++) {
708                         for (int j = 0; j < i; j++) {
709                         
710                                 //increment this querys reference sequences combos
711                                 refCombo[query]++;
712                                 
713                                 for (int k = 0; k < diffs[i][j].size(); k++) {
714                                         sumRef[query][k] += diffs[i][j][k];
715                                         sumSquaredRef[query][k] += (diffs[i][j][k]*diffs[i][j][k]);
716                                 }//k
717                                 
718                         }//j
719                 }//i
720
721                 
722                 //find the average of the differences for the references for each window
723                 for (int i = 0; i < windows[query].size(); i++) {
724                         averageRef[query][i] = sumRef[query][i] / (float) refCombo[query];
725                 }
726                 
727         }
728         catch(exception& e) {
729                 errorOut(e, "Ccode", "getAverageRef");
730                 exit(1);
731         }
732 }
733 /**************************************************************************************************/
734 void Ccode::getAverageQuery (vector<SeqDist> ref, int query) {
735         try {
736         
737                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
738                 
739                 //initialize diffs vector
740                 diffs.resize(ref.size());
741                 for (int j = 0; j < diffs.size(); j++) {
742                         diffs[j].resize(windows[query].size(), 0);
743                 }
744                 
745                 it = trim[query].begin();
746                                                         
747                 string refQuery = querySeqs[query]->getAligned();
748                          
749                 //j<i, so you don't find distances from i to j and then j to i.
750                 for (int j = 0; j < ref.size(); j++) {
751                          
752                          string refJ = ref[j].seq->getAligned();
753                          
754                          for (int k = 0; k < windows[query].size(); k++) {
755                                         
756                                         string QueryWindowk, refJWindowk;
757                                         
758                                         if (k < windows[query].size()-1) {
759                                                 //get window strings
760                                                 QueryWindowk = refQuery.substr(windows[query][k], windowSizes[query]);
761                                                 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);                                       
762                                         }else { //last window may be smaller than rest - see findwindows
763                                                 //get window strings
764                                                 QueryWindowk = refQuery.substr(windows[query][k], (it->second-windows[query][k]));
765                                                 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
766                                         }
767                                         
768                                         //find differences
769                                         int diff = getDiff(QueryWindowk, refJWindowk);
770 //cout  << j << '\t' << k << '\t' << diff << endl;                                              
771                                         //save differences 
772                                         diffs[j][k] = diff;
773                          
774                          }//k
775                 }//j
776                          
777                 
778                 //initialize sumRef for this query 
779                 sumQuery[query].resize(windows[query].size(), 0);
780                 sumSquaredQuery[query].resize(windows[query].size(), 0);
781                 averageQuery[query].resize(windows[query].size(), 0);
782                 
783                 //find the sum of the differences 
784                 for (int j = 0; j < diffs.size(); j++) {
785                         for (int k = 0; k < diffs[j].size(); k++) {
786                                 sumQuery[query][k] += diffs[j][k];
787                                 sumSquaredQuery[query][k] += (diffs[j][k]*diffs[j][k]);
788                         }//k
789                 }//j
790                 
791                 
792                 //find the average of the differences for the references for each window
793                 for (int i = 0; i < windows[query].size(); i++) {
794                         averageQuery[query][i] = sumQuery[query][i] / (float) ref.size();
795                 }
796
797         
798         }
799         catch(exception& e) {
800                 errorOut(e, "Ccode", "getAverageQuery");
801                 exit(1);
802         }
803 }
804 /**************************************************************************************************/
805 void Ccode::findVarianceRef (int query) {
806         try {
807                 
808                 varRef[query].resize(windows[query].size(), 0);
809                 sdRef[query].resize(windows[query].size(), 0);
810                 
811                 //for each window
812                 for (int i = 0; i < windows[query].size(); i++) {
813                         varRef[query][i] = (sumSquaredRef[query][i] - ((sumRef[query][i]*sumRef[query][i])/(float)refCombo[query])) / (float)(refCombo[query]-1);
814                         sdRef[query][i] = sqrt(varRef[query][i]);
815                         
816                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
817                         if (averageRef[query][i] < 0.001)                       {       averageRef[query][i] = 0.001;           }
818                         if (sumRef[query][i] < 0.001)                           {       sumRef[query][i] = 0.001;                       }
819                         if (varRef[query][i] < 0.001)                           {       varRef[query][i] = 0.001;                       }
820                         if (sumSquaredRef[query][i] < 0.001)            {       sumSquaredRef[query][i] = 0.001;        }
821                         if (sdRef[query][i] < 0.001)                            {       sdRef[query][i] = 0.001;                        }
822                                 
823                 }
824         }
825         catch(exception& e) {
826                 errorOut(e, "Ccode", "findVarianceRef");
827                 exit(1);
828         }
829 }
830 /**************************************************************************************************/
831 void Ccode::findVarianceQuery (int query) {
832         try {
833                 varQuery[query].resize(windows[query].size(), 0);
834                 sdQuery[query].resize(windows[query].size(), 0);
835                 
836                 //for each window
837                 for (int i = 0; i < windows[query].size(); i++) {
838                         varQuery[query][i] = (sumSquaredQuery[query][i] - ((sumQuery[query][i]*sumQuery[query][i])/(float) closest[query].size())) / (float) (closest[query].size()-1);
839                         sdQuery[query][i] = sqrt(varQuery[query][i]);
840                         
841                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
842                         if (averageQuery[query][i] < 0.001)                     {       averageQuery[query][i] = 0.001;         }
843                         if (sumQuery[query][i] < 0.001)                         {       sumQuery[query][i] = 0.001;                     }
844                         if (varQuery[query][i] < 0.001)                         {       varQuery[query][i] = 0.001;                     }
845                         if (sumSquaredQuery[query][i] < 0.001)          {       sumSquaredQuery[query][i] = 0.001;      }
846                         if (sdQuery[query][i] < 0.001)                          {       sdQuery[query][i] = 0.001;                      }
847                 }
848
849         }
850         catch(exception& e) {
851                 errorOut(e, "Ccode", "findVarianceQuery");
852                 exit(1);
853         }
854 }
855 /**************************************************************************************************/
856 void Ccode::determineChimeras (int query) {
857         try {
858                 
859                 isChimericConfidence[query].resize(windows[query].size(), false);
860                 isChimericTStudent[query].resize(windows[query].size(), false);
861                 isChimericANOVA[query].resize(windows[query].size(), false);
862                 anova[query].resize(windows[query].size());
863
864                 
865                 //for each window
866                 for (int i = 0; i < windows[query].size(); i++) {
867                 
868                         //get confidence limits
869                         float t = getT(closest[query].size()-1);  //how many seqs you are comparing to this querySeq
870                         float dsUpper = (averageQuery[query][i] + (t * sdQuery[query][i])) / averageRef[query][i]; 
871                         float dsLower = (averageQuery[query][i] - (t * sdQuery[query][i])) / averageRef[query][i]; 
872 //cout << t << '\t' << "ds upper = " << dsUpper << " dsLower = " << dsLower << endl;                    
873                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[query][i] > averageRef[query][i])) {  /* range does not include 1 */
874                                         isChimericConfidence[query][i] = true;   /* significantly higher at P<0.05 */ 
875 //cout << i << " made it here" << endl;
876                         }
877                         
878                         //student t test
879                         int degreeOfFreedom = refCombo[query] + closest[query].size() - 2;
880                         float denomForT = (((refCombo[query]-1) * varQuery[query][i] + (closest[query].size() - 1) * varRef[query][i]) / (float) degreeOfFreedom) * ((refCombo[query] + closest[query].size()) / (float) (refCombo[query] * closest[query].size()));    /* denominator, without sqrt(), for ts calculations */
881                                 
882                         float ts = fabs((averageQuery[query][i] - averageRef[query][i]) / (sqrt(denomForT)));  /* value of ts for t-student test */     
883                         t = getT(degreeOfFreedom);
884 //cout << i << '\t' << t << '\t' << ts << endl;                 
885                         if ((ts >= t) && (averageQuery[query][i] > averageRef[query][i])) {   
886                                 isChimericTStudent[query][i] = true;   /* significantly higher at P<0.05 */ 
887                         }
888                 
889                         //anova test
890                         float value1 = sumQuery[query][i] + sumRef[query][i];
891                         float value2 = sumSquaredQuery[query][i] + sumSquaredRef[query][i];
892                         float value3 = ((sumQuery[query][i]*sumQuery[query][i]) / (float) (closest[query].size())) + ((sumRef[query][i] * sumRef[query][i]) / (float) refCombo[query]);
893                         float value4 = (value1 * value1) / ( (float) (closest[query].size() + refCombo[query]) );
894                         float value5 = value2 - value4;
895                         float value6 = value3 - value4;
896                         float value7 = value5 - value6;
897                         float value8 = value7 / ((float) degreeOfFreedom);
898                         float anovaValue = value6 / value8;
899                         
900                         float f = getF(degreeOfFreedom);
901                         
902                          if ((anovaValue >= f) && (averageQuery[query][i] > averageRef[query][i]))  {
903                        isChimericANOVA[query][i] = true;   /* significant P<0.05 */
904                 }
905                         
906                         if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
907                         
908                         anova[query][i] = anovaValue;
909                 }
910                 
911         }
912         catch(exception& e) {
913                 errorOut(e, "Ccode", "determineChimeras");
914                 exit(1);
915         }
916 }
917 /**************************************************************************************************/    
918 float Ccode::getT(int numseq) {
919         try {
920         
921                 float tvalue = 0;
922                 
923                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
924                 if (numseq > 120) tvalue = 1.645;
925                 else if (numseq > 60) tvalue = 1.658;
926         else if (numseq > 40) tvalue = 1.671;
927         else if (numseq > 30) tvalue = 1.684;
928         else if (numseq > 29) tvalue = 1.697;
929         else if (numseq > 28) tvalue = 1.699;
930         else if (numseq > 27) tvalue = 1.701;
931         else if (numseq > 26) tvalue = 1.703;
932         else if (numseq > 25) tvalue = 1.706;
933         else if (numseq > 24) tvalue = 1.708;
934         else if (numseq > 23) tvalue = 1.711;
935         else if (numseq > 22) tvalue = 1.714;
936         else if (numseq > 21) tvalue = 1.717;
937         else if (numseq > 20) tvalue = 1.721;
938         else if (numseq > 19) tvalue = 1.725;
939         else if (numseq > 18) tvalue = 1.729;
940         else if (numseq > 17) tvalue = 1.734;
941         else if (numseq > 16) tvalue = 1.740;
942         else if (numseq > 15) tvalue = 1.746;
943         else if (numseq > 14) tvalue = 1.753;
944         else if (numseq > 13) tvalue = 1.761;
945         else if (numseq > 12) tvalue = 1.771;
946         else if (numseq > 11) tvalue = 1.782;
947         else if (numseq > 10) tvalue = 1.796;
948         else if (numseq > 9) tvalue = 1.812;
949         else if (numseq > 8) tvalue = 1.833;
950         else if (numseq > 7) tvalue = 1.860;
951         else if (numseq > 6) tvalue = 1.895;
952         else if (numseq > 5) tvalue = 1.943;
953         else if (numseq > 4) tvalue = 2.015;
954         else if (numseq > 3) tvalue = 2.132;
955         else if (numseq > 2) tvalue = 2.353;
956         else if (numseq > 1) tvalue = 2.920;
957                 else if (numseq <= 1) {
958                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
959                 }
960                 
961                 return tvalue;
962         }
963         catch(exception& e) {
964                 errorOut(e, "Ccode", "getT");
965                 exit(1);
966         }
967 }
968 /**************************************************************************************************/    
969 float Ccode::getF(int numseq) {
970         try {
971         
972                 float fvalue = 0;
973                 
974                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
975         if (numseq > 120) fvalue = 3.84;
976         else if (numseq > 60) fvalue = 3.92;
977         else if (numseq > 40) fvalue = 4.00;
978         else if (numseq > 30) fvalue = 4.08;
979         else if (numseq > 29) fvalue = 4.17;
980         else if (numseq > 28) fvalue = 4.18;
981         else if (numseq > 27) fvalue = 4.20;
982         else if (numseq > 26) fvalue = 4.21;
983         else if (numseq > 25) fvalue = 4.23;
984         else if (numseq > 24) fvalue = 4.24;
985         else if (numseq > 23) fvalue = 4.26;
986         else if (numseq > 22) fvalue = 4.28;
987         else if (numseq > 21) fvalue = 4.30;
988         else if (numseq > 20) fvalue = 4.32;
989         else if (numseq > 19) fvalue = 4.35;
990         else if (numseq > 18) fvalue = 4.38;
991         else if (numseq > 17) fvalue = 4.41;
992         else if (numseq > 16) fvalue = 4.45;
993         else if (numseq > 15) fvalue = 4.49;
994         else if (numseq > 14) fvalue = 4.54;
995         else if (numseq > 13) fvalue = 4.60;
996         else if (numseq > 12) fvalue = 4.67;
997         else if (numseq > 11) fvalue = 4.75;
998         else if (numseq > 10) fvalue = 4.84;
999         else if (numseq > 9) fvalue = 4.96;
1000         else if (numseq > 8) fvalue = 5.12;
1001         else if (numseq > 7) fvalue = 5.32;
1002         else if (numseq > 6) fvalue = 5.59;
1003         else if (numseq > 5) fvalue = 5.99;
1004         else if (numseq > 4) fvalue = 6.61;
1005         else if (numseq > 3) fvalue = 7.71;
1006         else if (numseq > 2) fvalue = 10.1;
1007         else if (numseq > 1) fvalue = 18.5;
1008         else if (numseq > 0) fvalue = 161;
1009                 else if (numseq <= 0) {
1010                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
1011         }
1012                 
1013                 return fvalue;
1014         }
1015         catch(exception& e) {
1016                 errorOut(e, "Ccode", "getF");
1017                 exit(1);
1018         }
1019 }
1020
1021 /**************************************************************************************************/
1022 void Ccode::createProcessesClosest() {
1023         try {
1024 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1025                 int process = 0;
1026                 vector<int> processIDS;
1027                 
1028                 //loop through and create all the processes you want
1029                 while (process != processors) {
1030                         int pid = fork();
1031                         
1032                         if (pid > 0) {
1033                                 processIDS.push_back(pid);  
1034                                 process++;
1035                         }else if (pid == 0){
1036                                 
1037                                 mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1038                                 closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
1039                                 mothurOut("Done finding top matches for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1040                                 
1041                                 //write out data to file so parent can read it
1042                                 ofstream out;
1043                                 string s = toString(getpid()) + ".temp";
1044                                 openOutputFile(s, out);
1045                                                         
1046                                 //output pairs
1047                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1048                                          for (int j = 0; j < closest[i].size(); j++) {
1049                                                 closest[i][j].seq->printSequence(out);
1050                                          }
1051                                 }
1052                                 out << ">" << endl; //to stop sequence read
1053                                 
1054                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1055                                          for (int j = 0; j < closest[i].size(); j++) {
1056                                                 out << closest[i][j].dist << '\t';
1057                                          }
1058                                          out << endl;
1059                                 }
1060                                 out.close();
1061                                 exit(0);
1062                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1063                 }
1064                 
1065                 //force parent to wait until all the processes are done
1066                 for (int i=0;i<processors;i++) { 
1067                         int temp = processIDS[i];
1068                         wait(&temp);
1069                 }
1070                 
1071                 //get data created by processes
1072                 for (int i=0;i<processors;i++) { 
1073                         ifstream in;
1074                         string s = toString(processIDS[i]) + ".temp";
1075                         openInputFile(s, in);
1076                         
1077                         vector< vector<Sequence*> > tempClosest;  tempClosest.resize(querySeqs.size());
1078                         //get pairs
1079                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1080                                 vector<Sequence*> tempVector;
1081         
1082                                 for (int j = 0; j < numWanted; j++) {
1083                                 
1084                                         Sequence* temp = new Sequence(in);
1085                                         gobble(in);
1086                                                 
1087                                         tempVector.push_back(temp);
1088                                 }
1089                                 
1090                                 tempClosest[k] = tempVector;
1091                         }
1092                         
1093                         string junk;
1094                         in >> junk;  gobble(in);  // to get ">"
1095                 
1096                         vector< vector<float> > dists; dists.resize(querySeqs.size());
1097                         
1098                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1099                                 dists[k].resize(numWanted);
1100                                 for (int j = 0; j < numWanted; j++) {
1101                                         in >> dists[k][j];
1102                                 }
1103                                 gobble(in);
1104                         
1105                         } 
1106
1107                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1108                                 closest[k].resize(numWanted);
1109                                 for (int j = 0; j < closest[k].size(); j++) {
1110                                         closest[k][j].seq = tempClosest[k][j];
1111                                         closest[k][j].dist = dists[k][j]; 
1112                                 }
1113                         } 
1114
1115                         in.close();
1116                         remove(s.c_str());
1117                 }
1118                         
1119         
1120 #else
1121                 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
1122 #endif  
1123
1124         }
1125         catch(exception& e) {
1126                 errorOut(e, "Ccode", "createProcessesClosest");
1127                 exit(1);
1128         }
1129 }
1130
1131 //***************************************************************************************************************
1132 void Ccode::createProcessesRemoveBad() {
1133         try {
1134 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1135                 int process = 0;
1136                 vector<int> processIDS;
1137                 
1138                 //loop through and create all the processes you want
1139                 while (process != processors) {
1140                         int pid = fork();
1141                         
1142                         if (pid > 0) {
1143                                 processIDS.push_back(pid);  
1144                                 process++;
1145                         }else if (pid == 0){
1146                                                         
1147                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1148                                         removeBadReferenceSeqs(closest[i], i);
1149                                 }
1150                                 
1151                                 //write out data to file so parent can read it
1152                                 ofstream out;
1153                                 string s = toString(getpid()) + ".temp";
1154                                 openOutputFile(s, out);
1155                                 
1156                                 //output pairs
1157                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1158                                         out << closest[i].size() << endl;
1159                                          for (int j = 0; j < closest[i].size(); j++) {
1160                                                 closest[i][j].seq->printSequence(out);
1161                                          }
1162                                          out << ">" << endl; //to stop sequence read
1163                                 }
1164                                 
1165                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1166                                          for (int j = 0; j < closest[i].size(); j++) {
1167                                                 out << closest[i][j].dist << '\t';
1168                                          }
1169                                          out << endl;
1170                                 }
1171
1172                                 out.close();
1173                                 
1174                                 exit(0);
1175                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1176                 }
1177                 
1178                 //force parent to wait until all the processes are done
1179                 for (int i=0;i<processors;i++) { 
1180                         int temp = processIDS[i];
1181                         wait(&temp);
1182                 }
1183         
1184                 //get data created by processes
1185                 for (int i=0;i<processors;i++) { 
1186                         ifstream in;
1187                         string s = toString(processIDS[i]) + ".temp";
1188                         openInputFile(s, in);
1189                 
1190                         vector< vector<Sequence*> > tempClosest;  tempClosest.resize(querySeqs.size());
1191                         vector<int> sizes; 
1192                         //get pairs
1193                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1194                 
1195                                 int num;
1196                                 in >> num; 
1197                                 sizes.push_back(num);
1198                                 gobble(in);
1199                                 
1200                                 vector<Sequence*> tempVector;
1201                                 
1202                                 for (int j = 0; j < num; j++) {
1203                                 
1204                                         Sequence* temp = new Sequence(in);
1205                                         gobble(in);
1206                                                 
1207                                         tempVector.push_back(temp);
1208                                 }
1209                                 string junk;
1210                                 in >> junk;  gobble(in);  // to get ">"
1211
1212                                 tempClosest[k] = tempVector;
1213                         }
1214                         
1215                         vector< vector<float> > dists; dists.resize(querySeqs.size());
1216                         int count = 0;
1217                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1218                                 dists[k].resize(sizes[count]);
1219                                 for (int j = 0; j < sizes[count]; j++) {
1220                                         in >> dists[k][j];
1221                                 }
1222                                 gobble(in);
1223                                 count++;
1224                         } 
1225                         
1226                         count = 0;
1227                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1228                                 for (int j = 0; j < sizes[count]; j++) {
1229                                         closest[k][j].seq = tempClosest[k][j];
1230                                         closest[k][j].dist = dists[k][j]; 
1231                                 }
1232                                 count++;
1233                         } 
1234
1235                         in.close();
1236                         remove(s.c_str());
1237                 }
1238 #else
1239                 for (int i = 0; i < closest.size(); i++) {
1240                         removeBadReferenceSeqs(closest[i], i);
1241                 }
1242 #endif          
1243         
1244         }
1245         catch(exception& e) {
1246                 errorOut(e, "Ccode", "createProcessesRemoveBad");
1247                 exit(1);
1248         }
1249 }
1250 //***************************************************************************************************************
1251 void Ccode::createProcessesAverages() {
1252         try {
1253         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1254                 int process = 0;
1255                 vector<int> processIDS;
1256                 
1257                 //loop through and create all the processes you want
1258                 while (process != processors) {
1259                         int pid = fork();
1260                         
1261                         if (pid > 0) {
1262                                 processIDS.push_back(pid);  
1263                                 process++;
1264                         }else if (pid == 0){
1265                                                         
1266                                 //find the averages for each querys references 
1267                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1268                                         getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
1269                                 }
1270                                 
1271                                 //find the averages for the query 
1272                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1273                                         getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
1274                                 }
1275                                 
1276                                 
1277                                 //write out data to file so parent can read it
1278                                 ofstream out;
1279                                 string s = toString(getpid()) + ".temp";
1280                                 openOutputFile(s, out);
1281                                 
1282                                 //output pairs
1283                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1284                                         for (int j = 0; j < windows[i].size(); j++) {
1285                                                 out << sumRef[i][j] << '\t';
1286                                         }
1287                                         out << endl;
1288                                         for (int j = 0; j < windows[i].size(); j++) {
1289                                                 out << averageRef[i][j] << '\t';
1290                                         }
1291                                         out << endl;
1292                                         for (int j = 0; j < windows[i].size(); j++) {
1293                                                 out << sumSquaredRef[i][j] << '\t';
1294                                         }
1295                                         out << endl;
1296                                         for (int j = 0; j < windows[i].size(); j++) {
1297                                                 out << sumQuery[i][j] << '\t';
1298                                         }
1299                                         out << endl;
1300                                         for (int j = 0; j < windows[i].size(); j++) {
1301                                                 out << averageQuery[i][j] << '\t';
1302                                         }
1303                                         out << endl;
1304                                         for (int j = 0; j < windows[i].size(); j++) {
1305                                                 out << sumSquaredQuery[i][j] << '\t';
1306                                         }
1307                                         out << endl;
1308                                         out << refCombo[i] << endl;
1309                                 }
1310
1311                                 out.close();
1312                                 
1313                                 exit(0);
1314                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1315                 }
1316                 
1317                 //force parent to wait until all the processes are done
1318                 for (int i=0;i<processors;i++) { 
1319                         int temp = processIDS[i];
1320                         wait(&temp);
1321                 }
1322         
1323                 //get data created by processes
1324                 for (int i=0;i<processors;i++) { 
1325                         ifstream in;
1326                         string s = toString(processIDS[i]) + ".temp";
1327                         openInputFile(s, in);
1328                 
1329                         //get pairs
1330                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1331                                 sumRef[k].resize(windows[k].size());
1332                                 averageRef[k].resize(windows[k].size());
1333                                 sumSquaredRef[k].resize(windows[k].size());
1334                                 averageQuery[k].resize(windows[k].size());
1335                                 sumQuery[k].resize(windows[k].size());
1336                                 sumSquaredQuery[k].resize(windows[k].size());
1337                                 
1338                                 for (int j = 0; j < windows[k].size(); j++) {
1339                                         in >> sumRef[k][j];
1340                                 }
1341                                 gobble(in);
1342                                 for (int j = 0; j < windows[k].size(); j++) {
1343                                         in >> averageRef[k][j];
1344                                 }
1345                                 gobble(in);
1346                                 for (int j = 0; j < windows[k].size(); j++) {
1347                                         in >> sumSquaredRef[k][j];
1348                                 }
1349                                 gobble(in);
1350                                 for (int j = 0; j < windows[k].size(); j++) {
1351                                         in >> sumQuery[k][j];
1352                                 }
1353                                 gobble(in);
1354                                 for (int j = 0; j < windows[k].size(); j++) {
1355                                         in >> averageQuery[k][j];
1356                                 }
1357                                 gobble(in);
1358                                 for (int j = 0; j < windows[k].size(); j++) {
1359                                         in >> sumSquaredQuery[k][j];
1360                                 }
1361                                 gobble(in);
1362                                 in >> refCombo[k];
1363                                 gobble(in);
1364                         }
1365                         
1366                         in.close();
1367                         remove(s.c_str());
1368                 }
1369 #else
1370                 //find the averages for each querys references 
1371                 for (int i = 0; i < querySeqs.size(); i++) {
1372                         getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
1373                 }
1374         
1375                 //find the averages for the query 
1376                 for (int i = 0; i < querySeqs.size(); i++) {
1377                         getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
1378                 }
1379
1380 #endif          
1381         
1382         }
1383         catch(exception& e) {
1384                 errorOut(e, "Ccode", "createProcessesAverages");
1385                 exit(1);
1386         }
1387 }
1388 //***************************************************************************************************************
1389 void Ccode::createProcessesVariances() {
1390         try {
1391         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1392                 int process = 0;
1393                 vector<int> processIDS;
1394                 
1395                 //loop through and create all the processes you want
1396                 while (process != processors) {
1397                         int pid = fork();
1398                         
1399                         if (pid > 0) {
1400                                 processIDS.push_back(pid);  
1401                                 process++;
1402                         }else if (pid == 0){
1403                                                         
1404                                 //find the averages for each querys references 
1405                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1406                                         findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1407                                 }
1408                                 
1409                                 //find the averages for the query 
1410                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1411                                         findVarianceQuery(i);  //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1412                                 }
1413                                 
1414                                 
1415                                 //write out data to file so parent can read it
1416                                 ofstream out;
1417                                 string s = toString(getpid()) + ".temp";
1418                                 openOutputFile(s, out);
1419                                 
1420                                 //output pairs
1421                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1422                                         for (int j = 0; j < windows[i].size(); j++) {
1423                                                 out << varRef[i][j] << '\t';
1424                                         }
1425                                         out << endl;
1426                                         for (int j = 0; j < windows[i].size(); j++) {
1427                                                 out << sdRef[i][j] << '\t';
1428                                         }
1429                                         out << endl;
1430                                         for (int j = 0; j < windows[i].size(); j++) {
1431                                                 out << varQuery[i][j] << '\t';
1432                                         }
1433                                         out << endl;
1434                                         for (int j = 0; j < windows[i].size(); j++) {
1435                                                 out << sdQuery[i][j] << '\t';
1436                                         }
1437                                         out << endl;
1438                                 }
1439
1440                                 out.close();
1441                                 
1442                                 exit(0);
1443                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1444                 }
1445                 
1446                 //force parent to wait until all the processes are done
1447                 for (int i=0;i<processors;i++) { 
1448                         int temp = processIDS[i];
1449                         wait(&temp);
1450                 }
1451         
1452                 //get data created by processes
1453                 for (int i=0;i<processors;i++) { 
1454                         ifstream in;
1455                         string s = toString(processIDS[i]) + ".temp";
1456                         openInputFile(s, in);
1457                 
1458                         //get pairs
1459                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1460                                 varRef[k].resize(windows[k].size());
1461                                 sdRef[k].resize(windows[k].size());
1462                                 varQuery[k].resize(windows[k].size());
1463                                 sdQuery[k].resize(windows[k].size());
1464                                                                 
1465                                 for (int j = 0; j < windows[k].size(); j++) {
1466                                         in >> varRef[k][j];
1467                                 }
1468                                 gobble(in);
1469                                 for (int j = 0; j < windows[k].size(); j++) {
1470                                         in >> sdRef[k][j];
1471                                 }
1472                                 gobble(in);
1473                                 for (int j = 0; j < windows[k].size(); j++) {
1474                                         in >> varQuery[k][j];
1475                                 }
1476                                 gobble(in);
1477                                 for (int j = 0; j < windows[k].size(); j++) {
1478                                         in >> sdQuery[k][j];
1479                                 }
1480                                 gobble(in);
1481                         }
1482                         
1483                         in.close();
1484                         remove(s.c_str());
1485                 }
1486 #else
1487                         //find the averages for each querys references 
1488                         for (int i = 0; i < querySeqs.size(); i++) {
1489                                 findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1490                         }
1491                         
1492                         //find the averages for the query 
1493                         for (int i = 0; i < querySeqs.size(); i++) {
1494                                 findVarianceQuery(i);  //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
1495                         }
1496 #endif          
1497         }
1498         catch(exception& e) {
1499                 errorOut(e, "Ccode", "createProcessesVariances");
1500                 exit(1);
1501         }
1502 }
1503 //***************************************************************************************************************
1504 void Ccode::createProcessesDetermine() {
1505         try {
1506         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1507                 int process = 0;
1508                 vector<int> processIDS;
1509                 
1510                 //loop through and create all the processes you want
1511                 while (process != processors) {
1512                         int pid = fork();
1513                         
1514                         if (pid > 0) {
1515                                 processIDS.push_back(pid);  
1516                                 process++;
1517                         }else if (pid == 0){
1518                                                         
1519                                 //find the averages for each querys references 
1520                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1521                                         determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. 
1522                                 }
1523
1524                                 //write out data to file so parent can read it
1525                                 ofstream out;
1526                                 string s = toString(getpid()) + ".temp";
1527                                 openOutputFile(s, out);
1528                                 
1529                                 //output pairs
1530                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1531                                         for (int j = 0; j < windows[i].size(); j++) {
1532                                                 out << anova[i][j] << '\t';
1533                                         }
1534                                         out << endl;
1535                                         for (int j = 0; j < windows[i].size(); j++) {
1536                                                 out << isChimericConfidence[i][j] << '\t';
1537                                         }
1538                                         out << endl;
1539                                         for (int j = 0; j < windows[i].size(); j++) {
1540                                                 out << isChimericTStudent[i][j] << '\t';
1541                                         }
1542                                         out << endl;
1543                                         for (int j = 0; j < windows[i].size(); j++) {
1544                                                 out << isChimericANOVA[i][j] << '\t';
1545                                         }
1546                                         out << endl;
1547                                 }
1548
1549                                 out.close();
1550                                 
1551                                 exit(0);
1552                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1553                 }
1554                 
1555                 //force parent to wait until all the processes are done
1556                 for (int i=0;i<processors;i++) { 
1557                         int temp = processIDS[i];
1558                         wait(&temp);
1559                 }
1560         
1561                 //get data created by processes
1562                 for (int i=0;i<processors;i++) { 
1563                         ifstream in;
1564                         string s = toString(processIDS[i]) + ".temp";
1565                         openInputFile(s, in);
1566                 
1567                         //get pairs
1568                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1569                                 anova[k].resize(windows[k].size());
1570                                 isChimericConfidence[k].resize(windows[k].size(), false);
1571                                 isChimericTStudent[k].resize(windows[k].size(), false);
1572                                 isChimericANOVA[k].resize(windows[k].size(), false);
1573                                 int tempBool;
1574                                                                 
1575                                 for (int j = 0; j < windows[k].size(); j++) {
1576                                         in >> anova[k][j];
1577                                 }
1578                                 gobble(in);
1579                                 for (int j = 0; j < windows[k].size(); j++) {
1580                                         in >> tempBool;
1581                                         if (tempBool == 1) {  isChimericConfidence[k][j] = true;  }
1582                                 }
1583                                 gobble(in);
1584                                 for (int j = 0; j < windows[k].size(); j++) {
1585                                         in >> tempBool;
1586                                         if (tempBool == 1) {  isChimericTStudent[k][j] = true;  }
1587                                 }
1588                                 gobble(in);
1589                                 for (int j = 0; j < windows[k].size(); j++) {
1590                                         in >> tempBool;
1591                                         if (tempBool == 1) {  isChimericANOVA[k][j] = true;  }
1592                                 }
1593                                 gobble(in);
1594                         }
1595                         
1596                         in.close();
1597                         remove(s.c_str());
1598                 }
1599         #else
1600                         for (int i = 0; i < querySeqs.size(); i++) {
1601                                 determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i].
1602                         }
1603         #endif          
1604
1605         }
1606         catch(exception& e) {
1607                 errorOut(e, "Ccode", "createProcessesDetermine");
1608                 exit(1);
1609         }
1610 }
1611 //***************************************************************************************************************
1612
1613
1614