]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
changed reporting in ccode to only report those who are found to be chimeric by all...
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
17 //***************************************************************************************************************
18
19 Ccode::~Ccode() {
20         try {
21                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
22                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
23                 delete distCalc;
24         }
25         catch(exception& e) {
26                 errorOut(e, "Ccode", "~Ccode");
27                 exit(1);
28         }
29 }       
30 //***************************************************************************************************************
31 void Ccode::print(ostream& out) {
32         try {
33                 
34                 mothurOutEndLine();
35                 
36                 string mapInfo = getRootName(fastafile) + "mapinfo";
37                 ofstream out2;
38                 openOutputFile(mapInfo, out2);
39                 
40                 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
41                 
42                 for (int j = 0; j < querySeqs.size(); j++) {
43                         out2 << querySeqs[j]->getName() << endl;
44                         for (it = spotMap[j].begin(); it!= spotMap[j].end(); it++) {
45                                 out2 << it->first << '\t' << it->second << endl;
46                         }
47                 }
48                 out2.close();
49                 
50                 out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
51                 
52                 out << "For full window mapping info refer to " << mapInfo << endl << endl;
53                 
54                 for (int i = 0; i < querySeqs.size(); i++) {
55                 
56                         out << querySeqs[i]->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
57                         
58                         for (int j = 0; j < closest[i].size(); j++) {
59                                 out << setprecision(3) << closest[i][j].seq->getName() << '\t' << closest[i][j].dist << endl;
60                         }
61                         out << endl << endl;
62                 
63                         //for each window
64                         //window mapping info.
65                         out << "Mapping information: ";
66                         //you mask and did not filter
67                         if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
68                                 
69                         //you filtered and did not mask
70                         if ((seqMask == "") && (filter)) { out << "filter and trim."; }
71                                 
72                         //you masked and filtered
73                         if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
74                         
75                         out << endl << "Window\tStartPos\tEndPos" << endl;
76                         it = trim[i].begin();
77                         
78                         for (int k = 0; k < windows[i].size()-1; k++) {
79                                 out << k+1 << '\t' << spotMap[i][windows[i][k]-it->first] << '\t' << spotMap[i][windows[i][k]-it->first+windowSizes[i]] << endl;
80                         }
81                         
82                         out << windows[i].size() << '\t' << spotMap[i][windows[i][windows[i].size()-1]-it->first] << '\t' << spotMap[i][it->second-it->first-1] << endl;
83                         out << endl;
84                         
85                         out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
86                         for (int k = 0; k < windows[i].size(); k++) {
87                                 float ds = averageQuery[i][k] / averageRef[i][k]; 
88                                 out << k+1 << '\t' << averageQuery[i][k] << '\t' << sdQuery[i][k] << '\t' << averageRef[i][k] << '\t'<< sdRef[i][k] << '\t' << ds << '\t' << anova[i][k] << endl;
89                         }
90                         out << endl;
91                         
92                         //varRef
93                         //varQuery
94                         /* F test for differences among variances.
95                         * varQuery is expected to be higher or similar than varRef */
96                         //float fs = varQuery[query][i] / varRef[query][i];     /* F-Snedecor, test for differences of variances */
97                         
98                         bool results = false;   
99                                                 
100                         //confidence limit, t - Student, anova
101                         out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
102                         
103                         for (int k = 0; k < windows[i].size(); k++) {
104                                 string temp = "";
105                                 if (isChimericConfidence[i][k]) {  temp += "*\t"; }
106                                 else { temp += "\t"; }
107                                 
108                                 if (isChimericTStudent[i][k]) {  temp += "*\t"; }
109                                 else { temp += "\t"; }
110                                 
111                                 if (isChimericANOVA[i][k]) {  temp += "*\t"; }
112                                 else { temp += "\t"; }
113                         
114                                 out << k+1 << '\t' << temp << endl;
115                                 
116                                 if (temp == "*\t*\t*\t") {  results = true;  }
117                         }
118                         out << endl;    
119                         
120                         if (results) {
121                                 mothurOut(querySeqs[i]->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
122                         }
123                 }
124         }
125         catch(exception& e) {
126                 errorOut(e, "Ccode", "print");
127                 exit(1);
128         }
129 }
130
131 //***************************************************************************************************************
132 void Ccode::getChimeras() {
133         try {
134                 
135                 //read in query sequences and subject sequences
136                 mothurOut("Reading sequences and template file... "); cout.flush();
137                 querySeqs = readSeqs(fastafile);
138                 templateSeqs = readSeqs(templateFile);
139                 mothurOut("Done."); mothurOutEndLine();
140                 
141                 int numSeqs = querySeqs.size();
142                 
143                 closest.resize(numSeqs);
144                 
145                 refCombo.resize(numSeqs, 0);
146                 sumRef.resize(numSeqs); 
147                 varRef.resize(numSeqs); 
148                 varQuery.resize(numSeqs); 
149                 sdRef.resize(numSeqs); 
150                 sdQuery.resize(numSeqs);     
151                 sumQuery.resize(numSeqs);
152                 sumSquaredRef.resize(numSeqs); 
153                 sumSquaredQuery.resize(numSeqs); 
154                 averageRef.resize(numSeqs);
155                 averageQuery.resize(numSeqs);
156                 anova.resize(numSeqs);
157                 isChimericConfidence.resize(numSeqs);
158                 isChimericTStudent.resize(numSeqs);
159                 isChimericANOVA.resize(numSeqs);
160                 trim.resize(numSeqs);
161                 spotMap.resize(numSeqs);
162                 windowSizes.resize(numSeqs, window);
163                 windows.resize(numSeqs);
164                 
165                 //break up file if needed
166                 int linesPerProcess = numSeqs / processors ;
167                 
168                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
169                         //find breakup of sequences for all times we will Parallelize
170                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
171                         else {
172                                 //fill line pairs
173                                 for (int i = 0; i < (processors-1); i++) {                      
174                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
175                                 }
176                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
177                                 int i = processors - 1;
178                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
179                         }
180                         
181                         //find breakup of templatefile for quantiles
182                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
183                         else { 
184                                 for (int i = 0; i < processors; i++) {
185                                         templateLines.push_back(new linePair());
186                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
187                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
188                                 }
189                         }
190                 #else
191                         lines.push_back(new linePair(0, numSeqs));
192                         templateLines.push_back(new linePair(0, templateSeqs.size()));
193                 #endif
194         
195                 distCalc = new eachGapDist();
196                 decalc = new DeCalculator();
197                 
198                 //find closest
199                 if (processors == 1) { 
200                         mothurOut("Finding top matches for sequences... "); cout.flush();
201                         closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
202                         mothurOut("Done."); mothurOutEndLine();
203                 }else {         createProcessesClosest();               }
204
205                 
206 for (int i = 0; i < closest.size(); i++) {
207         //cout << querySeqs[i]->getName() << ": ";
208         string file = querySeqs[i]->getName() + ".input";
209         ofstream o;
210         openOutputFile(file, o);
211         
212         querySeqs[i]->printSequence(o);
213         for (int j = 0; j < closest[i].size(); j++) {
214                 //cout << closest[i][j].seq->getName() << '\t';
215                 closest[i][j].seq->printSequence(o);
216         }
217         //cout << endl;
218         o.close();
219 }       
220                 for (int j = 0; j < numSeqs; j++) {
221                         for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) {
222                                 spotMap[j][i] = i;
223                         }
224                 }
225                 
226                 //mask sequences if the user wants to 
227                 if (seqMask != "") {
228                         decalc->setMask(seqMask);
229                         
230                         //mask querys
231                         for (int i = 0; i < querySeqs.size(); i++) {
232                                 decalc->runMask(querySeqs[i]);
233                         }
234                 
235                         //mask templates
236                         for (int i = 0; i < templateSeqs.size(); i++) {
237                                 decalc->runMask(templateSeqs[i]);
238                         }
239                         
240                         for (int i = 0; i < numSeqs; i++) {
241                                 spotMap[i] = decalc->getMaskMap();
242                         }
243                 }
244                 
245                 if (filter) {
246                         vector<Sequence*> temp = templateSeqs;
247                         for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
248                         
249                         createFilter(temp);
250                 
251                         runFilter(querySeqs);
252                         runFilter(templateSeqs);
253                         
254                         //update spotMap
255                         map<int, int> newMap;
256                         int spot = 0;
257                         int j = 0;
258                         
259                         for (int i = 0; i < filterString.length(); i++) {
260                                 if (filterString[i] == '1') {
261                                         //add to newMap
262                                         newMap[spot] = spotMap[j][i];
263                                         spot++;  
264                                 }
265                         }
266                         
267                         for (int i = 0; i < numSeqs; i++) {
268                                 spotMap[i] = newMap;
269                         }
270                 }
271
272                 //trim sequences - this follows ccodes remove_extra_gaps 
273                 for (int i = 0; i < querySeqs.size(); i++) {
274                         trimSequences(i);
275                 }
276                 
277                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
278                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
279                 for (int i = 0; i < querySeqs.size(); i++) {
280                         windows[i] = findWindows(i);  
281                 }
282
283                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later - should be paralellized
284                 for (int i = 0; i < closest.size(); i++) {
285                         removeBadReferenceSeqs(closest[i], i);
286                 }
287                 
288         
289                 //find the averages for each querys references - should be paralellized
290                 for (int i = 0; i < numSeqs; i++) {
291                         getAverageRef(closest[i], i);  //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
292                 }
293         
294                 //find the averages for the query - should be paralellized
295                 for (int i = 0; i < numSeqs; i++) {
296                         getAverageQuery(closest[i], i);  //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
297                 }
298                 
299                 //find the averages for each querys references - should be paralellized
300                 for (int i = 0; i < numSeqs; i++) {
301                         findVarianceRef(i);  //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
302                 }
303                 
304                 //find the averages for the query - should be paralellized
305                 for (int i = 0; i < numSeqs; i++) {
306                         findVarianceQuery(i);  //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
307                 }
308
309                 for (int i = 0; i < numSeqs; i++) {
310                         determineChimeras(i);  //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. - should be paralellized
311                 }
312
313                 //free memory
314                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
315                 for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
316                         
317         }
318         catch(exception& e) {
319                 errorOut(e, "Ccode", "getChimeras");
320                 exit(1);
321         }
322 }
323 /***************************************************************************************************************/
324 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
325 void Ccode::trimSequences(int query) {
326         try {
327                 
328                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
329                 int rearPos = querySeqs[query]->getAligned().length();
330                 
331                 //********find first position in closest seqs that is a non gap character***********//
332                 //find first position all query seqs that is a non gap character
333                 for (int i = 0; i < closest[query].size(); i++) {
334                         
335                         string aligned = closest[query][i].seq->getAligned();
336                         int pos = 0;
337                         
338                         //find first spot in this seq
339                         for (int j = 0; j < aligned.length(); j++) {
340                                 if (isalpha(aligned[j])) {
341                                         pos = j;
342                                         break;
343                                 }
344                         }
345                         
346                         //save this spot if it is the farthest
347                         if (pos > frontPos) { frontPos = pos; }
348                 }
349                 
350                 //find first position all querySeq[query] that is a non gap character
351                 string aligned = querySeqs[query]->getAligned();
352                 int pos = 0;
353                         
354                 //find first spot in this seq
355                 for (int j = 0; j < aligned.length(); j++) {
356                         if (isalpha(aligned[j])) {
357                                 pos = j;
358                                 break;
359                         }
360                 }
361                 
362                 //save this spot if it is the farthest
363                 if (pos > frontPos) { frontPos = pos; }
364                 
365                 
366                 //********find last position in closest seqs that is a non gap character***********//
367                 for (int i = 0; i < closest[query].size(); i++) {
368                         
369                         string aligned = closest[query][i].seq->getAligned();
370                         int pos = aligned.length();
371                         
372                         //find first spot in this seq
373                         for (int j = aligned.length()-1; j >= 0; j--) {
374                                 if (isalpha(aligned[j])) {
375                                         pos = j;
376                                         break;
377                                 }
378                         }
379                         
380                         //save this spot if it is the farthest
381                         if (pos < rearPos) { rearPos = pos; }
382                 }
383                 
384                 //find last position all querySeqs[query] that is a non gap character
385                 aligned = querySeqs[query]->getAligned();
386                 pos = aligned.length();
387                 
388                 //find first spot in this seq
389                 for (int j = aligned.length()-1; j >= 0; j--) {
390                         if (isalpha(aligned[j])) {
391                                 pos = j;
392                                 break;
393                         }
394                 }
395                 
396                 //save this spot if it is the farthest
397                 if (pos < rearPos) { rearPos = pos; }
398
399                 
400                 //check to make sure that is not whole seq
401                 if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
402                 
403                 map<int, int> tempTrim;
404                 tempTrim[frontPos] = rearPos;
405                 
406                 //save trimmed locations
407                 trim[query] = tempTrim;
408                                                 
409                 //update spotMask
410                 map<int, int> newMap;
411                 int spot = 0;
412                 
413                 for (int i = frontPos; i < rearPos; i++) {
414                         //add to newMap
415 //cout << query << '\t' << i << '\t' << spotMap[query][i] << endl;
416                         newMap[spot] = spotMap[query][i];
417                         spot++;  
418                 }
419                 
420                 //for (it = newMap.begin(); it!=newMap.end(); it++) {
421                         //cout << query << '\t' << it->first << '\t' << it->second << endl;
422                 //}
423                 
424                 spotMap[query] = newMap;
425
426                 
427         }
428         catch(exception& e) {
429                 errorOut(e, "Ccode", "trimSequences");
430                 exit(1);
431         }
432
433 }
434 /***************************************************************************************************************/
435 vector<int> Ccode::findWindows(int query) {
436         try {
437                 
438                 vector<int> win; 
439                 it = trim[query].begin();
440                 
441                 int length = it->second - it->first;
442                 
443                 //default is wanted = 10% of total length
444                 if (windowSizes[query] > length) { 
445                         mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
446                         windowSizes[query] = length / 10;
447                 }else if (windowSizes[query] == 0) { windowSizes[query] = length / 10;  }
448                 else if (windowSizes[query] > (length / 20)) {
449                         mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
450                 }else if (windowSizes[query] < (length / 5)) {
451                         mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
452                 }
453                 
454                 //save starting points of each window
455                 for (int m = it->first;  m < (it->second-windowSizes[query]); m+=windowSizes[query]) {  win.push_back(m);  }
456                 
457                 //save last window
458                 if (win[win.size()-1] < (it->first+length)) {
459                         win.push_back(win[win.size()-1]+windowSizes[query]); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
460                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
461                 
462
463                 return win;
464         
465         }
466         catch(exception& e) {
467                 errorOut(e, "Ccode", "findWindows");
468                 exit(1);
469         }
470 }
471 //***************************************************************************************************************
472 int Ccode::getDiff(string seqA, string seqB) {
473         try {
474                 
475                 int numDiff = 0;
476                 
477                 for (int i = 0; i < seqA.length(); i++) {
478                         //if you are both not gaps
479                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
480                                 //are you different
481                                 if (seqA[i] != seqB[i]) { 
482                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
483
484                                         /* the char in base_a and base_b have been checked and they are different */
485                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
486                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
487                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
488                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
489                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
490                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
491                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
492                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
493                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
494                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
495                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
496                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
497                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
498                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
499                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
500                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
501                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
502                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
503                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
504                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
505                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
506                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
507                                         else ok = 0;  /* the bases are different and not equivalent */
508                                         
509                                         //check if they are both blanks
510                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
511                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
512                                         
513                                         if (ok == 0) {  numDiff++;  }
514                                 }
515                         //}
516                 }
517                 
518                 return numDiff;
519         
520         }
521         catch(exception& e) {
522                 errorOut(e, "Ccode", "getDiff");
523                 exit(1);
524         }
525 }
526 //***************************************************************************************************************
527 //tried to make this look most like ccode original implementation
528 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs, int query) {
529         try {
530                 
531                 vector< vector<int> > numDiffBases;
532                 numDiffBases.resize(seqs.size());
533                 //initialize to 0
534                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
535                 
536                 it = trim[query].begin();
537                 int length = it->second - it->first;
538                 
539                 //calc differences from each sequence to everyother seq in the set
540                 for (int i = 0; i < seqs.size(); i++) {
541                         
542                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
543                         
544                         //so you don't calc i to j and j to i since they are the same
545                         for (int j = 0; j < i; j++) {
546                                 
547                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
548                                 
549                                 //compare strings
550                                 int numDiff = getDiff(seqA, seqB);
551                                 
552                                 numDiffBases[i][j] = numDiff;
553                                 numDiffBases[j][i] = numDiff;
554                         }
555                 }
556                 
557                 //initailize remove to 0
558                 vector<int> remove;  remove.resize(seqs.size(), 0);
559                 float top = ((20*length) / (float) 100);
560                 float bottom = ((0.5*length) / (float) 100);
561                 
562                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
563                 for (int i = 0; i < numDiffBases.size(); i++) {
564                         for (int j = 0; j < i; j++) {   
565                                 //are you more than 20% different
566                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
567                                 //are you less than 0.5% different
568                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
569                         }
570                 }
571                 
572                 int numSeqsLeft = 0;
573                 
574                 //count seqs that are not going to be removed
575                 for (int i = 0; i < remove.size(); i++) {  
576                         if (remove[i] == 0)  { numSeqsLeft++;  }
577                 }
578                 
579                 //if you have enough then remove bad ones
580                 if (numSeqsLeft >= 3) {
581                         vector<SeqDist> goodSeqs;
582                         //remove bad seqs
583                         for (int i = 0; i < remove.size(); i++) {
584                                 if (remove[i] == 0) { 
585                                         goodSeqs.push_back(seqs[i]);
586                                 }
587                         }
588                         
589                         seqs = goodSeqs;
590                         
591                 }else { //warn, but dont remove any
592                         mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); mothurOutEndLine();  
593                 }
594                         
595
596         }
597         catch(exception& e) {
598                 errorOut(e, "Ccode", "removeBadReferenceSeqs");
599                 exit(1);
600         }
601 }
602 //***************************************************************************************************************
603 vector< vector<SeqDist> > Ccode::findClosest(int start, int end, int numWanted) {
604         try{
605         
606                 vector< vector<SeqDist> > topMatches;  topMatches.resize(querySeqs.size());
607         
608                 float smallestOverall, smallestLeft, smallestRight;
609                 smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
610                 
611                 //for each sequence in querySeqs - find top matches to use as reference
612                 for(int j = start; j < end; j++){
613                         
614                         Sequence query = *(querySeqs[j]);
615                         
616                         vector<SeqDist> distances;
617                         
618                         //calc distance to each sequence in template seqs
619                         for (int i = 0; i < templateSeqs.size(); i++) {
620                         
621                                 Sequence ref = *(templateSeqs[i]); 
622                                         
623                                 //find overall dist
624                                 distCalc->calcDist(query, ref);
625                                 float dist = distCalc->getDist();       
626                                 
627                                 //save distance
628                                 SeqDist temp;
629                                 temp.seq = templateSeqs[i];
630                                 temp.dist = dist;
631
632                                 distances.push_back(temp);
633                         }
634                         
635                         sort(distances.begin(), distances.end(), compareSeqDist);
636                         
637                         //save the number of top matches wanted
638                         for (int h = 0; h < numWanted; h++) {
639                                 topMatches[j].push_back(distances[h]);
640                         }
641                 }
642                         
643                 return topMatches;
644
645         }
646         catch(exception& e) {
647                 errorOut(e, "Ccode", "findClosestSides");
648                 exit(1);
649         }
650 }
651 /**************************************************************************************************/
652 //find the distances from each reference sequence to every other reference sequence for each window for this query
653 void Ccode::getAverageRef(vector<SeqDist> ref, int query) {
654         try {
655                 
656                 vector< vector< vector<int> > > diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
657                 
658                 //initialize diffs vector
659                 diffs.resize(ref.size());
660                 for (int i = 0; i < diffs.size(); i++) {  
661                         diffs[i].resize(ref.size());
662                         for (int j = 0; j < diffs[i].size(); j++) {
663                                 diffs[i][j].resize(windows[query].size(), 0);
664                         }
665                 }
666                 
667                 it = trim[query].begin();
668                                 
669                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
670                 for (int i = 0; i < ref.size(); i++) {
671                         
672                         string refI = ref[i].seq->getAligned();
673                         
674                         //j<i, so you don't find distances from i to j and then j to i.
675                         for (int j = 0; j < i; j++) {
676                         
677                                 string refJ = ref[j].seq->getAligned();
678                         
679                                 for (int k = 0; k < windows[query].size(); k++) {
680                                         
681                                         string refIWindowk, refJWindowk;
682                                         
683                                         if (k < windows[query].size()-1) {
684                                                 //get window strings
685                                                 refIWindowk = refI.substr(windows[query][k], windowSizes[query]);
686                                                 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);
687                                         }else { //last window may be smaller than rest - see findwindows
688                                                 //get window strings
689                                                 refIWindowk = refI.substr(windows[query][k], (it->second-windows[query][k]));
690                                                 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
691                                         }
692                                         
693                                         //find differences
694                                         int diff = getDiff(refIWindowk, refJWindowk);
695 //cout <<  i << '\t' << j << '\t' << k << '\t' << diff << endl;
696                                         //save differences in [i][j][k] and [j][i][k] since they are the same
697                                         diffs[i][j][k] = diff;
698                                         diffs[j][i][k] = diff;
699
700                                 }//k
701                                 
702                         }//j
703                 
704                 }//i
705                 
706                 //initialize sumRef for this query 
707                 sumRef[query].resize(windows[query].size(), 0);
708                 sumSquaredRef[query].resize(windows[query].size(), 0);
709                 averageRef[query].resize(windows[query].size(), 0);
710                 
711                 //find the sum of the differences for hte reference sequences
712                 for (int i = 0; i < diffs.size(); i++) {
713                         for (int j = 0; j < i; j++) {
714                         
715                                 //increment this querys reference sequences combos
716                                 refCombo[query]++;
717                                 
718                                 for (int k = 0; k < diffs[i][j].size(); k++) {
719                                         sumRef[query][k] += diffs[i][j][k];
720                                         sumSquaredRef[query][k] += (diffs[i][j][k]*diffs[i][j][k]);
721                                 }//k
722                                 
723                         }//j
724                 }//i
725
726                 
727                 //find the average of the differences for the references for each window
728                 for (int i = 0; i < windows[query].size(); i++) {
729                         averageRef[query][i] = sumRef[query][i] / (float) refCombo[query];
730                 }
731                 
732         }
733         catch(exception& e) {
734                 errorOut(e, "Ccode", "getAverageRef");
735                 exit(1);
736         }
737 }
738 /**************************************************************************************************/
739 void Ccode::getAverageQuery (vector<SeqDist> ref, int query) {
740         try {
741         
742                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
743                 
744                 //initialize diffs vector
745                 diffs.resize(ref.size());
746                 for (int j = 0; j < diffs.size(); j++) {
747                         diffs[j].resize(windows[query].size(), 0);
748                 }
749                 
750                 it = trim[query].begin();
751                                                         
752                 string refQuery = querySeqs[query]->getAligned();
753                          
754                 //j<i, so you don't find distances from i to j and then j to i.
755                 for (int j = 0; j < ref.size(); j++) {
756                          
757                          string refJ = ref[j].seq->getAligned();
758                          
759                          for (int k = 0; k < windows[query].size(); k++) {
760                                         
761                                         string QueryWindowk, refJWindowk;
762                                         
763                                         if (k < windows[query].size()-1) {
764                                                 //get window strings
765                                                 QueryWindowk = refQuery.substr(windows[query][k], windowSizes[query]);
766                                                 refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);                                       
767                                         }else { //last window may be smaller than rest - see findwindows
768                                                 //get window strings
769                                                 QueryWindowk = refQuery.substr(windows[query][k], (it->second-windows[query][k]));
770                                                 refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
771                                         }
772                                         
773                                         //find differences
774                                         int diff = getDiff(QueryWindowk, refJWindowk);
775 //cout  << j << '\t' << k << '\t' << diff << endl;                                              
776                                         //save differences 
777                                         diffs[j][k] = diff;
778                          
779                          }//k
780                 }//j
781                          
782                 
783                 //initialize sumRef for this query 
784                 sumQuery[query].resize(windows[query].size(), 0);
785                 sumSquaredQuery[query].resize(windows[query].size(), 0);
786                 averageQuery[query].resize(windows[query].size(), 0);
787                 
788                 //find the sum of the differences 
789                 for (int j = 0; j < diffs.size(); j++) {
790                         for (int k = 0; k < diffs[j].size(); k++) {
791                                 sumQuery[query][k] += diffs[j][k];
792                                 sumSquaredQuery[query][k] += (diffs[j][k]*diffs[j][k]);
793                         }//k
794                 }//j
795                 
796                 
797                 //find the average of the differences for the references for each window
798                 for (int i = 0; i < windows[query].size(); i++) {
799                         averageQuery[query][i] = sumQuery[query][i] / (float) ref.size();
800                 }
801
802         
803         }
804         catch(exception& e) {
805                 errorOut(e, "Ccode", "getAverageQuery");
806                 exit(1);
807         }
808 }
809 /**************************************************************************************************/
810 void Ccode::findVarianceRef (int query) {
811         try {
812                 
813                 varRef[query].resize(windows[query].size(), 0);
814                 sdRef[query].resize(windows[query].size(), 0);
815                 
816                 //for each window
817                 for (int i = 0; i < windows[query].size(); i++) {
818                         varRef[query][i] = (sumSquaredRef[query][i] - ((sumRef[query][i]*sumRef[query][i])/(float)refCombo[query])) / (float)(refCombo[query]-1);
819                         sdRef[query][i] = sqrt(varRef[query][i]);
820                         
821                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
822                         if (averageRef[query][i] < 0.001)                       {       averageRef[query][i] = 0.001;           }
823                         if (sumRef[query][i] < 0.001)                           {       sumRef[query][i] = 0.001;                       }
824                         if (varRef[query][i] < 0.001)                           {       varRef[query][i] = 0.001;                       }
825                         if (sumSquaredRef[query][i] < 0.001)            {       sumSquaredRef[query][i] = 0.001;        }
826                         if (sdRef[query][i] < 0.001)                            {       sdRef[query][i] = 0.001;                        }
827                                 
828                 }
829         }
830         catch(exception& e) {
831                 errorOut(e, "Ccode", "findVarianceRef");
832                 exit(1);
833         }
834 }
835 /**************************************************************************************************/
836 void Ccode::findVarianceQuery (int query) {
837         try {
838                 varQuery[query].resize(windows[query].size(), 0);
839                 sdQuery[query].resize(windows[query].size(), 0);
840                 
841                 //for each window
842                 for (int i = 0; i < windows[query].size(); i++) {
843                         varQuery[query][i] = (sumSquaredQuery[query][i] - ((sumQuery[query][i]*sumQuery[query][i])/(float) closest[query].size())) / (float) (closest[query].size()-1);
844                         sdQuery[query][i] = sqrt(varQuery[query][i]);
845                         
846                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
847                         if (averageQuery[query][i] < 0.001)                     {       averageQuery[query][i] = 0.001;         }
848                         if (sumQuery[query][i] < 0.001)                         {       sumQuery[query][i] = 0.001;                     }
849                         if (varQuery[query][i] < 0.001)                         {       varQuery[query][i] = 0.001;                     }
850                         if (sumSquaredQuery[query][i] < 0.001)          {       sumSquaredQuery[query][i] = 0.001;      }
851                         if (sdQuery[query][i] < 0.001)                          {       sdQuery[query][i] = 0.001;                      }
852                 }
853
854         }
855         catch(exception& e) {
856                 errorOut(e, "Ccode", "findVarianceQuery");
857                 exit(1);
858         }
859 }
860 /**************************************************************************************************/
861 void Ccode::determineChimeras (int query) {
862         try {
863                 
864                 isChimericConfidence[query].resize(windows[query].size(), false);
865                 isChimericTStudent[query].resize(windows[query].size(), false);
866                 isChimericANOVA[query].resize(windows[query].size(), false);
867                 anova[query].resize(windows[query].size());
868
869                 
870                 //for each window
871                 for (int i = 0; i < windows[query].size(); i++) {
872                 
873                         //get confidence limits
874                         float t = getT(closest[query].size()-1);  //how many seqs you are comparing to this querySeq
875                         float dsUpper = (averageQuery[query][i] + (t * sdQuery[query][i])) / averageRef[query][i]; 
876                         float dsLower = (averageQuery[query][i] - (t * sdQuery[query][i])) / averageRef[query][i]; 
877 //cout << t << '\t' << "ds upper = " << dsUpper << " dsLower = " << dsLower << endl;                    
878                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[query][i] > averageRef[query][i])) {  /* range does not include 1 */
879                                         isChimericConfidence[query][i] = true;   /* significantly higher at P<0.05 */ 
880 //cout << i << " made it here" << endl;
881                         }
882                         
883                         //student t test
884                         int degreeOfFreedom = refCombo[query] + closest[query].size() - 2;
885                         float denomForT = (((refCombo[query]-1) * varQuery[query][i] + (closest[query].size() - 1) * varRef[query][i]) / (float) degreeOfFreedom) * ((refCombo[query] + closest[query].size()) / (float) (refCombo[query] * closest[query].size()));    /* denominator, without sqrt(), for ts calculations */
886                                 
887                         float ts = fabs((averageQuery[query][i] - averageRef[query][i]) / (sqrt(denomForT)));  /* value of ts for t-student test */     
888                         t = getT(degreeOfFreedom);
889 //cout << i << '\t' << t << '\t' << ts << endl;                 
890                         if ((ts >= t) && (averageQuery[query][i] > averageRef[query][i])) {   
891                                 isChimericTStudent[query][i] = true;   /* significantly higher at P<0.05 */ 
892                         }
893                 
894                         //anova test
895                         float value1 = sumQuery[query][i] + sumRef[query][i];
896                         float value2 = sumSquaredQuery[query][i] + sumSquaredRef[query][i];
897                         float value3 = ((sumQuery[query][i]*sumQuery[query][i]) / (float) (closest[query].size())) + ((sumRef[query][i] * sumRef[query][i]) / (float) refCombo[query]);
898                         float value4 = (value1 * value1) / ( (float) (closest[query].size() + refCombo[query]) );
899                         float value5 = value2 - value4;
900                         float value6 = value3 - value4;
901                         float value7 = value5 - value6;
902                         float value8 = value7 / ((float) degreeOfFreedom);
903                         float anovaValue = value6 / value8;
904                         
905                         float f = getF(degreeOfFreedom);
906                         
907                          if ((anovaValue >= f) && (averageQuery[query][i] > averageRef[query][i]))  {
908                        isChimericANOVA[query][i] = true;   /* significant P<0.05 */
909                 }
910                         
911                         anova[query][i] = anovaValue;
912                 }
913                 
914         }
915         catch(exception& e) {
916                 errorOut(e, "Ccode", "determineChimeras");
917                 exit(1);
918         }
919 }
920 /**************************************************************************************************/    
921 float Ccode::getT(int numseq) {
922         try {
923         
924                 float tvalue = 0;
925                 
926                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
927                 if (numseq > 120) tvalue = 1.645;
928                 else if (numseq > 60) tvalue = 1.658;
929         else if (numseq > 40) tvalue = 1.671;
930         else if (numseq > 30) tvalue = 1.684;
931         else if (numseq > 29) tvalue = 1.697;
932         else if (numseq > 28) tvalue = 1.699;
933         else if (numseq > 27) tvalue = 1.701;
934         else if (numseq > 26) tvalue = 1.703;
935         else if (numseq > 25) tvalue = 1.706;
936         else if (numseq > 24) tvalue = 1.708;
937         else if (numseq > 23) tvalue = 1.711;
938         else if (numseq > 22) tvalue = 1.714;
939         else if (numseq > 21) tvalue = 1.717;
940         else if (numseq > 20) tvalue = 1.721;
941         else if (numseq > 19) tvalue = 1.725;
942         else if (numseq > 18) tvalue = 1.729;
943         else if (numseq > 17) tvalue = 1.734;
944         else if (numseq > 16) tvalue = 1.740;
945         else if (numseq > 15) tvalue = 1.746;
946         else if (numseq > 14) tvalue = 1.753;
947         else if (numseq > 13) tvalue = 1.761;
948         else if (numseq > 12) tvalue = 1.771;
949         else if (numseq > 11) tvalue = 1.782;
950         else if (numseq > 10) tvalue = 1.796;
951         else if (numseq > 9) tvalue = 1.812;
952         else if (numseq > 8) tvalue = 1.833;
953         else if (numseq > 7) tvalue = 1.860;
954         else if (numseq > 6) tvalue = 1.895;
955         else if (numseq > 5) tvalue = 1.943;
956         else if (numseq > 4) tvalue = 2.015;
957         else if (numseq > 3) tvalue = 2.132;
958         else if (numseq > 2) tvalue = 2.353;
959         else if (numseq > 1) tvalue = 2.920;
960                 else if (numseq <= 1) {
961                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
962                 }
963                 
964                 return tvalue;
965         }
966         catch(exception& e) {
967                 errorOut(e, "Ccode", "getT");
968                 exit(1);
969         }
970 }
971 /**************************************************************************************************/    
972 float Ccode::getF(int numseq) {
973         try {
974         
975                 float fvalue = 0;
976                 
977                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
978         if (numseq > 120) fvalue = 3.84;
979         else if (numseq > 60) fvalue = 3.92;
980         else if (numseq > 40) fvalue = 4.00;
981         else if (numseq > 30) fvalue = 4.08;
982         else if (numseq > 29) fvalue = 4.17;
983         else if (numseq > 28) fvalue = 4.18;
984         else if (numseq > 27) fvalue = 4.20;
985         else if (numseq > 26) fvalue = 4.21;
986         else if (numseq > 25) fvalue = 4.23;
987         else if (numseq > 24) fvalue = 4.24;
988         else if (numseq > 23) fvalue = 4.26;
989         else if (numseq > 22) fvalue = 4.28;
990         else if (numseq > 21) fvalue = 4.30;
991         else if (numseq > 20) fvalue = 4.32;
992         else if (numseq > 19) fvalue = 4.35;
993         else if (numseq > 18) fvalue = 4.38;
994         else if (numseq > 17) fvalue = 4.41;
995         else if (numseq > 16) fvalue = 4.45;
996         else if (numseq > 15) fvalue = 4.49;
997         else if (numseq > 14) fvalue = 4.54;
998         else if (numseq > 13) fvalue = 4.60;
999         else if (numseq > 12) fvalue = 4.67;
1000         else if (numseq > 11) fvalue = 4.75;
1001         else if (numseq > 10) fvalue = 4.84;
1002         else if (numseq > 9) fvalue = 4.96;
1003         else if (numseq > 8) fvalue = 5.12;
1004         else if (numseq > 7) fvalue = 5.32;
1005         else if (numseq > 6) fvalue = 5.59;
1006         else if (numseq > 5) fvalue = 5.99;
1007         else if (numseq > 4) fvalue = 6.61;
1008         else if (numseq > 3) fvalue = 7.71;
1009         else if (numseq > 2) fvalue = 10.1;
1010         else if (numseq > 1) fvalue = 18.5;
1011         else if (numseq > 0) fvalue = 161;
1012                 else if (numseq <= 0) {
1013                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
1014         }
1015                 
1016                 return fvalue;
1017         }
1018         catch(exception& e) {
1019                 errorOut(e, "Ccode", "getF");
1020                 exit(1);
1021         }
1022 }
1023
1024 /**************************************************************************************************/
1025 void Ccode::createProcessesClosest() {
1026         try {
1027 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1028                 int process = 0;
1029                 vector<int> processIDS;
1030                 
1031                 //loop through and create all the processes you want
1032                 while (process != processors) {
1033                         int pid = fork();
1034                         
1035                         if (pid > 0) {
1036                                 processIDS.push_back(pid);  
1037                                 process++;
1038                         }else if (pid == 0){
1039                                 
1040                                 mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1041                                 closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
1042                                 mothurOut("Done finding top matches for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
1043                                 
1044                                 //write out data to file so parent can read it
1045                                 ofstream out;
1046                                 string s = toString(getpid()) + ".temp";
1047                                 openOutputFile(s, out);
1048                                 
1049                                 //output pairs
1050                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1051                                          for (int j = 0; j < closest[i].size(); j++) {
1052                                                 closest[i][j].seq->printSequence(out);
1053                                          }
1054                                 }
1055                                 out << ">" << endl; //to stop sequence read
1056                                 
1057                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
1058                                          for (int j = 0; j < closest[i].size(); j++) {
1059                                                 out << closest[i][j].dist << '\t';
1060                                          }
1061                                          out << endl;
1062                                 }
1063
1064                                 out.close();
1065                                 
1066                                 exit(0);
1067                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
1068                 }
1069                 
1070                 //force parent to wait until all the processes are done
1071                 for (int i=0;i<processors;i++) { 
1072                         int temp = processIDS[i];
1073                         wait(&temp);
1074                 }
1075                 
1076                 //get data created by processes
1077                 for (int i=0;i<processors;i++) { 
1078                         ifstream in;
1079                         string s = toString(processIDS[i]) + ".temp";
1080                         openInputFile(s, in);
1081                         
1082                         vector< vector<Sequence*> > tempClosest;  tempClosest.resize(querySeqs.size());
1083                         //get pairs
1084                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
1085                                 vector<Sequence*> tempVector;
1086                                 
1087                                 for (int j = 0; j < numWanted; j++) {
1088                                 
1089                                         Sequence* temp = new Sequence(in);
1090                                         gobble(in);
1091                                                 
1092                                         tempVector.push_back(temp);
1093                                 }
1094                                 
1095                                 tempClosest[k] = tempVector;
1096                         }
1097                         
1098                         string junk;
1099                         in >> junk;  gobble(in);  // to get ">"
1100                         
1101                         vector< vector<float> > dists; dists.resize(querySeqs.size());
1102                         
1103                         for (int i = lines[process]->start; i < lines[process]->end; i++) {
1104                                 dists[i].resize(closest[i].size());
1105                                 for (int j = 0; j < closest[i].size(); j++) {
1106                                         in >> dists[i][j];
1107                                 }
1108                                 gobble(in);
1109                         } 
1110                         
1111                         for (int i = lines[process]->start; i < lines[process]->end; i++) {
1112                                 for (int j = 0; j < closest[i].size(); j++) {
1113                                         closest[i][j].seq = tempClosest[i][j];
1114                                         closest[i][j].dist = dists[i][j]; 
1115                                 }
1116                         } 
1117
1118                         in.close();
1119                         remove(s.c_str());
1120                 }
1121                         
1122         
1123 #else
1124                 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
1125 #endif  
1126
1127         }
1128         catch(exception& e) {
1129                 errorOut(e, "Ccode", "createProcessesClosest");
1130                 exit(1);
1131         }
1132 }
1133
1134 //***************************************************************************************************************
1135