]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
added checks for ^C to quit command instead of program
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string o) {  
17         fastafile = filename;  outputDir = o; 
18         distCalc = new eachGapDist();
19         decalc = new DeCalculator();
20         
21         mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo";
22         ofstream out2;
23         openOutputFile(mapInfo, out2);
24                 
25         out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
26         out2.close();
27 }
28 //***************************************************************************************************************
29 Ccode::~Ccode() {
30         delete distCalc;
31         delete decalc;
32 }       
33 //***************************************************************************************************************
34 void Ccode::printHeader(ostream& out) {
35         out << "For full window mapping info refer to " << mapInfo << endl << endl;
36 }
37 //***************************************************************************************************************
38 int Ccode::print(ostream& out, ostream& outAcc) {
39         try {
40                 
41                 ofstream out2;
42                 openOutputFileAppend(mapInfo, out2);
43                 
44                 out2 << querySeq->getName() << endl;
45                 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
46                         out2 << it->first << '\t' << it->second << endl;
47                 }
48                 out2.close();
49                 out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
50                         
51                 for (int j = 0; j < closest.size(); j++) {
52                         out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
53                 }
54                 out << endl << endl;
55                 
56                 //for each window
57                 //window mapping info.
58                 out << "Mapping information: ";
59                 //you mask and did not filter
60                 if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
61                                 
62                 //you filtered and did not mask
63                 if ((seqMask == "") && (filter)) { out << "filter and trim."; }
64                                 
65                 //you masked and filtered
66                 if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
67                         
68                 out << endl << "Window\tStartPos\tEndPos" << endl;
69                 it = trim.begin();
70                 for (int k = 0; k < windows.size()-1; k++) {
71                         out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
72                 }
73                         
74                 out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
75                 out << endl;
76                 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
77                 for (int k = 0; k < windows.size(); k++) {
78                         float ds = averageQuery[k] / averageRef[k]; 
79                         out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
80                 }
81                 out << endl;
82                         
83                 //varRef
84                 //varQuery
85                 /* F test for differences among variances.
86                 * varQuery is expected to be higher or similar than varRef */
87                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
88                         
89                 bool results = false;   
90                                         
91                 //confidence limit, t - Student, anova
92                 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
93                         
94                 for (int k = 0; k < windows.size(); k++) {
95                         string temp = "";
96                         if (isChimericConfidence[k]) {  temp += "*\t"; }
97                         else { temp += "\t"; }
98                                 
99                         if (isChimericTStudent[k]) {  temp += "*\t"; }
100                         else { temp += "\t"; }
101                                 
102                         if (isChimericANOVA[k]) {  temp += "*\t"; }
103                         else { temp += "\t"; }
104                         
105                         out << k+1 << '\t' << temp << endl;
106                                 
107                         if (temp == "*\t*\t*\t") {  results = true;  }
108                 }
109                 out << endl;    
110                         
111                 if (results) {
112                         m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
113                         outAcc << querySeq->getName() << endl;
114                 }
115
116                 //free memory
117                 for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
118
119                 return 0;
120         }
121         catch(exception& e) {
122                 m->errorOut(e, "Ccode", "print");
123                 exit(1);
124         }
125 }
126 //***************************************************************************************************************
127 int Ccode::getChimeras(Sequence* query) {
128         try {
129         
130                 closest.clear();
131                 refCombo = 0;
132                 sumRef.clear(); 
133                 varRef.clear(); 
134                 varQuery.clear(); 
135                 sdRef.clear(); 
136                 sdQuery.clear();     
137                 sumQuery.clear();
138                 sumSquaredRef.clear(); 
139                 sumSquaredQuery.clear(); 
140                 averageRef.clear();
141                 averageQuery.clear();
142                 anova.clear();
143                 isChimericConfidence.clear();
144                 isChimericTStudent.clear();
145                 isChimericANOVA.clear();
146                 trim.clear();
147                 spotMap.clear();
148                 windowSizes = window;
149                 windows.clear();
150
151         
152                 querySeq = query;
153                 
154                 //find closest matches to query
155                 closest = findClosest(query, numWanted);
156                 
157                 if (m->control_pressed) {  return 0;  }
158                 
159                 //initialize spotMap
160                 for (int i = 0; i < query->getAligned().length(); i++) {        spotMap[i] = i;         }
161         
162                 //mask sequences if the user wants to 
163                 if (seqMask != "") {
164                         decalc->setMask(seqMask);
165                         
166                         decalc->runMask(query);
167                         
168                         //mask closest
169                         for (int i = 0; i < closest.size(); i++) {      decalc->runMask(closest[i].seq);        }
170                         
171                         spotMap = decalc->getMaskMap();
172                 }
173                 
174                 if (filter) {
175                         vector<Sequence*> temp;
176                         for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq);  }
177                         temp.push_back(query);  
178                         
179                         createFilter(temp, 0.5);
180                 
181                         for (int i = 0; i < temp.size(); i++) { 
182                                 if (m->control_pressed) {  return 0;  }
183                                 runFilter(temp[i]);  
184                         }
185                         
186                         //update spotMap
187                         map<int, int> newMap;
188                         int spot = 0;
189                         
190                         for (int i = 0; i < filterString.length(); i++) {
191                                 if (filterString[i] == '1') {
192                                         //add to newMap
193                                         newMap[spot] = spotMap[i];
194                                         spot++;  
195                                 }
196                         }
197                         spotMap = newMap;
198                 }
199
200                 //trim sequences - this follows ccodes remove_extra_gaps 
201                 trimSequences(query);
202                 if (m->control_pressed) {  return 0;  }
203                 
204                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
205                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
206                 windows = findWindows();  
207                 if (m->control_pressed) {  return 0;  }
208
209                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later 
210                 removeBadReferenceSeqs(closest);
211                 if (m->control_pressed) {  return 0;  }
212                 
213                 //find the averages for each querys references
214                 getAverageRef(closest);  //fills sumRef, averageRef, sumSquaredRef and refCombo.
215                 getAverageQuery(closest, query);  //fills sumQuery, averageQuery, sumSquaredQuery.
216                 if (m->control_pressed) {  return 0;  }                 
217                 
218                 //find the averages for each querys references 
219                 findVarianceRef();  //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
220                 if (m->control_pressed) {  return 0;  } 
221                         
222                 //find the averages for the query 
223                 findVarianceQuery();  //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
224                 if (m->control_pressed) {  return 0;  }
225                                         
226                 determineChimeras();  //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. 
227                 if (m->control_pressed) {  return 0;  }
228                 
229                 return 0;
230         }
231         catch(exception& e) {
232                 m->errorOut(e, "Ccode", "getChimeras");
233                 exit(1);
234         }
235 }
236 /***************************************************************************************************************/
237 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
238 void Ccode::trimSequences(Sequence* query) {
239         try {
240                 
241                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
242                 int rearPos = query->getAligned().length();
243                 
244                 //********find first position in closest seqs that is a non gap character***********//
245                 //find first position all query seqs that is a non gap character
246                 for (int i = 0; i < closest.size(); i++) {
247                         
248                         string aligned = closest[i].seq->getAligned();
249                         int pos = 0;
250                         
251                         //find first spot in this seq
252                         for (int j = 0; j < aligned.length(); j++) {
253                                 if (isalpha(aligned[j])) {
254                                         pos = j;
255                                         break;
256                                 }
257                         }
258                         
259                         //save this spot if it is the farthest
260                         if (pos > frontPos) { frontPos = pos; }
261                 }
262                 
263                 //find first position all querySeq[query] that is a non gap character
264                 string aligned = query->getAligned();
265                 int pos = 0;
266                         
267                 //find first spot in this seq
268                 for (int j = 0; j < aligned.length(); j++) {
269                         if (isalpha(aligned[j])) {
270                                 pos = j;
271                                 break;
272                         }
273                 }
274                 
275                 //save this spot if it is the farthest
276                 if (pos > frontPos) { frontPos = pos; }
277                 
278                 
279                 //********find last position in closest seqs that is a non gap character***********//
280                 for (int i = 0; i < closest.size(); i++) {
281                         
282                         string aligned = closest[i].seq->getAligned();
283                         int pos = aligned.length();
284                         
285                         //find first spot in this seq
286                         for (int j = aligned.length()-1; j >= 0; j--) {
287                                 if (isalpha(aligned[j])) {
288                                         pos = j;
289                                         break;
290                                 }
291                         }
292                         
293                         //save this spot if it is the farthest
294                         if (pos < rearPos) { rearPos = pos; }
295                 }
296                 
297                 //find last position all querySeqs[query] that is a non gap character
298                 aligned = query->getAligned();
299                 pos = aligned.length();
300                 
301                 //find first spot in this seq
302                 for (int j = aligned.length()-1; j >= 0; j--) {
303                         if (isalpha(aligned[j])) {
304                                 pos = j;
305                                 break;
306                         }
307                 }
308                 
309                 //save this spot if it is the farthest
310                 if (pos < rearPos) { rearPos = pos; }
311
312                 
313                 //check to make sure that is not whole seq
314                 if ((rearPos - frontPos - 1) <= 0) {  m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1);  }
315                 
316                 map<int, int> tempTrim;
317                 tempTrim[frontPos] = rearPos;
318                 
319                 //save trimmed locations
320                 trim = tempTrim;
321                                                 
322                 //update spotMask
323                 map<int, int> newMap;
324                 int spot = 0;
325                 
326                 for (int i = frontPos; i < rearPos; i++) {
327                         //add to newMap
328                         newMap[spot] = spotMap[i];
329                         spot++;  
330                 }
331                 spotMap = newMap;
332         }
333         catch(exception& e) {
334                 m->errorOut(e, "Ccode", "trimSequences");
335                 exit(1);
336         }
337 }
338 /***************************************************************************************************************/
339 vector<int> Ccode::findWindows() {
340         try {
341                 
342                 vector<int> win; 
343                 it = trim.begin();
344                 
345                 int length = it->second - it->first;
346         
347                 //default is wanted = 10% of total length
348                 if (windowSizes > length) { 
349                         m->mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
350                         windowSizes = length / 10;
351                 }else if (windowSizes == 0) { windowSizes = length / 10;  }
352                 else if (windowSizes > (length * 0.20)) {
353                         m->mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
354                 }else if (windowSizes < (length * 0.05)) {
355                         m->mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
356                 }
357                 
358                 //save starting points of each window
359                 for (int m = it->first;  m < (it->second-windowSizes); m+=windowSizes) {  win.push_back(m);  }
360                 
361                 //save last window
362                 if (win[win.size()-1] < (it->first+length)) {
363                         win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
364                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
365                 
366                 return win;
367         }
368         catch(exception& e) {
369                 m->errorOut(e, "Ccode", "findWindows");
370                 exit(1);
371         }
372 }
373 //***************************************************************************************************************
374 int Ccode::getDiff(string seqA, string seqB) {
375         try {
376                 
377                 int numDiff = 0;
378                 
379                 for (int i = 0; i < seqA.length(); i++) {
380                         //if you are both not gaps
381                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
382                                 //are you different
383                                 if (seqA[i] != seqB[i]) { 
384                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
385
386                                         /* the char in base_a and base_b have been checked and they are different */
387                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
388                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
389                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
390                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
391                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
392                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
393                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
394                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
395                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
396                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
397                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
398                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
399                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
400                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
401                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
402                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
403                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
404                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
405                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
406                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
407                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
408                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
409                                         else ok = 0;  /* the bases are different and not equivalent */
410                                         
411                                         //check if they are both blanks
412                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
413                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
414                                         
415                                         if (ok == 0) {  numDiff++;  }
416                                 }
417                         //}
418                 }
419                 
420                 return numDiff;
421         
422         }
423         catch(exception& e) {
424                 m->errorOut(e, "Ccode", "getDiff");
425                 exit(1);
426         }
427 }
428 //***************************************************************************************************************
429 //tried to make this look most like ccode original implementation
430 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
431         try {
432                 
433                 vector< vector<int> > numDiffBases;
434                 numDiffBases.resize(seqs.size());
435                 //initialize to 0
436                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
437                 
438                 it = trim.begin();
439                 int length = it->second - it->first;
440                 
441                 //calc differences from each sequence to everyother seq in the set
442                 for (int i = 0; i < seqs.size(); i++) {
443                         
444                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
445                         
446                         //so you don't calc i to j and j to i since they are the same
447                         for (int j = 0; j < i; j++) {
448                                 
449                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
450                                 
451                                 //compare strings
452                                 int numDiff = getDiff(seqA, seqB);
453                                 
454                                 numDiffBases[i][j] = numDiff;
455                                 numDiffBases[j][i] = numDiff;
456                         }
457                 }
458                 
459                 //initailize remove to 0
460                 vector<int> remove;  remove.resize(seqs.size(), 0);
461                 float top = ((20*length) / (float) 100);
462                 float bottom = ((0.5*length) / (float) 100);
463                 
464                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
465                 for (int i = 0; i < numDiffBases.size(); i++) {
466                         for (int j = 0; j < i; j++) {   
467                                 //are you more than 20% different
468                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
469                                 //are you less than 0.5% different
470                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
471                         }
472                 }
473                 
474                 int numSeqsLeft = 0;
475                 
476                 //count seqs that are not going to be removed
477                 for (int i = 0; i < remove.size(); i++) {  
478                         if (remove[i] == 0)  { numSeqsLeft++;  }
479                 }
480                 
481                 //if you have enough then remove bad ones
482                 if (numSeqsLeft >= 3) {
483                         vector<SeqDist> goodSeqs;
484                         //remove bad seqs
485                         for (int i = 0; i < remove.size(); i++) {
486                                 if (remove[i] == 0) { 
487                                         goodSeqs.push_back(seqs[i]);
488                                 }
489                         }
490                         
491                         seqs = goodSeqs;
492                         
493                 }else { //warn, but dont remove any
494                         m->mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); m->mothurOutEndLine();  
495                 }
496
497         }
498         catch(exception& e) {
499                 m->errorOut(e, "Ccode", "removeBadReferenceSeqs");
500                 exit(1);
501         }
502 }
503 //***************************************************************************************************************
504 //makes copy of templateseq for filter
505 vector<SeqDist>  Ccode::findClosest(Sequence* q, int numWanted) {
506         try{
507         
508                 vector<SeqDist>  topMatches;  
509                 
510                 Sequence query = *(q);
511                         
512                 //calc distance to each sequence in template seqs
513                 for (int i = 0; i < templateSeqs.size(); i++) {
514                         
515                         Sequence ref = *(templateSeqs[i]); 
516                                         
517                         //find overall dist
518                         distCalc->calcDist(query, ref);
519                         float dist = distCalc->getDist();       
520                                 
521                         //save distance
522                         SeqDist temp;
523                         temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
524                         temp.dist = dist;
525
526                         topMatches.push_back(temp);
527                 }
528                         
529                 sort(topMatches.begin(), topMatches.end(), compareSeqDist);
530                 
531                 for (int i = numWanted; i < topMatches.size(); i++) {  delete topMatches[i].seq;  }
532                 
533                 topMatches.resize(numWanted);
534                         
535                 return topMatches;
536
537         }
538         catch(exception& e) {
539                 m->errorOut(e, "Ccode", "findClosestSides");
540                 exit(1);
541         }
542 }
543 /**************************************************************************************************/
544 //find the distances from each reference sequence to every other reference sequence for each window for this query
545 void Ccode::getAverageRef(vector<SeqDist> ref) {
546         try {
547                 
548                 vector< vector< vector<int> > >  diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
549                 
550                 //initialize diffs vector
551                 diffs.resize(ref.size());
552                 for (int i = 0; i < diffs.size(); i++) {  
553                         diffs[i].resize(ref.size());
554                         for (int j = 0; j < diffs[i].size(); j++) {
555                                 diffs[i][j].resize(windows.size(), 0);
556                         }
557                 }
558                 
559                 it = trim.begin();
560                                 
561                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
562                 for (int i = 0; i < ref.size(); i++) {
563                         
564                         string refI = ref[i].seq->getAligned();
565                         
566                         //j<i, so you don't find distances from i to j and then j to i.
567                         for (int j = 0; j < i; j++) {
568                         
569                                 string refJ = ref[j].seq->getAligned();
570                         
571                                 for (int k = 0; k < windows.size(); k++) {
572                                         
573                                         string refIWindowk, refJWindowk;
574                                         
575                                         if (k < windows.size()-1) {
576                                                 //get window strings
577                                                 refIWindowk = refI.substr(windows[k], windowSizes);
578                                                 refJWindowk = refJ.substr(windows[k], windowSizes);
579                                         }else { //last window may be smaller than rest - see findwindows
580                                                 //get window strings
581                                                 refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
582                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
583                                         }
584                                         
585                                         //find differences
586                                         int diff = getDiff(refIWindowk, refJWindowk);
587
588                                         //save differences in [i][j][k] and [j][i][k] since they are the same
589                                         diffs[i][j][k] = diff;
590                                         diffs[j][i][k] = diff;
591
592                                 }//k
593                                 
594                         }//j
595                 
596                 }//i
597                 
598                 //initialize sumRef for this query 
599                 sumRef.resize(windows.size(), 0);
600                 sumSquaredRef.resize(windows.size(), 0);
601                 averageRef.resize(windows.size(), 0);
602                 
603                 //find the sum of the differences for hte reference sequences
604                 for (int i = 0; i < diffs.size(); i++) {
605                         for (int j = 0; j < i; j++) {
606                         
607                                 //increment this querys reference sequences combos
608                                 refCombo++;
609                                 
610                                 for (int k = 0; k < diffs[i][j].size(); k++) {
611                                         sumRef[k] += diffs[i][j][k];
612                                         sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
613                                 }//k
614                                 
615                         }//j
616                 }//i
617
618                 
619                 //find the average of the differences for the references for each window
620                 for (int i = 0; i < windows.size(); i++) {
621                         averageRef[i] = sumRef[i] / (float) refCombo;
622                 }
623                 
624         }
625         catch(exception& e) {
626                 m->errorOut(e, "Ccode", "getAverageRef");
627                 exit(1);
628         }
629 }
630 /**************************************************************************************************/
631 void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
632         try {
633         
634                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
635                 
636                 //initialize diffs vector
637                 diffs.resize(ref.size());
638                 for (int j = 0; j < diffs.size(); j++) {
639                         diffs[j].resize(windows.size(), 0);
640                 }
641                 
642                 it = trim.begin();
643                                                         
644                 string refQuery = query->getAligned();
645                          
646                 //j<i, so you don't find distances from i to j and then j to i.
647                 for (int j = 0; j < ref.size(); j++) {
648                          
649                          string refJ = ref[j].seq->getAligned();
650                          
651                          for (int k = 0; k < windows.size(); k++) {
652                                         
653                                         string QueryWindowk, refJWindowk;
654                                         
655                                         if (k < windows.size()-1) {
656                                                 //get window strings
657                                                 QueryWindowk = refQuery.substr(windows[k], windowSizes);
658                                                 refJWindowk = refJ.substr(windows[k], windowSizes);                                     
659                                         }else { //last window may be smaller than rest - see findwindows
660                                                 //get window strings
661                                                 QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
662                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
663                                         }
664                                         
665                                         //find differences
666                                         int diff = getDiff(QueryWindowk, refJWindowk);
667                                         
668                                         //save differences 
669                                         diffs[j][k] = diff;
670                          
671                          }//k
672                 }//j
673                          
674                 
675                 //initialize sumRef for this query 
676                 sumQuery.resize(windows.size(), 0);
677                 sumSquaredQuery.resize(windows.size(), 0);
678                 averageQuery.resize(windows.size(), 0);
679                 
680                 //find the sum of the differences 
681                 for (int j = 0; j < diffs.size(); j++) {
682                         for (int k = 0; k < diffs[j].size(); k++) {
683                                 sumQuery[k] += diffs[j][k];
684                                 sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
685                         }//k
686                 }//j
687                 
688                 
689                 //find the average of the differences for the references for each window
690                 for (int i = 0; i < windows.size(); i++) {
691                         averageQuery[i] = sumQuery[i] / (float) ref.size();
692                 }
693         }
694         catch(exception& e) {
695                 m->errorOut(e, "Ccode", "getAverageQuery");
696                 exit(1);
697         }
698 }
699 /**************************************************************************************************/
700 void Ccode::findVarianceRef() {
701         try {
702                 
703                 varRef.resize(windows.size(), 0);
704                 sdRef.resize(windows.size(), 0);
705                 
706                 //for each window
707                 for (int i = 0; i < windows.size(); i++) {
708                         varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
709                         sdRef[i] = sqrt(varRef[i]);
710                         
711                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
712                         if (averageRef[i] < 0.001)                      {       averageRef[i] = 0.001;          }
713                         if (sumRef[i] < 0.001)                          {       sumRef[i] = 0.001;                      }
714                         if (varRef[i] < 0.001)                          {       varRef[i] = 0.001;                      }
715                         if (sumSquaredRef[i] < 0.001)           {       sumSquaredRef[i] = 0.001;       }
716                         if (sdRef[i] < 0.001)                           {       sdRef[i] = 0.001;                       }
717                                 
718                 }
719         }
720         catch(exception& e) {
721                 m->errorOut(e, "Ccode", "findVarianceRef");
722                 exit(1);
723         }
724 }
725 /**************************************************************************************************/
726 void Ccode::findVarianceQuery() {
727         try {
728                 varQuery.resize(windows.size(), 0);
729                 sdQuery.resize(windows.size(), 0);
730                 
731                 //for each window
732                 for (int i = 0; i < windows.size(); i++) {
733                         varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
734                         sdQuery[i] = sqrt(varQuery[i]);
735                         
736                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
737                         if (averageQuery[i] < 0.001)                    {       averageQuery[i] = 0.001;                }
738                         if (sumQuery[i] < 0.001)                                {       sumQuery[i] = 0.001;                    }
739                         if (varQuery[i] < 0.001)                                {       varQuery[i] = 0.001;                    }
740                         if (sumSquaredQuery[i] < 0.001)         {       sumSquaredQuery[i] = 0.001;     }
741                         if (sdQuery[i] < 0.001)                         {       sdQuery[i] = 0.001;                     }
742                 }
743
744         }
745         catch(exception& e) {
746                 m->errorOut(e, "Ccode", "findVarianceQuery");
747                 exit(1);
748         }
749 }
750 /**************************************************************************************************/
751 void Ccode::determineChimeras() {
752         try {
753                 
754                 isChimericConfidence.resize(windows.size(), false);
755                 isChimericTStudent.resize(windows.size(), false);
756                 isChimericANOVA.resize(windows.size(), false);
757                 anova.resize(windows.size());
758
759                 
760                 //for each window
761                 for (int i = 0; i < windows.size(); i++) {
762                 
763                         //get confidence limits
764                         float t = getT(closest.size()-1);  //how many seqs you are comparing to this querySeq
765                         float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i]; 
766                         float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i]; 
767                 
768                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) {  /* range does not include 1 */
769                                         isChimericConfidence[i] = true;   /* significantly higher at P<0.05 */ 
770
771                         }
772                         
773                         //student t test
774                         int degreeOfFreedom = refCombo + closest.size() - 2;
775                         float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size()));    /* denominator, without sqrt(), for ts calculations */
776                                 
777                         float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT)));  /* value of ts for t-student test */   
778                         t = getT(degreeOfFreedom);
779         
780                         if ((ts >= t) && (averageQuery[i] > averageRef[i])) {   
781                                 isChimericTStudent[i] = true;   /* significantly higher at P<0.05 */ 
782                         }
783                 
784                         //anova test
785                         float value1 = sumQuery[i] + sumRef[i];
786                         float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
787                         float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
788                         float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
789                         float value5 = value2 - value4;
790                         float value6 = value3 - value4;
791                         float value7 = value5 - value6;
792                         float value8 = value7 / ((float) degreeOfFreedom);
793                         float anovaValue = value6 / value8;
794                         
795                         float f = getF(degreeOfFreedom);
796                         
797                          if ((anovaValue >= f) && (averageQuery[i] > averageRef[i]))  {
798                        isChimericANOVA[i] = true;   /* significant P<0.05 */
799                 }
800                         
801                         if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
802                         
803                         anova[i] = anovaValue;
804                 }
805                 
806         }
807         catch(exception& e) {
808                 m->errorOut(e, "Ccode", "determineChimeras");
809                 exit(1);
810         }
811 }
812 /**************************************************************************************************/    
813 float Ccode::getT(int numseq) {
814         try {
815         
816                 float tvalue = 0;
817                 
818                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
819                 if (numseq > 120) tvalue = 1.645;
820                 else if (numseq > 60) tvalue = 1.658;
821         else if (numseq > 40) tvalue = 1.671;
822         else if (numseq > 30) tvalue = 1.684;
823         else if (numseq > 29) tvalue = 1.697;
824         else if (numseq > 28) tvalue = 1.699;
825         else if (numseq > 27) tvalue = 1.701;
826         else if (numseq > 26) tvalue = 1.703;
827         else if (numseq > 25) tvalue = 1.706;
828         else if (numseq > 24) tvalue = 1.708;
829         else if (numseq > 23) tvalue = 1.711;
830         else if (numseq > 22) tvalue = 1.714;
831         else if (numseq > 21) tvalue = 1.717;
832         else if (numseq > 20) tvalue = 1.721;
833         else if (numseq > 19) tvalue = 1.725;
834         else if (numseq > 18) tvalue = 1.729;
835         else if (numseq > 17) tvalue = 1.734;
836         else if (numseq > 16) tvalue = 1.740;
837         else if (numseq > 15) tvalue = 1.746;
838         else if (numseq > 14) tvalue = 1.753;
839         else if (numseq > 13) tvalue = 1.761;
840         else if (numseq > 12) tvalue = 1.771;
841         else if (numseq > 11) tvalue = 1.782;
842         else if (numseq > 10) tvalue = 1.796;
843         else if (numseq > 9) tvalue = 1.812;
844         else if (numseq > 8) tvalue = 1.833;
845         else if (numseq > 7) tvalue = 1.860;
846         else if (numseq > 6) tvalue = 1.895;
847         else if (numseq > 5) tvalue = 1.943;
848         else if (numseq > 4) tvalue = 2.015;
849         else if (numseq > 3) tvalue = 2.132;
850         else if (numseq > 2) tvalue = 2.353;
851         else if (numseq > 1) tvalue = 2.920;
852                 else if (numseq <= 1) {
853                         m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
854                 }
855                 
856                 return tvalue;
857         }
858         catch(exception& e) {
859                 m->errorOut(e, "Ccode", "getT");
860                 exit(1);
861         }
862 }
863 /**************************************************************************************************/    
864 float Ccode::getF(int numseq) {
865         try {
866         
867                 float fvalue = 0;
868                 
869                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
870         if (numseq > 120) fvalue = 3.84;
871         else if (numseq > 60) fvalue = 3.92;
872         else if (numseq > 40) fvalue = 4.00;
873         else if (numseq > 30) fvalue = 4.08;
874         else if (numseq > 29) fvalue = 4.17;
875         else if (numseq > 28) fvalue = 4.18;
876         else if (numseq > 27) fvalue = 4.20;
877         else if (numseq > 26) fvalue = 4.21;
878         else if (numseq > 25) fvalue = 4.23;
879         else if (numseq > 24) fvalue = 4.24;
880         else if (numseq > 23) fvalue = 4.26;
881         else if (numseq > 22) fvalue = 4.28;
882         else if (numseq > 21) fvalue = 4.30;
883         else if (numseq > 20) fvalue = 4.32;
884         else if (numseq > 19) fvalue = 4.35;
885         else if (numseq > 18) fvalue = 4.38;
886         else if (numseq > 17) fvalue = 4.41;
887         else if (numseq > 16) fvalue = 4.45;
888         else if (numseq > 15) fvalue = 4.49;
889         else if (numseq > 14) fvalue = 4.54;
890         else if (numseq > 13) fvalue = 4.60;
891         else if (numseq > 12) fvalue = 4.67;
892         else if (numseq > 11) fvalue = 4.75;
893         else if (numseq > 10) fvalue = 4.84;
894         else if (numseq > 9) fvalue = 4.96;
895         else if (numseq > 8) fvalue = 5.12;
896         else if (numseq > 7) fvalue = 5.32;
897         else if (numseq > 6) fvalue = 5.59;
898         else if (numseq > 5) fvalue = 5.99;
899         else if (numseq > 4) fvalue = 6.61;
900         else if (numseq > 3) fvalue = 7.71;
901         else if (numseq > 2) fvalue = 10.1;
902         else if (numseq > 1) fvalue = 18.5;
903         else if (numseq > 0) fvalue = 161;
904                 else if (numseq <= 0) {
905                         m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
906         }
907                 
908                 return fvalue;
909         }
910         catch(exception& e) {
911                 m->errorOut(e, "Ccode", "getF");
912                 exit(1);
913         }
914 }
915 //***************************************************************************************************************
916
917
918