]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
chimeras, fix to sabundvector and sharedsabundvector that caused getRabundVector...
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string o) {  
17         fastafile = filename;  outputDir = o; 
18         distCalc = new eachGapDist();
19         decalc = new DeCalculator();
20         
21         mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo";
22         ofstream out2;
23         openOutputFile(mapInfo, out2);
24                 
25         out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
26         out2.close();
27 }
28 //***************************************************************************************************************
29 Ccode::~Ccode() {
30         delete distCalc;
31         delete decalc;
32 }       
33 //***************************************************************************************************************
34 void Ccode::printHeader(ostream& out) {
35         out << "For full window mapping info refer to " << mapInfo << endl << endl;
36 }
37 //***************************************************************************************************************
38 void Ccode::print(ostream& out) {
39         try {
40                 
41                 mothurOutEndLine();
42                 
43                 ofstream out2;
44                 openOutputFileAppend(mapInfo, out2);
45                 
46                 out2 << querySeq->getName() << endl;
47                 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
48                         out2 << it->first << '\t' << it->second << endl;
49                 }
50                 out2.close();
51                 out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
52                         
53                 for (int j = 0; j < closest.size(); j++) {
54                         out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
55                 }
56                 out << endl << endl;
57                 
58                 //for each window
59                 //window mapping info.
60                 out << "Mapping information: ";
61                 //you mask and did not filter
62                 if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
63                                 
64                 //you filtered and did not mask
65                 if ((seqMask == "") && (filter)) { out << "filter and trim."; }
66                                 
67                 //you masked and filtered
68                 if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
69                         
70                 out << endl << "Window\tStartPos\tEndPos" << endl;
71                 it = trim.begin();
72                 for (int k = 0; k < windows.size()-1; k++) {
73                         out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
74                 }
75                         
76                 out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
77                 out << endl;
78                 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
79                 for (int k = 0; k < windows.size(); k++) {
80                         float ds = averageQuery[k] / averageRef[k]; 
81                         out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
82                 }
83                 out << endl;
84                         
85                 //varRef
86                 //varQuery
87                 /* F test for differences among variances.
88                 * varQuery is expected to be higher or similar than varRef */
89                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
90                         
91                 bool results = false;   
92                                         
93                 //confidence limit, t - Student, anova
94                 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
95                         
96                 for (int k = 0; k < windows.size(); k++) {
97                         string temp = "";
98                         if (isChimericConfidence[k]) {  temp += "*\t"; }
99                         else { temp += "\t"; }
100                                 
101                         if (isChimericTStudent[k]) {  temp += "*\t"; }
102                         else { temp += "\t"; }
103                                 
104                         if (isChimericANOVA[k]) {  temp += "*\t"; }
105                         else { temp += "\t"; }
106                         
107                         out << k+1 << '\t' << temp << endl;
108                                 
109                         if (temp == "*\t*\t*\t") {  results = true;  }
110                 }
111                 out << endl;    
112                         
113                 if (results) {
114                         mothurOut(querySeq->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
115                 }
116
117                 //free memory
118                 for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
119
120                 
121         }
122         catch(exception& e) {
123                 errorOut(e, "Ccode", "print");
124                 exit(1);
125         }
126 }
127 //***************************************************************************************************************
128 int Ccode::getChimeras(Sequence* query) {
129         try {
130         
131                 closest.clear();
132                 refCombo = 0;
133                 sumRef.clear(); 
134                 varRef.clear(); 
135                 varQuery.clear(); 
136                 sdRef.clear(); 
137                 sdQuery.clear();     
138                 sumQuery.clear();
139                 sumSquaredRef.clear(); 
140                 sumSquaredQuery.clear(); 
141                 averageRef.clear();
142                 averageQuery.clear();
143                 anova.clear();
144                 isChimericConfidence.clear();
145                 isChimericTStudent.clear();
146                 isChimericANOVA.clear();
147                 trim.clear();
148                 spotMap.clear();
149                 windowSizes = window;
150                 windows.clear();
151
152         
153                 querySeq = query;
154                 
155                 //find closest matches to query
156                 closest = findClosest(query, numWanted);
157                 
158                 //initialize spotMap
159                 for (int i = 0; i < query->getAligned().length(); i++) {        spotMap[i] = i;         }
160         
161                 //mask sequences if the user wants to 
162                 if (seqMask != "") {
163                         decalc->setMask(seqMask);
164                         
165                         decalc->runMask(query);
166                         
167                         //mask closest
168                         for (int i = 0; i < closest.size(); i++) {      decalc->runMask(closest[i].seq);        }
169                         
170                         spotMap = decalc->getMaskMap();
171                 }
172                 
173                 if (filter) {
174                         vector<Sequence*> temp;
175                         for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq);  }
176                         temp.push_back(query);  
177                         
178                         createFilter(temp, 0.5);
179                 
180                         for (int i = 0; i < temp.size(); i++) { runFilter(temp[i]);  }
181                         
182                         //update spotMap
183                         map<int, int> newMap;
184                         int spot = 0;
185                         
186                         for (int i = 0; i < filterString.length(); i++) {
187                                 if (filterString[i] == '1') {
188                                         //add to newMap
189                                         newMap[spot] = spotMap[i];
190                                         spot++;  
191                                 }
192                         }
193                         spotMap = newMap;
194                 }
195
196                 //trim sequences - this follows ccodes remove_extra_gaps 
197                 trimSequences(query);
198                 
199                 
200                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
201                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
202                 windows = findWindows();  
203                 
204
205                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later 
206                 removeBadReferenceSeqs(closest);
207                 
208                 
209                 //find the averages for each querys references
210                 getAverageRef(closest);  //fills sumRef, averageRef, sumSquaredRef and refCombo.
211                 getAverageQuery(closest, query);  //fills sumQuery, averageQuery, sumSquaredQuery.
212                                         
213                 
214                 //find the averages for each querys references 
215                 findVarianceRef();  //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
216                         
217                         
218                 //find the averages for the query 
219                 findVarianceQuery();  //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
220                                         
221                 determineChimeras();  //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. 
222                 
223                 return 0;
224         }
225         catch(exception& e) {
226                 errorOut(e, "Ccode", "getChimeras");
227                 exit(1);
228         }
229 }
230 /***************************************************************************************************************/
231 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
232 void Ccode::trimSequences(Sequence* query) {
233         try {
234                 
235                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
236                 int rearPos = query->getAligned().length();
237                 
238                 //********find first position in closest seqs that is a non gap character***********//
239                 //find first position all query seqs that is a non gap character
240                 for (int i = 0; i < closest.size(); i++) {
241                         
242                         string aligned = closest[i].seq->getAligned();
243                         int pos = 0;
244                         
245                         //find first spot in this seq
246                         for (int j = 0; j < aligned.length(); j++) {
247                                 if (isalpha(aligned[j])) {
248                                         pos = j;
249                                         break;
250                                 }
251                         }
252                         
253                         //save this spot if it is the farthest
254                         if (pos > frontPos) { frontPos = pos; }
255                 }
256                 
257                 //find first position all querySeq[query] that is a non gap character
258                 string aligned = query->getAligned();
259                 int pos = 0;
260                         
261                 //find first spot in this seq
262                 for (int j = 0; j < aligned.length(); j++) {
263                         if (isalpha(aligned[j])) {
264                                 pos = j;
265                                 break;
266                         }
267                 }
268                 
269                 //save this spot if it is the farthest
270                 if (pos > frontPos) { frontPos = pos; }
271                 
272                 
273                 //********find last position in closest seqs that is a non gap character***********//
274                 for (int i = 0; i < closest.size(); i++) {
275                         
276                         string aligned = closest[i].seq->getAligned();
277                         int pos = aligned.length();
278                         
279                         //find first spot in this seq
280                         for (int j = aligned.length()-1; j >= 0; j--) {
281                                 if (isalpha(aligned[j])) {
282                                         pos = j;
283                                         break;
284                                 }
285                         }
286                         
287                         //save this spot if it is the farthest
288                         if (pos < rearPos) { rearPos = pos; }
289                 }
290                 
291                 //find last position all querySeqs[query] that is a non gap character
292                 aligned = query->getAligned();
293                 pos = aligned.length();
294                 
295                 //find first spot in this seq
296                 for (int j = aligned.length()-1; j >= 0; j--) {
297                         if (isalpha(aligned[j])) {
298                                 pos = j;
299                                 break;
300                         }
301                 }
302                 
303                 //save this spot if it is the farthest
304                 if (pos < rearPos) { rearPos = pos; }
305
306                 
307                 //check to make sure that is not whole seq
308                 if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
309                 
310                 map<int, int> tempTrim;
311                 tempTrim[frontPos] = rearPos;
312                 
313                 //save trimmed locations
314                 trim = tempTrim;
315                                                 
316                 //update spotMask
317                 map<int, int> newMap;
318                 int spot = 0;
319                 
320                 for (int i = frontPos; i < rearPos; i++) {
321                         //add to newMap
322                         newMap[spot] = spotMap[i];
323                         spot++;  
324                 }
325                 spotMap = newMap;
326         }
327         catch(exception& e) {
328                 errorOut(e, "Ccode", "trimSequences");
329                 exit(1);
330         }
331 }
332 /***************************************************************************************************************/
333 vector<int> Ccode::findWindows() {
334         try {
335                 
336                 vector<int> win; 
337                 it = trim.begin();
338                 
339                 int length = it->second - it->first;
340         
341                 //default is wanted = 10% of total length
342                 if (windowSizes > length) { 
343                         mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
344                         windowSizes = length / 10;
345                 }else if (windowSizes == 0) { windowSizes = length / 10;  }
346                 else if (windowSizes > (length * 0.20)) {
347                         mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
348                 }else if (windowSizes < (length * 0.05)) {
349                         mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
350                 }
351                 
352                 //save starting points of each window
353                 for (int m = it->first;  m < (it->second-windowSizes); m+=windowSizes) {  win.push_back(m);  }
354                 
355                 //save last window
356                 if (win[win.size()-1] < (it->first+length)) {
357                         win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
358                 }                                                                                                                                                                                                       //with this you would get 1,25,50,75,100
359                 
360                 return win;
361         }
362         catch(exception& e) {
363                 errorOut(e, "Ccode", "findWindows");
364                 exit(1);
365         }
366 }
367 //***************************************************************************************************************
368 int Ccode::getDiff(string seqA, string seqB) {
369         try {
370                 
371                 int numDiff = 0;
372                 
373                 for (int i = 0; i < seqA.length(); i++) {
374                         //if you are both not gaps
375                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
376                                 //are you different
377                                 if (seqA[i] != seqB[i]) { 
378                                          int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
379
380                                         /* the char in base_a and base_b have been checked and they are different */
381                                         if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
382                                         else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
383                                         else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
384                                         else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
385                                         else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
386                                         else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
387                                         else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
388                                         else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
389                                         else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
390                                         else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
391                                         else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
392                                         else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
393                                         else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
394                                         else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
395                                         else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
396                                         else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
397                                         else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
398                                         else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
399                                         else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
400                                         else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
401                                         else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
402                                         else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
403                                         else ok = 0;  /* the bases are different and not equivalent */
404                                         
405                                         //check if they are both blanks
406                                         if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
407                                         else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
408                                         
409                                         if (ok == 0) {  numDiff++;  }
410                                 }
411                         //}
412                 }
413                 
414                 return numDiff;
415         
416         }
417         catch(exception& e) {
418                 errorOut(e, "Ccode", "getDiff");
419                 exit(1);
420         }
421 }
422 //***************************************************************************************************************
423 //tried to make this look most like ccode original implementation
424 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
425         try {
426                 
427                 vector< vector<int> > numDiffBases;
428                 numDiffBases.resize(seqs.size());
429                 //initialize to 0
430                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
431                 
432                 it = trim.begin();
433                 int length = it->second - it->first;
434                 
435                 //calc differences from each sequence to everyother seq in the set
436                 for (int i = 0; i < seqs.size(); i++) {
437                         
438                         string seqA = seqs[i].seq->getAligned().substr(it->first, length);
439                         
440                         //so you don't calc i to j and j to i since they are the same
441                         for (int j = 0; j < i; j++) {
442                                 
443                                 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
444                                 
445                                 //compare strings
446                                 int numDiff = getDiff(seqA, seqB);
447                                 
448                                 numDiffBases[i][j] = numDiff;
449                                 numDiffBases[j][i] = numDiff;
450                         }
451                 }
452                 
453                 //initailize remove to 0
454                 vector<int> remove;  remove.resize(seqs.size(), 0);
455                 float top = ((20*length) / (float) 100);
456                 float bottom = ((0.5*length) / (float) 100);
457                 
458                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
459                 for (int i = 0; i < numDiffBases.size(); i++) {
460                         for (int j = 0; j < i; j++) {   
461                                 //are you more than 20% different
462                                 if (numDiffBases[i][j] > top)           {  remove[j] = 1;  }
463                                 //are you less than 0.5% different
464                                 if (numDiffBases[i][j] < bottom)        {  remove[j] = 1;  }
465                         }
466                 }
467                 
468                 int numSeqsLeft = 0;
469                 
470                 //count seqs that are not going to be removed
471                 for (int i = 0; i < remove.size(); i++) {  
472                         if (remove[i] == 0)  { numSeqsLeft++;  }
473                 }
474                 
475                 //if you have enough then remove bad ones
476                 if (numSeqsLeft >= 3) {
477                         vector<SeqDist> goodSeqs;
478                         //remove bad seqs
479                         for (int i = 0; i < remove.size(); i++) {
480                                 if (remove[i] == 0) { 
481                                         goodSeqs.push_back(seqs[i]);
482                                 }
483                         }
484                         
485                         seqs = goodSeqs;
486                         
487                 }else { //warn, but dont remove any
488                         mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); mothurOutEndLine();  
489                 }
490
491         }
492         catch(exception& e) {
493                 errorOut(e, "Ccode", "removeBadReferenceSeqs");
494                 exit(1);
495         }
496 }
497 //***************************************************************************************************************
498 //makes copy of templateseq for filter
499 vector<SeqDist>  Ccode::findClosest(Sequence* q, int numWanted) {
500         try{
501         
502                 vector<SeqDist>  topMatches;  
503                 
504                 Sequence query = *(q);
505                         
506                 //calc distance to each sequence in template seqs
507                 for (int i = 0; i < templateSeqs.size(); i++) {
508                         
509                         Sequence ref = *(templateSeqs[i]); 
510                                         
511                         //find overall dist
512                         distCalc->calcDist(query, ref);
513                         float dist = distCalc->getDist();       
514                                 
515                         //save distance
516                         SeqDist temp;
517                         temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
518                         temp.dist = dist;
519
520                         topMatches.push_back(temp);
521                 }
522                         
523                 sort(topMatches.begin(), topMatches.end(), compareSeqDist);
524                 
525                 for (int i = numWanted; i < topMatches.size(); i++) {  delete topMatches[i].seq;  }
526                 
527                 topMatches.resize(numWanted);
528                         
529                 return topMatches;
530
531         }
532         catch(exception& e) {
533                 errorOut(e, "Ccode", "findClosestSides");
534                 exit(1);
535         }
536 }
537 /**************************************************************************************************/
538 //find the distances from each reference sequence to every other reference sequence for each window for this query
539 void Ccode::getAverageRef(vector<SeqDist> ref) {
540         try {
541                 
542                 vector< vector< vector<int> > >  diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
543                 
544                 //initialize diffs vector
545                 diffs.resize(ref.size());
546                 for (int i = 0; i < diffs.size(); i++) {  
547                         diffs[i].resize(ref.size());
548                         for (int j = 0; j < diffs[i].size(); j++) {
549                                 diffs[i][j].resize(windows.size(), 0);
550                         }
551                 }
552                 
553                 it = trim.begin();
554                                 
555                 //find the distances from each reference sequence to every other reference sequence for each window for this query              
556                 for (int i = 0; i < ref.size(); i++) {
557                         
558                         string refI = ref[i].seq->getAligned();
559                         
560                         //j<i, so you don't find distances from i to j and then j to i.
561                         for (int j = 0; j < i; j++) {
562                         
563                                 string refJ = ref[j].seq->getAligned();
564                         
565                                 for (int k = 0; k < windows.size(); k++) {
566                                         
567                                         string refIWindowk, refJWindowk;
568                                         
569                                         if (k < windows.size()-1) {
570                                                 //get window strings
571                                                 refIWindowk = refI.substr(windows[k], windowSizes);
572                                                 refJWindowk = refJ.substr(windows[k], windowSizes);
573                                         }else { //last window may be smaller than rest - see findwindows
574                                                 //get window strings
575                                                 refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
576                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
577                                         }
578                                         
579                                         //find differences
580                                         int diff = getDiff(refIWindowk, refJWindowk);
581
582                                         //save differences in [i][j][k] and [j][i][k] since they are the same
583                                         diffs[i][j][k] = diff;
584                                         diffs[j][i][k] = diff;
585
586                                 }//k
587                                 
588                         }//j
589                 
590                 }//i
591                 
592                 //initialize sumRef for this query 
593                 sumRef.resize(windows.size(), 0);
594                 sumSquaredRef.resize(windows.size(), 0);
595                 averageRef.resize(windows.size(), 0);
596                 
597                 //find the sum of the differences for hte reference sequences
598                 for (int i = 0; i < diffs.size(); i++) {
599                         for (int j = 0; j < i; j++) {
600                         
601                                 //increment this querys reference sequences combos
602                                 refCombo++;
603                                 
604                                 for (int k = 0; k < diffs[i][j].size(); k++) {
605                                         sumRef[k] += diffs[i][j][k];
606                                         sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
607                                 }//k
608                                 
609                         }//j
610                 }//i
611
612                 
613                 //find the average of the differences for the references for each window
614                 for (int i = 0; i < windows.size(); i++) {
615                         averageRef[i] = sumRef[i] / (float) refCombo;
616                 }
617                 
618         }
619         catch(exception& e) {
620                 errorOut(e, "Ccode", "getAverageRef");
621                 exit(1);
622         }
623 }
624 /**************************************************************************************************/
625 void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
626         try {
627         
628                 vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
629                 
630                 //initialize diffs vector
631                 diffs.resize(ref.size());
632                 for (int j = 0; j < diffs.size(); j++) {
633                         diffs[j].resize(windows.size(), 0);
634                 }
635                 
636                 it = trim.begin();
637                                                         
638                 string refQuery = query->getAligned();
639                          
640                 //j<i, so you don't find distances from i to j and then j to i.
641                 for (int j = 0; j < ref.size(); j++) {
642                          
643                          string refJ = ref[j].seq->getAligned();
644                          
645                          for (int k = 0; k < windows.size(); k++) {
646                                         
647                                         string QueryWindowk, refJWindowk;
648                                         
649                                         if (k < windows.size()-1) {
650                                                 //get window strings
651                                                 QueryWindowk = refQuery.substr(windows[k], windowSizes);
652                                                 refJWindowk = refJ.substr(windows[k], windowSizes);                                     
653                                         }else { //last window may be smaller than rest - see findwindows
654                                                 //get window strings
655                                                 QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
656                                                 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
657                                         }
658                                         
659                                         //find differences
660                                         int diff = getDiff(QueryWindowk, refJWindowk);
661                                         
662                                         //save differences 
663                                         diffs[j][k] = diff;
664                          
665                          }//k
666                 }//j
667                          
668                 
669                 //initialize sumRef for this query 
670                 sumQuery.resize(windows.size(), 0);
671                 sumSquaredQuery.resize(windows.size(), 0);
672                 averageQuery.resize(windows.size(), 0);
673                 
674                 //find the sum of the differences 
675                 for (int j = 0; j < diffs.size(); j++) {
676                         for (int k = 0; k < diffs[j].size(); k++) {
677                                 sumQuery[k] += diffs[j][k];
678                                 sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
679                         }//k
680                 }//j
681                 
682                 
683                 //find the average of the differences for the references for each window
684                 for (int i = 0; i < windows.size(); i++) {
685                         averageQuery[i] = sumQuery[i] / (float) ref.size();
686                 }
687         }
688         catch(exception& e) {
689                 errorOut(e, "Ccode", "getAverageQuery");
690                 exit(1);
691         }
692 }
693 /**************************************************************************************************/
694 void Ccode::findVarianceRef() {
695         try {
696                 
697                 varRef.resize(windows.size(), 0);
698                 sdRef.resize(windows.size(), 0);
699                 
700                 //for each window
701                 for (int i = 0; i < windows.size(); i++) {
702                         varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
703                         sdRef[i] = sqrt(varRef[i]);
704                         
705                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
706                         if (averageRef[i] < 0.001)                      {       averageRef[i] = 0.001;          }
707                         if (sumRef[i] < 0.001)                          {       sumRef[i] = 0.001;                      }
708                         if (varRef[i] < 0.001)                          {       varRef[i] = 0.001;                      }
709                         if (sumSquaredRef[i] < 0.001)           {       sumSquaredRef[i] = 0.001;       }
710                         if (sdRef[i] < 0.001)                           {       sdRef[i] = 0.001;                       }
711                                 
712                 }
713         }
714         catch(exception& e) {
715                 errorOut(e, "Ccode", "findVarianceRef");
716                 exit(1);
717         }
718 }
719 /**************************************************************************************************/
720 void Ccode::findVarianceQuery() {
721         try {
722                 varQuery.resize(windows.size(), 0);
723                 sdQuery.resize(windows.size(), 0);
724                 
725                 //for each window
726                 for (int i = 0; i < windows.size(); i++) {
727                         varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
728                         sdQuery[i] = sqrt(varQuery[i]);
729                         
730                         //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
731                         if (averageQuery[i] < 0.001)                    {       averageQuery[i] = 0.001;                }
732                         if (sumQuery[i] < 0.001)                                {       sumQuery[i] = 0.001;                    }
733                         if (varQuery[i] < 0.001)                                {       varQuery[i] = 0.001;                    }
734                         if (sumSquaredQuery[i] < 0.001)         {       sumSquaredQuery[i] = 0.001;     }
735                         if (sdQuery[i] < 0.001)                         {       sdQuery[i] = 0.001;                     }
736                 }
737
738         }
739         catch(exception& e) {
740                 errorOut(e, "Ccode", "findVarianceQuery");
741                 exit(1);
742         }
743 }
744 /**************************************************************************************************/
745 void Ccode::determineChimeras() {
746         try {
747                 
748                 isChimericConfidence.resize(windows.size(), false);
749                 isChimericTStudent.resize(windows.size(), false);
750                 isChimericANOVA.resize(windows.size(), false);
751                 anova.resize(windows.size());
752
753                 
754                 //for each window
755                 for (int i = 0; i < windows.size(); i++) {
756                 
757                         //get confidence limits
758                         float t = getT(closest.size()-1);  //how many seqs you are comparing to this querySeq
759                         float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i]; 
760                         float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i]; 
761                 
762                         if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) {  /* range does not include 1 */
763                                         isChimericConfidence[i] = true;   /* significantly higher at P<0.05 */ 
764
765                         }
766                         
767                         //student t test
768                         int degreeOfFreedom = refCombo + closest.size() - 2;
769                         float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size()));    /* denominator, without sqrt(), for ts calculations */
770                                 
771                         float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT)));  /* value of ts for t-student test */   
772                         t = getT(degreeOfFreedom);
773         
774                         if ((ts >= t) && (averageQuery[i] > averageRef[i])) {   
775                                 isChimericTStudent[i] = true;   /* significantly higher at P<0.05 */ 
776                         }
777                 
778                         //anova test
779                         float value1 = sumQuery[i] + sumRef[i];
780                         float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
781                         float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
782                         float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
783                         float value5 = value2 - value4;
784                         float value6 = value3 - value4;
785                         float value7 = value5 - value6;
786                         float value8 = value7 / ((float) degreeOfFreedom);
787                         float anovaValue = value6 / value8;
788                         
789                         float f = getF(degreeOfFreedom);
790                         
791                          if ((anovaValue >= f) && (averageQuery[i] > averageRef[i]))  {
792                        isChimericANOVA[i] = true;   /* significant P<0.05 */
793                 }
794                         
795                         if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
796                         
797                         anova[i] = anovaValue;
798                 }
799                 
800         }
801         catch(exception& e) {
802                 errorOut(e, "Ccode", "determineChimeras");
803                 exit(1);
804         }
805 }
806 /**************************************************************************************************/    
807 float Ccode::getT(int numseq) {
808         try {
809         
810                 float tvalue = 0;
811                 
812                 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
813                 if (numseq > 120) tvalue = 1.645;
814                 else if (numseq > 60) tvalue = 1.658;
815         else if (numseq > 40) tvalue = 1.671;
816         else if (numseq > 30) tvalue = 1.684;
817         else if (numseq > 29) tvalue = 1.697;
818         else if (numseq > 28) tvalue = 1.699;
819         else if (numseq > 27) tvalue = 1.701;
820         else if (numseq > 26) tvalue = 1.703;
821         else if (numseq > 25) tvalue = 1.706;
822         else if (numseq > 24) tvalue = 1.708;
823         else if (numseq > 23) tvalue = 1.711;
824         else if (numseq > 22) tvalue = 1.714;
825         else if (numseq > 21) tvalue = 1.717;
826         else if (numseq > 20) tvalue = 1.721;
827         else if (numseq > 19) tvalue = 1.725;
828         else if (numseq > 18) tvalue = 1.729;
829         else if (numseq > 17) tvalue = 1.734;
830         else if (numseq > 16) tvalue = 1.740;
831         else if (numseq > 15) tvalue = 1.746;
832         else if (numseq > 14) tvalue = 1.753;
833         else if (numseq > 13) tvalue = 1.761;
834         else if (numseq > 12) tvalue = 1.771;
835         else if (numseq > 11) tvalue = 1.782;
836         else if (numseq > 10) tvalue = 1.796;
837         else if (numseq > 9) tvalue = 1.812;
838         else if (numseq > 8) tvalue = 1.833;
839         else if (numseq > 7) tvalue = 1.860;
840         else if (numseq > 6) tvalue = 1.895;
841         else if (numseq > 5) tvalue = 1.943;
842         else if (numseq > 4) tvalue = 2.015;
843         else if (numseq > 3) tvalue = 2.132;
844         else if (numseq > 2) tvalue = 2.353;
845         else if (numseq > 1) tvalue = 2.920;
846                 else if (numseq <= 1) {
847                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
848                 }
849                 
850                 return tvalue;
851         }
852         catch(exception& e) {
853                 errorOut(e, "Ccode", "getT");
854                 exit(1);
855         }
856 }
857 /**************************************************************************************************/    
858 float Ccode::getF(int numseq) {
859         try {
860         
861                 float fvalue = 0;
862                 
863                  /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
864         if (numseq > 120) fvalue = 3.84;
865         else if (numseq > 60) fvalue = 3.92;
866         else if (numseq > 40) fvalue = 4.00;
867         else if (numseq > 30) fvalue = 4.08;
868         else if (numseq > 29) fvalue = 4.17;
869         else if (numseq > 28) fvalue = 4.18;
870         else if (numseq > 27) fvalue = 4.20;
871         else if (numseq > 26) fvalue = 4.21;
872         else if (numseq > 25) fvalue = 4.23;
873         else if (numseq > 24) fvalue = 4.24;
874         else if (numseq > 23) fvalue = 4.26;
875         else if (numseq > 22) fvalue = 4.28;
876         else if (numseq > 21) fvalue = 4.30;
877         else if (numseq > 20) fvalue = 4.32;
878         else if (numseq > 19) fvalue = 4.35;
879         else if (numseq > 18) fvalue = 4.38;
880         else if (numseq > 17) fvalue = 4.41;
881         else if (numseq > 16) fvalue = 4.45;
882         else if (numseq > 15) fvalue = 4.49;
883         else if (numseq > 14) fvalue = 4.54;
884         else if (numseq > 13) fvalue = 4.60;
885         else if (numseq > 12) fvalue = 4.67;
886         else if (numseq > 11) fvalue = 4.75;
887         else if (numseq > 10) fvalue = 4.84;
888         else if (numseq > 9) fvalue = 4.96;
889         else if (numseq > 8) fvalue = 5.12;
890         else if (numseq > 7) fvalue = 5.32;
891         else if (numseq > 6) fvalue = 5.59;
892         else if (numseq > 5) fvalue = 5.99;
893         else if (numseq > 4) fvalue = 6.61;
894         else if (numseq > 3) fvalue = 7.71;
895         else if (numseq > 2) fvalue = 10.1;
896         else if (numseq > 1) fvalue = 18.5;
897         else if (numseq > 0) fvalue = 161;
898                 else if (numseq <= 0) {
899                         mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine();
900         }
901                 
902                 return fvalue;
903         }
904         catch(exception& e) {
905                 errorOut(e, "Ccode", "getF");
906                 exit(1);
907         }
908 }
909 //***************************************************************************************************************
910
911
912