5 * Created by westcott on 8/24/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string o) {
17 fastafile = filename; outputDir = o;
18 distCalc = new eachGapDist();
19 decalc = new DeCalculator();
21 mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo";
23 openOutputFile(mapInfo, out2);
25 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
28 //***************************************************************************************************************
33 //***************************************************************************************************************
34 void Ccode::printHeader(ostream& out) {
35 out << "For full window mapping info refer to " << mapInfo << endl << endl;
37 //***************************************************************************************************************
38 int Ccode::print(ostream& out, ostream& outAcc) {
42 openOutputFileAppend(mapInfo, out2);
44 out2 << querySeq->getName() << endl;
45 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
46 out2 << it->first << '\t' << it->second << endl;
49 out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
51 for (int j = 0; j < closest.size(); j++) {
52 out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
57 //window mapping info.
58 out << "Mapping information: ";
59 //you mask and did not filter
60 if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
62 //you filtered and did not mask
63 if ((seqMask == "") && (filter)) { out << "filter and trim."; }
65 //you masked and filtered
66 if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
68 out << endl << "Window\tStartPos\tEndPos" << endl;
70 for (int k = 0; k < windows.size()-1; k++) {
71 out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
74 out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
76 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
77 for (int k = 0; k < windows.size(); k++) {
78 float ds = averageQuery[k] / averageRef[k];
79 out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
85 /* F test for differences among variances.
86 * varQuery is expected to be higher or similar than varRef */
87 //float fs = varQuery[query] / varRef[query]; /* F-Snedecor, test for differences of variances */
91 //confidence limit, t - Student, anova
92 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
94 for (int k = 0; k < windows.size(); k++) {
96 if (isChimericConfidence[k]) { temp += "*\t"; }
97 else { temp += "\t"; }
99 if (isChimericTStudent[k]) { temp += "*\t"; }
100 else { temp += "\t"; }
102 if (isChimericANOVA[k]) { temp += "*\t"; }
103 else { temp += "\t"; }
105 out << k+1 << '\t' << temp << endl;
107 if (temp == "*\t*\t*\t") { results = true; }
112 m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
113 outAcc << querySeq->getName() << endl;
117 for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; }
121 catch(exception& e) {
122 m->errorOut(e, "Ccode", "print");
126 //***************************************************************************************************************
127 int Ccode::getChimeras(Sequence* query) {
138 sumSquaredRef.clear();
139 sumSquaredQuery.clear();
141 averageQuery.clear();
143 isChimericConfidence.clear();
144 isChimericTStudent.clear();
145 isChimericANOVA.clear();
148 windowSizes = window;
154 //find closest matches to query
155 closest = findClosest(query, numWanted);
157 if (m->control_pressed) { return 0; }
160 for (int i = 0; i < query->getAligned().length(); i++) { spotMap[i] = i; }
162 //mask sequences if the user wants to
164 decalc->setMask(seqMask);
166 decalc->runMask(query);
169 for (int i = 0; i < closest.size(); i++) { decalc->runMask(closest[i].seq); }
171 spotMap = decalc->getMaskMap();
175 vector<Sequence*> temp;
176 for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq); }
177 temp.push_back(query);
179 createFilter(temp, 0.5);
181 for (int i = 0; i < temp.size(); i++) {
182 if (m->control_pressed) { return 0; }
187 map<int, int> newMap;
190 for (int i = 0; i < filterString.length(); i++) {
191 if (filterString[i] == '1') {
193 newMap[spot] = spotMap[i];
200 //trim sequences - this follows ccodes remove_extra_gaps
201 trimSequences(query);
202 if (m->control_pressed) { return 0; }
204 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().
205 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
206 windows = findWindows();
207 if (m->control_pressed) { return 0; }
209 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
210 removeBadReferenceSeqs(closest);
211 if (m->control_pressed) { return 0; }
213 //find the averages for each querys references
214 getAverageRef(closest); //fills sumRef, averageRef, sumSquaredRef and refCombo.
215 getAverageQuery(closest, query); //fills sumQuery, averageQuery, sumSquaredQuery.
216 if (m->control_pressed) { return 0; }
218 //find the averages for each querys references
219 findVarianceRef(); //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
220 if (m->control_pressed) { return 0; }
222 //find the averages for the query
223 findVarianceQuery(); //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
224 if (m->control_pressed) { return 0; }
226 determineChimeras(); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA.
227 if (m->control_pressed) { return 0; }
231 catch(exception& e) {
232 m->errorOut(e, "Ccode", "getChimeras");
236 /***************************************************************************************************************/
237 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
238 void Ccode::trimSequences(Sequence* query) {
241 int frontPos = 0; //should contain first position in all seqs that is not a gap character
242 int rearPos = query->getAligned().length();
244 //********find first position in closest seqs that is a non gap character***********//
245 //find first position all query seqs that is a non gap character
246 for (int i = 0; i < closest.size(); i++) {
248 string aligned = closest[i].seq->getAligned();
251 //find first spot in this seq
252 for (int j = 0; j < aligned.length(); j++) {
253 if (isalpha(aligned[j])) {
259 //save this spot if it is the farthest
260 if (pos > frontPos) { frontPos = pos; }
263 //find first position all querySeq[query] that is a non gap character
264 string aligned = query->getAligned();
267 //find first spot in this seq
268 for (int j = 0; j < aligned.length(); j++) {
269 if (isalpha(aligned[j])) {
275 //save this spot if it is the farthest
276 if (pos > frontPos) { frontPos = pos; }
279 //********find last position in closest seqs that is a non gap character***********//
280 for (int i = 0; i < closest.size(); i++) {
282 string aligned = closest[i].seq->getAligned();
283 int pos = aligned.length();
285 //find first spot in this seq
286 for (int j = aligned.length()-1; j >= 0; j--) {
287 if (isalpha(aligned[j])) {
293 //save this spot if it is the farthest
294 if (pos < rearPos) { rearPos = pos; }
297 //find last position all querySeqs[query] that is a non gap character
298 aligned = query->getAligned();
299 pos = aligned.length();
301 //find first spot in this seq
302 for (int j = aligned.length()-1; j >= 0; j--) {
303 if (isalpha(aligned[j])) {
309 //save this spot if it is the farthest
310 if (pos < rearPos) { rearPos = pos; }
313 //check to make sure that is not whole seq
314 if ((rearPos - frontPos - 1) <= 0) { m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1); }
316 map<int, int> tempTrim;
317 tempTrim[frontPos] = rearPos;
319 //save trimmed locations
323 map<int, int> newMap;
326 for (int i = frontPos; i < rearPos; i++) {
328 newMap[spot] = spotMap[i];
333 catch(exception& e) {
334 m->errorOut(e, "Ccode", "trimSequences");
338 /***************************************************************************************************************/
339 vector<int> Ccode::findWindows() {
345 int length = it->second - it->first;
347 //default is wanted = 10% of total length
348 if (windowSizes > length) {
349 m->mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
350 windowSizes = length / 10;
351 }else if (windowSizes == 0) { windowSizes = length / 10; }
352 else if (windowSizes > (length * 0.20)) {
353 m->mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
354 }else if (windowSizes < (length * 0.05)) {
355 m->mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
358 //save starting points of each window
359 for (int m = it->first; m < (it->second-windowSizes); m+=windowSizes) { win.push_back(m); }
362 if (win[win.size()-1] < (it->first+length)) {
363 win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
364 } //with this you would get 1,25,50,75,100
368 catch(exception& e) {
369 m->errorOut(e, "Ccode", "findWindows");
373 //***************************************************************************************************************
374 int Ccode::getDiff(string seqA, string seqB) {
379 for (int i = 0; i < seqA.length(); i++) {
380 //if you are both not gaps
381 //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
383 if (seqA[i] != seqB[i]) {
384 int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
386 /* the char in base_a and base_b have been checked and they are different */
387 if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
388 else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
389 else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
390 else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
391 else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
392 else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
393 else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
394 else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
395 else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
396 else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
397 else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
398 else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
399 else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
400 else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
401 else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
402 else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
403 else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
404 else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
405 else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
406 else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
407 else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
408 else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
409 else ok = 0; /* the bases are different and not equivalent */
411 //check if they are both blanks
412 if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
413 else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
415 if (ok == 0) { numDiff++; }
423 catch(exception& e) {
424 m->errorOut(e, "Ccode", "getDiff");
428 //***************************************************************************************************************
429 //tried to make this look most like ccode original implementation
430 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
433 vector< vector<int> > numDiffBases;
434 numDiffBases.resize(seqs.size());
436 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
439 int length = it->second - it->first;
441 //calc differences from each sequence to everyother seq in the set
442 for (int i = 0; i < seqs.size(); i++) {
444 string seqA = seqs[i].seq->getAligned().substr(it->first, length);
446 //so you don't calc i to j and j to i since they are the same
447 for (int j = 0; j < i; j++) {
449 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
452 int numDiff = getDiff(seqA, seqB);
454 numDiffBases[i][j] = numDiff;
455 numDiffBases[j][i] = numDiff;
459 //initailize remove to 0
460 vector<int> remove; remove.resize(seqs.size(), 0);
461 float top = ((20*length) / (float) 100);
462 float bottom = ((0.5*length) / (float) 100);
464 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
465 for (int i = 0; i < numDiffBases.size(); i++) {
466 for (int j = 0; j < i; j++) {
467 //are you more than 20% different
468 if (numDiffBases[i][j] > top) { remove[j] = 1; }
469 //are you less than 0.5% different
470 if (numDiffBases[i][j] < bottom) { remove[j] = 1; }
476 //count seqs that are not going to be removed
477 for (int i = 0; i < remove.size(); i++) {
478 if (remove[i] == 0) { numSeqsLeft++; }
481 //if you have enough then remove bad ones
482 if (numSeqsLeft >= 3) {
483 vector<SeqDist> goodSeqs;
485 for (int i = 0; i < remove.size(); i++) {
486 if (remove[i] == 0) {
487 goodSeqs.push_back(seqs[i]);
493 }else { //warn, but dont remove any
494 m->mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); m->mothurOutEndLine();
498 catch(exception& e) {
499 m->errorOut(e, "Ccode", "removeBadReferenceSeqs");
503 //***************************************************************************************************************
504 //makes copy of templateseq for filter
505 vector<SeqDist> Ccode::findClosest(Sequence* q, int numWanted) {
508 vector<SeqDist> topMatches;
510 Sequence query = *(q);
512 //calc distance to each sequence in template seqs
513 for (int i = 0; i < templateSeqs.size(); i++) {
515 Sequence ref = *(templateSeqs[i]);
518 distCalc->calcDist(query, ref);
519 float dist = distCalc->getDist();
523 temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
526 topMatches.push_back(temp);
529 sort(topMatches.begin(), topMatches.end(), compareSeqDist);
531 for (int i = numWanted; i < topMatches.size(); i++) { delete topMatches[i].seq; }
533 topMatches.resize(numWanted);
538 catch(exception& e) {
539 m->errorOut(e, "Ccode", "findClosestSides");
543 /**************************************************************************************************/
544 //find the distances from each reference sequence to every other reference sequence for each window for this query
545 void Ccode::getAverageRef(vector<SeqDist> ref) {
548 vector< vector< vector<int> > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
550 //initialize diffs vector
551 diffs.resize(ref.size());
552 for (int i = 0; i < diffs.size(); i++) {
553 diffs[i].resize(ref.size());
554 for (int j = 0; j < diffs[i].size(); j++) {
555 diffs[i][j].resize(windows.size(), 0);
561 //find the distances from each reference sequence to every other reference sequence for each window for this query
562 for (int i = 0; i < ref.size(); i++) {
564 string refI = ref[i].seq->getAligned();
566 //j<i, so you don't find distances from i to j and then j to i.
567 for (int j = 0; j < i; j++) {
569 string refJ = ref[j].seq->getAligned();
571 for (int k = 0; k < windows.size(); k++) {
573 string refIWindowk, refJWindowk;
575 if (k < windows.size()-1) {
577 refIWindowk = refI.substr(windows[k], windowSizes);
578 refJWindowk = refJ.substr(windows[k], windowSizes);
579 }else { //last window may be smaller than rest - see findwindows
581 refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
582 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
586 int diff = getDiff(refIWindowk, refJWindowk);
588 //save differences in [i][j][k] and [j][i][k] since they are the same
589 diffs[i][j][k] = diff;
590 diffs[j][i][k] = diff;
598 //initialize sumRef for this query
599 sumRef.resize(windows.size(), 0);
600 sumSquaredRef.resize(windows.size(), 0);
601 averageRef.resize(windows.size(), 0);
603 //find the sum of the differences for hte reference sequences
604 for (int i = 0; i < diffs.size(); i++) {
605 for (int j = 0; j < i; j++) {
607 //increment this querys reference sequences combos
610 for (int k = 0; k < diffs[i][j].size(); k++) {
611 sumRef[k] += diffs[i][j][k];
612 sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
619 //find the average of the differences for the references for each window
620 for (int i = 0; i < windows.size(); i++) {
621 averageRef[i] = sumRef[i] / (float) refCombo;
625 catch(exception& e) {
626 m->errorOut(e, "Ccode", "getAverageRef");
630 /**************************************************************************************************/
631 void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
634 vector< vector<int> > diffs; //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
636 //initialize diffs vector
637 diffs.resize(ref.size());
638 for (int j = 0; j < diffs.size(); j++) {
639 diffs[j].resize(windows.size(), 0);
644 string refQuery = query->getAligned();
646 //j<i, so you don't find distances from i to j and then j to i.
647 for (int j = 0; j < ref.size(); j++) {
649 string refJ = ref[j].seq->getAligned();
651 for (int k = 0; k < windows.size(); k++) {
653 string QueryWindowk, refJWindowk;
655 if (k < windows.size()-1) {
657 QueryWindowk = refQuery.substr(windows[k], windowSizes);
658 refJWindowk = refJ.substr(windows[k], windowSizes);
659 }else { //last window may be smaller than rest - see findwindows
661 QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
662 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
666 int diff = getDiff(QueryWindowk, refJWindowk);
675 //initialize sumRef for this query
676 sumQuery.resize(windows.size(), 0);
677 sumSquaredQuery.resize(windows.size(), 0);
678 averageQuery.resize(windows.size(), 0);
680 //find the sum of the differences
681 for (int j = 0; j < diffs.size(); j++) {
682 for (int k = 0; k < diffs[j].size(); k++) {
683 sumQuery[k] += diffs[j][k];
684 sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
689 //find the average of the differences for the references for each window
690 for (int i = 0; i < windows.size(); i++) {
691 averageQuery[i] = sumQuery[i] / (float) ref.size();
694 catch(exception& e) {
695 m->errorOut(e, "Ccode", "getAverageQuery");
699 /**************************************************************************************************/
700 void Ccode::findVarianceRef() {
703 varRef.resize(windows.size(), 0);
704 sdRef.resize(windows.size(), 0);
707 for (int i = 0; i < windows.size(); i++) {
708 varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
709 sdRef[i] = sqrt(varRef[i]);
711 //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
712 if (averageRef[i] < 0.001) { averageRef[i] = 0.001; }
713 if (sumRef[i] < 0.001) { sumRef[i] = 0.001; }
714 if (varRef[i] < 0.001) { varRef[i] = 0.001; }
715 if (sumSquaredRef[i] < 0.001) { sumSquaredRef[i] = 0.001; }
716 if (sdRef[i] < 0.001) { sdRef[i] = 0.001; }
720 catch(exception& e) {
721 m->errorOut(e, "Ccode", "findVarianceRef");
725 /**************************************************************************************************/
726 void Ccode::findVarianceQuery() {
728 varQuery.resize(windows.size(), 0);
729 sdQuery.resize(windows.size(), 0);
732 for (int i = 0; i < windows.size(); i++) {
733 varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
734 sdQuery[i] = sqrt(varQuery[i]);
736 //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
737 if (averageQuery[i] < 0.001) { averageQuery[i] = 0.001; }
738 if (sumQuery[i] < 0.001) { sumQuery[i] = 0.001; }
739 if (varQuery[i] < 0.001) { varQuery[i] = 0.001; }
740 if (sumSquaredQuery[i] < 0.001) { sumSquaredQuery[i] = 0.001; }
741 if (sdQuery[i] < 0.001) { sdQuery[i] = 0.001; }
745 catch(exception& e) {
746 m->errorOut(e, "Ccode", "findVarianceQuery");
750 /**************************************************************************************************/
751 void Ccode::determineChimeras() {
754 isChimericConfidence.resize(windows.size(), false);
755 isChimericTStudent.resize(windows.size(), false);
756 isChimericANOVA.resize(windows.size(), false);
757 anova.resize(windows.size());
761 for (int i = 0; i < windows.size(); i++) {
763 //get confidence limits
764 float t = getT(closest.size()-1); //how many seqs you are comparing to this querySeq
765 float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i];
766 float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i];
768 if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) { /* range does not include 1 */
769 isChimericConfidence[i] = true; /* significantly higher at P<0.05 */
774 int degreeOfFreedom = refCombo + closest.size() - 2;
775 float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size())); /* denominator, without sqrt(), for ts calculations */
777 float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT))); /* value of ts for t-student test */
778 t = getT(degreeOfFreedom);
780 if ((ts >= t) && (averageQuery[i] > averageRef[i])) {
781 isChimericTStudent[i] = true; /* significantly higher at P<0.05 */
785 float value1 = sumQuery[i] + sumRef[i];
786 float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
787 float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
788 float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
789 float value5 = value2 - value4;
790 float value6 = value3 - value4;
791 float value7 = value5 - value6;
792 float value8 = value7 / ((float) degreeOfFreedom);
793 float anovaValue = value6 / value8;
795 float f = getF(degreeOfFreedom);
797 if ((anovaValue >= f) && (averageQuery[i] > averageRef[i])) {
798 isChimericANOVA[i] = true; /* significant P<0.05 */
801 if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
803 anova[i] = anovaValue;
807 catch(exception& e) {
808 m->errorOut(e, "Ccode", "determineChimeras");
812 /**************************************************************************************************/
813 float Ccode::getT(int numseq) {
818 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
819 if (numseq > 120) tvalue = 1.645;
820 else if (numseq > 60) tvalue = 1.658;
821 else if (numseq > 40) tvalue = 1.671;
822 else if (numseq > 30) tvalue = 1.684;
823 else if (numseq > 29) tvalue = 1.697;
824 else if (numseq > 28) tvalue = 1.699;
825 else if (numseq > 27) tvalue = 1.701;
826 else if (numseq > 26) tvalue = 1.703;
827 else if (numseq > 25) tvalue = 1.706;
828 else if (numseq > 24) tvalue = 1.708;
829 else if (numseq > 23) tvalue = 1.711;
830 else if (numseq > 22) tvalue = 1.714;
831 else if (numseq > 21) tvalue = 1.717;
832 else if (numseq > 20) tvalue = 1.721;
833 else if (numseq > 19) tvalue = 1.725;
834 else if (numseq > 18) tvalue = 1.729;
835 else if (numseq > 17) tvalue = 1.734;
836 else if (numseq > 16) tvalue = 1.740;
837 else if (numseq > 15) tvalue = 1.746;
838 else if (numseq > 14) tvalue = 1.753;
839 else if (numseq > 13) tvalue = 1.761;
840 else if (numseq > 12) tvalue = 1.771;
841 else if (numseq > 11) tvalue = 1.782;
842 else if (numseq > 10) tvalue = 1.796;
843 else if (numseq > 9) tvalue = 1.812;
844 else if (numseq > 8) tvalue = 1.833;
845 else if (numseq > 7) tvalue = 1.860;
846 else if (numseq > 6) tvalue = 1.895;
847 else if (numseq > 5) tvalue = 1.943;
848 else if (numseq > 4) tvalue = 2.015;
849 else if (numseq > 3) tvalue = 2.132;
850 else if (numseq > 2) tvalue = 2.353;
851 else if (numseq > 1) tvalue = 2.920;
852 else if (numseq <= 1) {
853 m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
858 catch(exception& e) {
859 m->errorOut(e, "Ccode", "getT");
863 /**************************************************************************************************/
864 float Ccode::getF(int numseq) {
869 /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
870 if (numseq > 120) fvalue = 3.84;
871 else if (numseq > 60) fvalue = 3.92;
872 else if (numseq > 40) fvalue = 4.00;
873 else if (numseq > 30) fvalue = 4.08;
874 else if (numseq > 29) fvalue = 4.17;
875 else if (numseq > 28) fvalue = 4.18;
876 else if (numseq > 27) fvalue = 4.20;
877 else if (numseq > 26) fvalue = 4.21;
878 else if (numseq > 25) fvalue = 4.23;
879 else if (numseq > 24) fvalue = 4.24;
880 else if (numseq > 23) fvalue = 4.26;
881 else if (numseq > 22) fvalue = 4.28;
882 else if (numseq > 21) fvalue = 4.30;
883 else if (numseq > 20) fvalue = 4.32;
884 else if (numseq > 19) fvalue = 4.35;
885 else if (numseq > 18) fvalue = 4.38;
886 else if (numseq > 17) fvalue = 4.41;
887 else if (numseq > 16) fvalue = 4.45;
888 else if (numseq > 15) fvalue = 4.49;
889 else if (numseq > 14) fvalue = 4.54;
890 else if (numseq > 13) fvalue = 4.60;
891 else if (numseq > 12) fvalue = 4.67;
892 else if (numseq > 11) fvalue = 4.75;
893 else if (numseq > 10) fvalue = 4.84;
894 else if (numseq > 9) fvalue = 4.96;
895 else if (numseq > 8) fvalue = 5.12;
896 else if (numseq > 7) fvalue = 5.32;
897 else if (numseq > 6) fvalue = 5.59;
898 else if (numseq > 5) fvalue = 5.99;
899 else if (numseq > 4) fvalue = 6.61;
900 else if (numseq > 3) fvalue = 7.71;
901 else if (numseq > 2) fvalue = 10.1;
902 else if (numseq > 1) fvalue = 18.5;
903 else if (numseq > 0) fvalue = 161;
904 else if (numseq <= 0) {
905 m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
910 catch(exception& e) {
911 m->errorOut(e, "Ccode", "getF");
915 //***************************************************************************************************************