5 * Created by westcott on 8/24/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
14 //***************************************************************************************************************
15 Ccode::Ccode(string filename, string temp, bool f, string mask, int win, int numW, string o) : Chimera() {
20 templateFileName = temp; templateSeqs = readSeqs(temp);
26 distCalc = new eachGapDist();
27 decalc = new DeCalculator();
29 mapInfo = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "mapinfo";
33 //char* inFileName = new char[mapInfo.length()];
34 //memcpy(inFileName, mapInfo.c_str(), mapInfo.length());
36 char inFileName[1024];
37 strcpy(inFileName, mapInfo.c_str());
39 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY;
41 MPI_File_open(MPI_COMM_WORLD, inFileName, outMode, MPI_INFO_NULL, &outMap); //comm, filename, mode, info, filepointer
46 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
49 string outString = "Place in masked, filtered and trimmed sequence\tPlace in original alignment\n";
52 int length = outString.length();
53 char* buf2 = new char[length];
54 memcpy(buf2, outString.c_str(), length);
56 MPI_File_write_shared(outMap, buf2, length, MPI_CHAR, &status);
62 m->openOutputFile(mapInfo, out2);
64 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
69 m->errorOut(e, "Ccode", "Ccode");
73 //***************************************************************************************************************
79 MPI_File_close(&outMap);
82 //***************************************************************************************************************
83 Sequence Ccode::print(ostream& out, ostream& outAcc) {
87 m->openOutputFileAppend(mapInfo, out2);
89 out2 << querySeq->getName() << endl;
90 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
91 out2 << it->first << '\t' << it->second << endl;
94 out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
96 for (int j = 0; j < closest.size(); j++) {
97 out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
102 //window mapping info.
103 out << "Mapping information: ";
104 //you mask and did not filter
105 if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
107 //you filtered and did not mask
108 if ((seqMask == "") && (filter)) { out << "filter and trim."; }
110 //you masked and filtered
111 if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
113 out << endl << "Window\tStartPos\tEndPos" << endl;
115 for (int k = 0; k < windows.size()-1; k++) {
116 out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
119 out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
121 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
122 for (int k = 0; k < windows.size(); k++) {
123 float ds = averageQuery[k] / averageRef[k];
124 out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
130 /* F test for differences among variances.
131 * varQuery is expected to be higher or similar than varRef */
132 //float fs = varQuery[query] / varRef[query]; /* F-Snedecor, test for differences of variances */
134 bool results = false;
136 //confidence limit, t - Student, anova
137 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
139 for (int k = 0; k < windows.size(); k++) {
141 if (isChimericConfidence[k]) { temp += "*\t"; }
142 else { temp += "\t"; }
144 if (isChimericTStudent[k]) { temp += "*\t"; }
145 else { temp += "\t"; }
147 if (isChimericANOVA[k]) { temp += "*\t"; }
148 else { temp += "\t"; }
150 out << k+1 << '\t' << temp << endl;
152 if (temp == "*\t*\t*\t") { results = true; }
157 m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
158 outAcc << querySeq->getName() << endl;
162 for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; }
166 catch(exception& e) {
167 m->errorOut(e, "Ccode", "print");
172 //***************************************************************************************************************
173 Sequence Ccode::print(MPI_File& out, MPI_File& outAcc) {
176 string outMapString = "";
178 outMapString += querySeq->getName() + "\n";
179 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
180 outMapString += toString(it->first) + "\t" + toString(it->second) + "\n";
182 printMapping(outMapString);
185 string outString = "";
186 string outAccString = "";
188 outString += querySeq->getName() + "\n\nReference sequences used and distance to query:\n";
190 for (int j = 0; j < closest.size(); j++) {
191 outString += closest[j].seq->getName() + "\t" + toString(closest[j].dist) + "\n";
193 outString += "\n\nMapping information: ";
196 //window mapping info.
197 //you mask and did not filter
198 if ((seqMask != "") && (!filter)) { outString += "mask and trim."; }
200 //you filtered and did not mask
201 if ((seqMask == "") && (filter)) { outString += "filter and trim."; }
203 //you masked and filtered
204 if ((seqMask != "") && (filter)) { outString += "mask, filter and trim."; }
206 outString += "\nWindow\tStartPos\tEndPos\n";
208 for (int k = 0; k < windows.size()-1; k++) {
209 outString += toString(k+1) + "\t" + toString(spotMap[windows[k]-it->first]) + "\t" + toString(spotMap[windows[k]-it->first+windowSizes]) + "\n";
212 outString += toString(windows.size()) + "\t" + toString(spotMap[windows[windows.size()-1]-it->first]) + "\t" + toString(spotMap[it->second-it->first-1]) + "\n\n";
214 outString += "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova\n";
215 for (int k = 0; k < windows.size(); k++) {
216 float ds = averageQuery[k] / averageRef[k];
217 outString += toString(k+1) + "\t" + toString(averageQuery[k]) + "\t" + toString(sdQuery[k]) + "\t" + toString(averageRef[k]) + "\t" + toString(sdRef[k]) + "\t" + toString(ds) + "\t" + toString(anova[k]) + "\n";
222 /* F test for differences among variances.
223 * varQuery is expected to be higher or similar than varRef */
224 //float fs = varQuery[query] / varRef[query]; /* F-Snedecor, test for differences of variances */
226 bool results = false;
228 //confidence limit, t - Student, anova
229 outString += "\nWindow\tConfidenceLimit\tt-Student\tAnova\n";
231 for (int k = 0; k < windows.size(); k++) {
233 if (isChimericConfidence[k]) { temp += "*\t"; }
234 else { temp += "\t"; }
236 if (isChimericTStudent[k]) { temp += "*\t"; }
237 else { temp += "\t"; }
239 if (isChimericANOVA[k]) { temp += "*\t"; }
240 else { temp += "\t"; }
242 outString += toString(k+1) + "\t" + temp + "\n";
244 if (temp == "*\t*\t*\t") { results = true; }
249 int length = outString.length();
250 char* buf2 = new char[length];
251 memcpy(buf2, outString.c_str(), length);
253 MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
257 m->mothurOut(querySeq->getName() + " was found have at least one chimeric window."); m->mothurOutEndLine();
258 outAccString += querySeq->getName() + "\n";
260 MPI_Status statusAcc;
261 length = outAccString.length();
262 char* buf = new char[length];
263 memcpy(buf, outAccString.c_str(), length);
265 MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
270 for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; }
274 catch(exception& e) {
275 m->errorOut(e, "Ccode", "print");
279 //***************************************************************************************************************
280 int Ccode::printMapping(string& output) {
283 int length = output.length();
284 char* buf = new char[length];
285 memcpy(buf, output.c_str(), length);
287 MPI_File_write_shared(outMap, buf, length, MPI_CHAR, &status);
293 catch(exception& e) {
294 m->errorOut(e, "Ccode", "printMapping");
299 //***************************************************************************************************************
300 int Ccode::getChimeras(Sequence* query) {
311 sumSquaredRef.clear();
312 sumSquaredQuery.clear();
314 averageQuery.clear();
316 isChimericConfidence.clear();
317 isChimericTStudent.clear();
318 isChimericANOVA.clear();
321 windowSizes = window;
327 //find closest matches to query
328 closest = findClosest(query, numWanted);
330 if (m->control_pressed) { return 0; }
333 for (int i = 0; i < query->getAligned().length(); i++) { spotMap[i] = i; }
335 //mask sequences if the user wants to
337 decalc->setMask(seqMask);
339 decalc->runMask(query);
342 for (int i = 0; i < closest.size(); i++) { decalc->runMask(closest[i].seq); }
344 spotMap = decalc->getMaskMap();
348 vector<Sequence*> temp;
349 for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq); }
350 temp.push_back(query);
352 createFilter(temp, 0.5);
354 for (int i = 0; i < temp.size(); i++) {
355 if (m->control_pressed) { return 0; }
360 map<int, int> newMap;
363 for (int i = 0; i < filterString.length(); i++) {
364 if (filterString[i] == '1') {
366 newMap[spot] = spotMap[i];
373 //trim sequences - this follows ccodes remove_extra_gaps
374 trimSequences(query);
375 if (m->control_pressed) { return 0; }
377 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().
378 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
379 windows = findWindows();
380 if (m->control_pressed) { return 0; }
382 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
383 removeBadReferenceSeqs(closest);
384 if (m->control_pressed) { return 0; }
386 //find the averages for each querys references
387 getAverageRef(closest); //fills sumRef, averageRef, sumSquaredRef and refCombo.
388 getAverageQuery(closest, query); //fills sumQuery, averageQuery, sumSquaredQuery.
389 if (m->control_pressed) { return 0; }
391 //find the averages for each querys references
392 findVarianceRef(); //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
393 if (m->control_pressed) { return 0; }
395 //find the averages for the query
396 findVarianceQuery(); //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
397 if (m->control_pressed) { return 0; }
399 determineChimeras(); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA.
400 if (m->control_pressed) { return 0; }
404 catch(exception& e) {
405 m->errorOut(e, "Ccode", "getChimeras");
409 /***************************************************************************************************************/
410 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
411 void Ccode::trimSequences(Sequence* query) {
414 int frontPos = 0; //should contain first position in all seqs that is not a gap character
415 int rearPos = query->getAligned().length();
417 //********find first position in closest seqs that is a non gap character***********//
418 //find first position all query seqs that is a non gap character
419 for (int i = 0; i < closest.size(); i++) {
421 string aligned = closest[i].seq->getAligned();
424 //find first spot in this seq
425 for (int j = 0; j < aligned.length(); j++) {
426 if (isalpha(aligned[j])) {
432 //save this spot if it is the farthest
433 if (pos > frontPos) { frontPos = pos; }
436 //find first position all querySeq[query] that is a non gap character
437 string aligned = query->getAligned();
440 //find first spot in this seq
441 for (int j = 0; j < aligned.length(); j++) {
442 if (isalpha(aligned[j])) {
448 //save this spot if it is the farthest
449 if (pos > frontPos) { frontPos = pos; }
452 //********find last position in closest seqs that is a non gap character***********//
453 for (int i = 0; i < closest.size(); i++) {
455 string aligned = closest[i].seq->getAligned();
456 int pos = aligned.length();
458 //find first spot in this seq
459 for (int j = aligned.length()-1; j >= 0; j--) {
460 if (isalpha(aligned[j])) {
466 //save this spot if it is the farthest
467 if (pos < rearPos) { rearPos = pos; }
470 //find last position all querySeqs[query] that is a non gap character
471 aligned = query->getAligned();
472 pos = aligned.length();
474 //find first spot in this seq
475 for (int j = aligned.length()-1; j >= 0; j--) {
476 if (isalpha(aligned[j])) {
482 //save this spot if it is the farthest
483 if (pos < rearPos) { rearPos = pos; }
486 //check to make sure that is not whole seq
487 if ((rearPos - frontPos - 1) <= 0) { m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1); }
489 map<int, int> tempTrim;
490 tempTrim[frontPos] = rearPos;
492 //save trimmed locations
496 map<int, int> newMap;
499 for (int i = frontPos; i < rearPos; i++) {
501 newMap[spot] = spotMap[i];
506 catch(exception& e) {
507 m->errorOut(e, "Ccode", "trimSequences");
511 /***************************************************************************************************************/
512 vector<int> Ccode::findWindows() {
518 int length = it->second - it->first;
520 //default is wanted = 10% of total length
521 if (windowSizes > length) {
522 m->mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
523 windowSizes = length / 10;
524 }else if (windowSizes == 0) { windowSizes = length / 10; }
525 else if (windowSizes > (length * 0.20)) {
526 m->mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
527 }else if (windowSizes < (length * 0.05)) {
528 m->mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway."); m->mothurOutEndLine();
531 //save starting points of each window
532 for (int m = it->first; m < (it->second-windowSizes); m+=windowSizes) { win.push_back(m); }
535 if (win[win.size()-1] < (it->first+length)) {
536 win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
537 } //with this you would get 1,25,50,75,100
541 catch(exception& e) {
542 m->errorOut(e, "Ccode", "findWindows");
546 //***************************************************************************************************************
547 int Ccode::getDiff(string seqA, string seqB) {
552 for (int i = 0; i < seqA.length(); i++) {
553 //if you are both not gaps
554 //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
556 if (seqA[i] != seqB[i]) {
557 int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
559 /* the char in base_a and base_b have been checked and they are different */
560 if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
561 else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
562 else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
563 else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
564 else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
565 else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
566 else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
567 else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
568 else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
569 else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
570 else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
571 else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
572 else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
573 else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
574 else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
575 else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
576 else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
577 else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
578 else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
579 else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
580 else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
581 else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
582 else ok = 0; /* the bases are different and not equivalent */
584 //check if they are both blanks
585 if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
586 else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
588 if (ok == 0) { numDiff++; }
596 catch(exception& e) {
597 m->errorOut(e, "Ccode", "getDiff");
601 //***************************************************************************************************************
602 //tried to make this look most like ccode original implementation
603 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
606 vector< vector<int> > numDiffBases;
607 numDiffBases.resize(seqs.size());
609 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
612 int length = it->second - it->first;
614 //calc differences from each sequence to everyother seq in the set
615 for (int i = 0; i < seqs.size(); i++) {
617 string seqA = seqs[i].seq->getAligned().substr(it->first, length);
619 //so you don't calc i to j and j to i since they are the same
620 for (int j = 0; j < i; j++) {
622 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
625 int numDiff = getDiff(seqA, seqB);
627 numDiffBases[i][j] = numDiff;
628 numDiffBases[j][i] = numDiff;
632 //initailize remove to 0
633 vector<int> remove; remove.resize(seqs.size(), 0);
634 float top = ((20*length) / (float) 100);
635 float bottom = ((0.5*length) / (float) 100);
637 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
638 for (int i = 0; i < numDiffBases.size(); i++) {
639 for (int j = 0; j < i; j++) {
640 //are you more than 20% different
641 if (numDiffBases[i][j] > top) { remove[j] = 1; }
642 //are you less than 0.5% different
643 if (numDiffBases[i][j] < bottom) { remove[j] = 1; }
649 //count seqs that are not going to be removed
650 for (int i = 0; i < remove.size(); i++) {
651 if (remove[i] == 0) { numSeqsLeft++; }
654 //if you have enough then remove bad ones
655 if (numSeqsLeft >= 3) {
656 vector<SeqDist> goodSeqs;
658 for (int i = 0; i < remove.size(); i++) {
659 if (remove[i] == 0) {
660 goodSeqs.push_back(seqs[i]);
666 }else { //warn, but dont remove any
667 m->mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); m->mothurOutEndLine();
671 catch(exception& e) {
672 m->errorOut(e, "Ccode", "removeBadReferenceSeqs");
676 //***************************************************************************************************************
677 //makes copy of templateseq for filter
678 vector<SeqDist> Ccode::findClosest(Sequence* q, int numWanted) {
681 vector<SeqDist> topMatches;
683 Sequence query = *(q);
685 //calc distance to each sequence in template seqs
686 for (int i = 0; i < templateSeqs.size(); i++) {
688 Sequence ref = *(templateSeqs[i]);
691 distCalc->calcDist(query, ref);
692 float dist = distCalc->getDist();
696 temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
699 topMatches.push_back(temp);
702 sort(topMatches.begin(), topMatches.end(), compareSeqDist);
704 for (int i = numWanted; i < topMatches.size(); i++) { delete topMatches[i].seq; }
706 topMatches.resize(numWanted);
711 catch(exception& e) {
712 m->errorOut(e, "Ccode", "findClosestSides");
716 /**************************************************************************************************/
717 //find the distances from each reference sequence to every other reference sequence for each window for this query
718 void Ccode::getAverageRef(vector<SeqDist> ref) {
721 vector< vector< vector<int> > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
723 //initialize diffs vector
724 diffs.resize(ref.size());
725 for (int i = 0; i < diffs.size(); i++) {
726 diffs[i].resize(ref.size());
727 for (int j = 0; j < diffs[i].size(); j++) {
728 diffs[i][j].resize(windows.size(), 0);
734 //find the distances from each reference sequence to every other reference sequence for each window for this query
735 for (int i = 0; i < ref.size(); i++) {
737 string refI = ref[i].seq->getAligned();
739 //j<i, so you don't find distances from i to j and then j to i.
740 for (int j = 0; j < i; j++) {
742 string refJ = ref[j].seq->getAligned();
744 for (int k = 0; k < windows.size(); k++) {
746 string refIWindowk, refJWindowk;
748 if (k < windows.size()-1) {
750 refIWindowk = refI.substr(windows[k], windowSizes);
751 refJWindowk = refJ.substr(windows[k], windowSizes);
752 }else { //last window may be smaller than rest - see findwindows
754 refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
755 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
759 int diff = getDiff(refIWindowk, refJWindowk);
761 //save differences in [i][j][k] and [j][i][k] since they are the same
762 diffs[i][j][k] = diff;
763 diffs[j][i][k] = diff;
771 //initialize sumRef for this query
772 sumRef.resize(windows.size(), 0);
773 sumSquaredRef.resize(windows.size(), 0);
774 averageRef.resize(windows.size(), 0);
776 //find the sum of the differences for hte reference sequences
777 for (int i = 0; i < diffs.size(); i++) {
778 for (int j = 0; j < i; j++) {
780 //increment this querys reference sequences combos
783 for (int k = 0; k < diffs[i][j].size(); k++) {
784 sumRef[k] += diffs[i][j][k];
785 sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
792 //find the average of the differences for the references for each window
793 for (int i = 0; i < windows.size(); i++) {
794 averageRef[i] = sumRef[i] / (float) refCombo;
798 catch(exception& e) {
799 m->errorOut(e, "Ccode", "getAverageRef");
803 /**************************************************************************************************/
804 void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
807 vector< vector<int> > diffs; //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
809 //initialize diffs vector
810 diffs.resize(ref.size());
811 for (int j = 0; j < diffs.size(); j++) {
812 diffs[j].resize(windows.size(), 0);
817 string refQuery = query->getAligned();
819 //j<i, so you don't find distances from i to j and then j to i.
820 for (int j = 0; j < ref.size(); j++) {
822 string refJ = ref[j].seq->getAligned();
824 for (int k = 0; k < windows.size(); k++) {
826 string QueryWindowk, refJWindowk;
828 if (k < windows.size()-1) {
830 QueryWindowk = refQuery.substr(windows[k], windowSizes);
831 refJWindowk = refJ.substr(windows[k], windowSizes);
832 }else { //last window may be smaller than rest - see findwindows
834 QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
835 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
839 int diff = getDiff(QueryWindowk, refJWindowk);
848 //initialize sumRef for this query
849 sumQuery.resize(windows.size(), 0);
850 sumSquaredQuery.resize(windows.size(), 0);
851 averageQuery.resize(windows.size(), 0);
853 //find the sum of the differences
854 for (int j = 0; j < diffs.size(); j++) {
855 for (int k = 0; k < diffs[j].size(); k++) {
856 sumQuery[k] += diffs[j][k];
857 sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
862 //find the average of the differences for the references for each window
863 for (int i = 0; i < windows.size(); i++) {
864 averageQuery[i] = sumQuery[i] / (float) ref.size();
867 catch(exception& e) {
868 m->errorOut(e, "Ccode", "getAverageQuery");
872 /**************************************************************************************************/
873 void Ccode::findVarianceRef() {
876 varRef.resize(windows.size(), 0);
877 sdRef.resize(windows.size(), 0);
880 for (int i = 0; i < windows.size(); i++) {
881 varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
882 sdRef[i] = sqrt(varRef[i]);
884 //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
885 if (averageRef[i] < 0.001) { averageRef[i] = 0.001; }
886 if (sumRef[i] < 0.001) { sumRef[i] = 0.001; }
887 if (varRef[i] < 0.001) { varRef[i] = 0.001; }
888 if (sumSquaredRef[i] < 0.001) { sumSquaredRef[i] = 0.001; }
889 if (sdRef[i] < 0.001) { sdRef[i] = 0.001; }
893 catch(exception& e) {
894 m->errorOut(e, "Ccode", "findVarianceRef");
898 /**************************************************************************************************/
899 void Ccode::findVarianceQuery() {
901 varQuery.resize(windows.size(), 0);
902 sdQuery.resize(windows.size(), 0);
905 for (int i = 0; i < windows.size(); i++) {
906 varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
907 sdQuery[i] = sqrt(varQuery[i]);
909 //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
910 if (averageQuery[i] < 0.001) { averageQuery[i] = 0.001; }
911 if (sumQuery[i] < 0.001) { sumQuery[i] = 0.001; }
912 if (varQuery[i] < 0.001) { varQuery[i] = 0.001; }
913 if (sumSquaredQuery[i] < 0.001) { sumSquaredQuery[i] = 0.001; }
914 if (sdQuery[i] < 0.001) { sdQuery[i] = 0.001; }
918 catch(exception& e) {
919 m->errorOut(e, "Ccode", "findVarianceQuery");
923 /**************************************************************************************************/
924 void Ccode::determineChimeras() {
927 isChimericConfidence.resize(windows.size(), false);
928 isChimericTStudent.resize(windows.size(), false);
929 isChimericANOVA.resize(windows.size(), false);
930 anova.resize(windows.size());
934 for (int i = 0; i < windows.size(); i++) {
936 //get confidence limits
937 float t = getT(closest.size()-1); //how many seqs you are comparing to this querySeq
938 float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i];
939 float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i];
941 if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) { /* range does not include 1 */
942 isChimericConfidence[i] = true; /* significantly higher at P<0.05 */
947 int degreeOfFreedom = refCombo + closest.size() - 2;
948 float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size())); /* denominator, without sqrt(), for ts calculations */
950 float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT))); /* value of ts for t-student test */
951 t = getT(degreeOfFreedom);
953 if ((ts >= t) && (averageQuery[i] > averageRef[i])) {
954 isChimericTStudent[i] = true; /* significantly higher at P<0.05 */
958 float value1 = sumQuery[i] + sumRef[i];
959 float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
960 float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
961 float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
962 float value5 = value2 - value4;
963 float value6 = value3 - value4;
964 float value7 = value5 - value6;
965 float value8 = value7 / ((float) degreeOfFreedom);
966 float anovaValue = value6 / value8;
968 float f = getF(degreeOfFreedom);
970 if ((anovaValue >= f) && (averageQuery[i] > averageRef[i])) {
971 isChimericANOVA[i] = true; /* significant P<0.05 */
974 if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
976 anova[i] = anovaValue;
980 catch(exception& e) {
981 m->errorOut(e, "Ccode", "determineChimeras");
985 /**************************************************************************************************/
986 float Ccode::getT(int numseq) {
991 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
992 if (numseq > 120) tvalue = 1.645;
993 else if (numseq > 60) tvalue = 1.658;
994 else if (numseq > 40) tvalue = 1.671;
995 else if (numseq > 30) tvalue = 1.684;
996 else if (numseq > 29) tvalue = 1.697;
997 else if (numseq > 28) tvalue = 1.699;
998 else if (numseq > 27) tvalue = 1.701;
999 else if (numseq > 26) tvalue = 1.703;
1000 else if (numseq > 25) tvalue = 1.706;
1001 else if (numseq > 24) tvalue = 1.708;
1002 else if (numseq > 23) tvalue = 1.711;
1003 else if (numseq > 22) tvalue = 1.714;
1004 else if (numseq > 21) tvalue = 1.717;
1005 else if (numseq > 20) tvalue = 1.721;
1006 else if (numseq > 19) tvalue = 1.725;
1007 else if (numseq > 18) tvalue = 1.729;
1008 else if (numseq > 17) tvalue = 1.734;
1009 else if (numseq > 16) tvalue = 1.740;
1010 else if (numseq > 15) tvalue = 1.746;
1011 else if (numseq > 14) tvalue = 1.753;
1012 else if (numseq > 13) tvalue = 1.761;
1013 else if (numseq > 12) tvalue = 1.771;
1014 else if (numseq > 11) tvalue = 1.782;
1015 else if (numseq > 10) tvalue = 1.796;
1016 else if (numseq > 9) tvalue = 1.812;
1017 else if (numseq > 8) tvalue = 1.833;
1018 else if (numseq > 7) tvalue = 1.860;
1019 else if (numseq > 6) tvalue = 1.895;
1020 else if (numseq > 5) tvalue = 1.943;
1021 else if (numseq > 4) tvalue = 2.015;
1022 else if (numseq > 3) tvalue = 2.132;
1023 else if (numseq > 2) tvalue = 2.353;
1024 else if (numseq > 1) tvalue = 2.920;
1025 else if (numseq <= 1) {
1026 m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
1031 catch(exception& e) {
1032 m->errorOut(e, "Ccode", "getT");
1036 /**************************************************************************************************/
1037 float Ccode::getF(int numseq) {
1042 /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
1043 if (numseq > 120) fvalue = 3.84;
1044 else if (numseq > 60) fvalue = 3.92;
1045 else if (numseq > 40) fvalue = 4.00;
1046 else if (numseq > 30) fvalue = 4.08;
1047 else if (numseq > 29) fvalue = 4.17;
1048 else if (numseq > 28) fvalue = 4.18;
1049 else if (numseq > 27) fvalue = 4.20;
1050 else if (numseq > 26) fvalue = 4.21;
1051 else if (numseq > 25) fvalue = 4.23;
1052 else if (numseq > 24) fvalue = 4.24;
1053 else if (numseq > 23) fvalue = 4.26;
1054 else if (numseq > 22) fvalue = 4.28;
1055 else if (numseq > 21) fvalue = 4.30;
1056 else if (numseq > 20) fvalue = 4.32;
1057 else if (numseq > 19) fvalue = 4.35;
1058 else if (numseq > 18) fvalue = 4.38;
1059 else if (numseq > 17) fvalue = 4.41;
1060 else if (numseq > 16) fvalue = 4.45;
1061 else if (numseq > 15) fvalue = 4.49;
1062 else if (numseq > 14) fvalue = 4.54;
1063 else if (numseq > 13) fvalue = 4.60;
1064 else if (numseq > 12) fvalue = 4.67;
1065 else if (numseq > 11) fvalue = 4.75;
1066 else if (numseq > 10) fvalue = 4.84;
1067 else if (numseq > 9) fvalue = 4.96;
1068 else if (numseq > 8) fvalue = 5.12;
1069 else if (numseq > 7) fvalue = 5.32;
1070 else if (numseq > 6) fvalue = 5.59;
1071 else if (numseq > 5) fvalue = 5.99;
1072 else if (numseq > 4) fvalue = 6.61;
1073 else if (numseq > 3) fvalue = 7.71;
1074 else if (numseq > 2) fvalue = 10.1;
1075 else if (numseq > 1) fvalue = 18.5;
1076 else if (numseq > 0) fvalue = 161;
1077 else if (numseq <= 0) {
1078 m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); m->mothurOutEndLine();
1083 catch(exception& e) {
1084 m->errorOut(e, "Ccode", "getF");
1088 //***************************************************************************************************************