From 40873e9a7e12d248ebb86e75ca96238c7e7b9701 Mon Sep 17 00:00:00 2001 From: westcott Date: Fri, 4 Sep 2009 15:06:54 +0000 Subject: [PATCH] ccode working - still need to paralellize --- ccode.cpp | 777 ++++++++++++++++++++++++++++++++++++++++++++++------- ccode.h | 39 ++- decalc.cpp | 4 + decalc.h | 3 + 4 files changed, 718 insertions(+), 105 deletions(-) diff --git a/ccode.cpp b/ccode.cpp index 4ec3a12..0ba2379 100644 --- a/ccode.cpp +++ b/ccode.cpp @@ -33,8 +33,93 @@ void Ccode::print(ostream& out) { mothurOutEndLine(); + string mapInfo = getRootName(fastafile) + "mapinfo"; + ofstream out2; + openOutputFile(mapInfo, out2); + + out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl; + + for (int j = 0; j < querySeqs.size(); j++) { + out2 << querySeqs[j]->getName() << endl; + for (it = spotMap[j].begin(); it!= spotMap[j].end(); it++) { + out2 << it->first << '\t' << it->second << endl; + } + } + out2.close(); + + out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); + + out << "For full window mapping info refer to " << mapInfo << endl << endl; + for (int i = 0; i < querySeqs.size(); i++) { + + out << querySeqs[i]->getName() << endl << endl << "Reference sequences used and distance to query:" << endl; + + for (int j = 0; j < closest[i].size(); j++) { + out << setprecision(3) << closest[i][j].seq->getName() << '\t' << closest[i][j].dist << endl; + } + out << endl << endl; + + //for each window + //window mapping info. + out << "Mapping information: " << endl; + //you mask and did not filter + if ((seqMask != "") && (!filter)) { out << "mask and trim."; } + + //you filtered and did not mask + if ((seqMask == "") && (filter)) { out << "filter and trim."; } + + //you masked and filtered + if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; } + + out << "Window\tStartPos\tEndPos" << endl; + it = trim[i].begin(); + + for (int k = 0; k < windows[i].size()-1; k++) { + out << k+1 << '\t' << spotMap[i][windows[i][k]-it->first] << '\t' << spotMap[i][windows[i][k]-it->first+windowSizes[i]] << endl; + } + + out << windows[i].size() << '\t' << spotMap[i][windows[i][windows[i].size()-1]-it->first] << '\t' << spotMap[i][it->second-it->first-1] << endl; + out << endl; + + out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl; + for (int k = 0; k < windows[i].size(); k++) { + float ds = averageQuery[i][k] / averageRef[i][k]; + out << k+1 << '\t' << averageQuery[i][k] << '\t' << sdQuery[i][k] << '\t' << averageRef[i][k] << '\t'<< sdRef[i][k] << '\t' << ds << '\t' << anova[i][k] << endl; + } + out << endl; + //varRef + //varQuery + /* F test for differences among variances. + * varQuery is expected to be higher or similar than varRef */ + //float fs = varQuery[query][i] / varRef[query][i]; /* F-Snedecor, test for differences of variances */ + + bool results = false; + + //confidence limit, t - Student, anova + out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl; + + for (int k = 0; k < windows[i].size(); k++) { + string temp = ""; + if (isChimericConfidence[i][k]) { temp += "*\t"; } + else { temp += "\t"; } + + if (isChimericTStudent[i][k]) { temp += "*\t"; } + else { temp += "\t"; } + + if (isChimericANOVA[i][k]) { temp += "*\t"; } + else { temp += "\t"; } + + out << k+1 << '\t' << temp << endl; + + if (temp != "\t\t\t") { results = true; } + } + out << endl; + + if (results) { + mothurOut(querySeqs[i]->getName() + " was found have at least one chimeric window."); mothurOutEndLine(); + } } } catch(exception& e) { @@ -57,6 +142,26 @@ void Ccode::getChimeras() { closest.resize(numSeqs); + refCombo.resize(numSeqs, 0); + sumRef.resize(numSeqs); + varRef.resize(numSeqs); + varQuery.resize(numSeqs); + sdRef.resize(numSeqs); + sdQuery.resize(numSeqs); + sumQuery.resize(numSeqs); + sumSquaredRef.resize(numSeqs); + sumSquaredQuery.resize(numSeqs); + averageRef.resize(numSeqs); + averageQuery.resize(numSeqs); + anova.resize(numSeqs); + isChimericConfidence.resize(numSeqs); + isChimericTStudent.resize(numSeqs); + isChimericANOVA.resize(numSeqs); + trim.resize(numSeqs); + spotMap.resize(numSeqs); + windowSizes.resize(numSeqs, window); + windows.resize(numSeqs); + //break up file if needed int linesPerProcess = numSeqs / processors ; @@ -99,14 +204,25 @@ void Ccode::getChimeras() { for (int i = 0; i < closest.size(); i++) { - cout << querySeqs[i]->getName() << ": "; + //cout << querySeqs[i]->getName() << ": "; + string file = querySeqs[i]->getName() + ".input"; + ofstream o; + openOutputFile(file, o); + + querySeqs[i]->printSequence(o); for (int j = 0; j < closest[i].size(); j++) { - - cout << closest[i][j]->getName() << '\t'; + //cout << closest[i][j].seq->getName() << '\t'; + closest[i][j].seq->printSequence(o); } - cout << endl; + //cout << endl; + o.close(); } - + for (int j = 0; j < numSeqs; j++) { + for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) { + spotMap[j][i] = i; + } + } + //mask sequences if the user wants to if (seqMask != "") { //mask querys @@ -118,6 +234,10 @@ for (int i = 0; i < closest.size(); i++) { for (int i = 0; i < templateSeqs.size(); i++) { decalc->runMask(templateSeqs[i]); } + + for (int i = 0; i < numSeqs; i++) { + spotMap[i] = decalc->getMaskMap(); + } } if (filter) { @@ -128,22 +248,66 @@ for (int i = 0; i < closest.size(); i++) { runFilter(querySeqs); runFilter(templateSeqs); + + //update spotMap + map newMap; + int spot = 0; + int j = 0; + + for (int i = 0; i < filterString.length(); i++) { + if (filterString[i] == 1) { + //add to newMap + newMap[spot] = spotMap[j][i]; + spot++; + } + } + + for (int i = 0; i < numSeqs; i++) { + spotMap[i] = newMap; + } } //trim sequences - this follows ccodes remove_extra_gaps - //just need to pass it query and template since closest points to template - trimSequences(); + for (int i = 0; i < querySeqs.size(); i++) { + trimSequences(i); + } //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length(). //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations - windows = findWindows(); - - //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later + for (int i = 0; i < querySeqs.size(); i++) { + windows[i] = findWindows(i); + } + + //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later - should be paralellized for (int i = 0; i < closest.size(); i++) { removeBadReferenceSeqs(closest[i], i); } - - + + + //find the averages for each querys references - should be paralellized + for (int i = 0; i < numSeqs; i++) { + getAverageRef(closest[i], i); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i]. + } + + //find the averages for the query - should be paralellized + for (int i = 0; i < numSeqs; i++) { + getAverageQuery(closest[i], i); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i]. + } + + //find the averages for each querys references - should be paralellized + for (int i = 0; i < numSeqs; i++) { + findVarianceRef(i); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0. + } + + //find the averages for the query - should be paralellized + for (int i = 0; i < numSeqs; i++) { + findVarianceQuery(i); //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0. + } + + for (int i = 0; i < numSeqs; i++) { + determineChimeras(i); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. - should be paralellized + } + //free memory for (int i = 0; i < lines.size(); i++) { delete lines[i]; } for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } @@ -156,17 +320,17 @@ for (int i = 0; i < closest.size(); i++) { } /***************************************************************************************************************/ //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences." -void Ccode::trimSequences() { +void Ccode::trimSequences(int query) { try { int frontPos = 0; //should contain first position in all seqs that is not a gap character - int rearPos = querySeqs[0]->getAligned().length(); + int rearPos = querySeqs[query]->getAligned().length(); - //********find first position in all seqs that is a non gap character***********// + //********find first position in closest seqs that is a non gap character***********// //find first position all query seqs that is a non gap character - for (int i = 0; i < querySeqs.size(); i++) { + for (int i = 0; i < closest[query].size(); i++) { - string aligned = querySeqs[i]->getAligned(); + string aligned = closest[query][i].seq->getAligned(); int pos = 0; //find first spot in this seq @@ -181,30 +345,26 @@ void Ccode::trimSequences() { if (pos > frontPos) { frontPos = pos; } } - //find first position all template seqs that is a non gap character - for (int i = 0; i < templateSeqs.size(); i++) { - - string aligned = templateSeqs[i]->getAligned(); - int pos = 0; + //find first position all querySeq[query] that is a non gap character + string aligned = querySeqs[query]->getAligned(); + int pos = 0; - //find first spot in this seq - for (int j = 0; j < aligned.length(); j++) { - if (isalpha(aligned[j])) { - pos = j; - break; - } + //find first spot in this seq + for (int j = 0; j < aligned.length(); j++) { + if (isalpha(aligned[j])) { + pos = j; + break; } - - //save this spot if it is the farthest - if (pos > frontPos) { frontPos = pos; } } - - //********find last position in all seqs that is a non gap character***********// - //find last position all query seqs that is a non gap character - for (int i = 0; i < querySeqs.size(); i++) { + //save this spot if it is the farthest + if (pos > frontPos) { frontPos = pos; } + + + //********find last position in closest seqs that is a non gap character***********// + for (int i = 0; i < closest[query].size(); i++) { - string aligned = querySeqs[i]->getAligned(); + string aligned = closest[query][i].seq->getAligned(); int pos = aligned.length(); //find first spot in this seq @@ -219,49 +379,49 @@ void Ccode::trimSequences() { if (pos < rearPos) { rearPos = pos; } } - //find last position all template seqs that is a non gap character - for (int i = 0; i < templateSeqs.size(); i++) { - - string aligned = templateSeqs[i]->getAligned(); - int pos = aligned.length(); - - //find first spot in this seq - for (int j = aligned.length()-1; j >= 0; j--) { - if (isalpha(aligned[j])) { - pos = j; - break; - } + //find last position all querySeqs[query] that is a non gap character + aligned = querySeqs[query]->getAligned(); + pos = aligned.length(); + + //find first spot in this seq + for (int j = aligned.length()-1; j >= 0; j--) { + if (isalpha(aligned[j])) { + pos = j; + break; } - - //save this spot if it is the farthest - if (pos < rearPos) { rearPos = pos; } } + + //save this spot if it is the farthest + if (pos < rearPos) { rearPos = pos; } //check to make sure that is not whole seq if ((rearPos - frontPos - 1) <= 0) { mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1); } - //***********trim all seqs to that position***************// - for (int i = 0; i < querySeqs.size(); i++) { - - string aligned = querySeqs[i]->getAligned(); - - //between the two points - aligned = aligned.substr(frontPos, (rearPos-frontPos-1)); - - querySeqs[i]->setAligned(aligned); - } + map tempTrim; + tempTrim[frontPos] = rearPos; - for (int i = 0; i < templateSeqs.size(); i++) { - - string aligned = templateSeqs[i]->getAligned(); - - //between the two points - aligned = aligned.substr(frontPos, (rearPos-frontPos-1)); - - templateSeqs[i]->setAligned(aligned); + //save trimmed locations + trim[query] = tempTrim; + + //update spotMask + map newMap; + int spot = 0; + + for (int i = frontPos; i < rearPos; i++) { + //add to newMap +//cout << query << '\t' << i << '\t' << spotMap[query][i] << endl; + newMap[spot] = spotMap[query][i]; + spot++; } - + + //for (it = newMap.begin(); it!=newMap.end(); it++) { + //cout << query << '\t' << it->first << '\t' << it->second << endl; + //} + + spotMap[query] = newMap; + + } catch(exception& e) { errorOut(e, "Ccode", "trimSequences"); @@ -270,25 +430,33 @@ void Ccode::trimSequences() { } /***************************************************************************************************************/ -vector Ccode::findWindows() { +vector Ccode::findWindows(int query) { try { vector win; - int length = querySeqs[0]->getAligned().length(); + it = trim[query].begin(); + + int length = it->second - it->first; //default is wanted = 10% of total length - if (window > length) { + if (windowSizes[query] > length) { mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length."); - window = length / 10; - }else if (window == 0) { window = length / 10; } - else if (window > (length / 20)) { + windowSizes[query] = length / 10; + }else if (windowSizes[query] == 0) { windowSizes[query] = length / 10; } + else if (windowSizes[query] > (length / 20)) { mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine(); - }else if (window < (length / 5)) { + }else if (windowSizes[query] < (length / 5)) { mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine(); } //save starting points of each window - for (int m = 0; m < (length-window); m+=window) { win.push_back(m); } + for (int m = it->first; m < (it->second-windowSizes[query]); m+=windowSizes[query]) { win.push_back(m); } + + //save last window + if (win[win.size()-1] < (it->first+length)) { + win.push_back(win[win.size()-1]+windowSizes[query]); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75 + } //with this you would get 1,25,50,75,100 + return win; @@ -308,7 +476,40 @@ int Ccode::getDiff(string seqA, string seqB) { //if you are both not gaps //if (isalpha(seqA[i]) && isalpha(seqA[i])) { //are you different - if (seqA[i] != seqB[i]) { numDiff++; } + if (seqA[i] != seqB[i]) { + int ok; /* ok=1 means equivalent base. Checks for degenerate bases */ + + /* the char in base_a and base_b have been checked and they are different */ + if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1; + else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1; + else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1; + else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1; + else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1; + else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1; + else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1; + else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1; + else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1; + else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1; + else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1; + else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1; + else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1; + else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1; + else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1; + else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1; + else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1; + else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1; + else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1; + else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1; + else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1; + else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1; + else ok = 0; /* the bases are different and not equivalent */ + + //check if they are both blanks + if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1; + else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1; + + if (ok == 0) { numDiff++; } + } //} } @@ -322,7 +523,7 @@ int Ccode::getDiff(string seqA, string seqB) { } //*************************************************************************************************************** //tried to make this look most like ccode original implementation -void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { +void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { try { vector< vector > numDiffBases; @@ -330,17 +531,18 @@ void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { //initialize to 0 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); } - int length = seqs[0]->getAligned().length(); + it = trim[query].begin(); + int length = it->second - it->first; //calc differences from each sequence to everyother seq in the set for (int i = 0; i < seqs.size(); i++) { - string seqA = seqs[i]->getAligned(); + string seqA = seqs[i].seq->getAligned().substr(it->first, length); //so you don't calc i to j and j to i since they are the same for (int j = 0; j < i; j++) { - string seqB = seqs[j]->getAligned(); + string seqB = seqs[j].seq->getAligned().substr(it->first, length); //compare strings int numDiff = getDiff(seqA, seqB); @@ -352,15 +554,16 @@ void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { //initailize remove to 0 vector remove; remove.resize(seqs.size(), 0); + float top = ((20*length) / (float) 100); + float bottom = ((0.5*length) / (float) 100); //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set for (int i = 0; i < numDiffBases.size(); i++) { - for (int j = 0; j < numDiffBases[i].size(); j++) { - + for (int j = 0; j < i; j++) { //are you more than 20% different - if (numDiffBases[i][j] > ((20*length) / 100)) { remove[j] = 1; } + if (numDiffBases[i][j] > top) { remove[j] = 1; } //are you less than 0.5% different - if (numDiffBases[i][j] < ((0.5*length) / 100)) { remove[j] = 1; } + if (numDiffBases[i][j] < bottom) { remove[j] = 1; } } } @@ -373,7 +576,7 @@ void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { //if you have enough then remove bad ones if (numSeqsLeft >= 3) { - vector goodSeqs; + vector goodSeqs; //remove bad seqs for (int i = 0; i < remove.size(); i++) { if (remove[i] == 0) { @@ -395,10 +598,10 @@ void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { } } //*************************************************************************************************************** -vector< vector > Ccode::findClosest(int start, int end, int numWanted) { +vector< vector > Ccode::findClosest(int start, int end, int numWanted) { try{ - vector< vector > topMatches; topMatches.resize(querySeqs.size()); + vector< vector > topMatches; topMatches.resize(querySeqs.size()); float smallestOverall, smallestLeft, smallestRight; smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000; @@ -431,7 +634,7 @@ vector< vector > Ccode::findClosest(int start, int end, int numWanted //save the number of top matches wanted for (int h = 0; h < numWanted; h++) { - topMatches[j].push_back(distances[h].seq); + topMatches[j].push_back(distances[h]); } } @@ -444,8 +647,86 @@ vector< vector > Ccode::findClosest(int start, int end, int numWanted } } /**************************************************************************************************/ -vector Ccode::getAverageRef(vector ref) { +//find the distances from each reference sequence to every other reference sequence for each window for this query +void Ccode::getAverageRef(vector ref, int query) { try { + + vector< vector< vector > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2. + + //initialize diffs vector + diffs.resize(ref.size()); + for (int i = 0; i < diffs.size(); i++) { + diffs[i].resize(ref.size()); + for (int j = 0; j < diffs[i].size(); j++) { + diffs[i][j].resize(windows[query].size(), 0); + } + } + + it = trim[query].begin(); + + //find the distances from each reference sequence to every other reference sequence for each window for this query + for (int i = 0; i < ref.size(); i++) { + + string refI = ref[i].seq->getAligned(); + + //jgetAligned(); + + for (int k = 0; k < windows[query].size(); k++) { + + string refIWindowk, refJWindowk; + + if (k < windows[query].size()-1) { + //get window strings + refIWindowk = refI.substr(windows[query][k], windowSizes[query]); + refJWindowk = refJ.substr(windows[query][k], windowSizes[query]); + }else { //last window may be smaller than rest - see findwindows + //get window strings + refIWindowk = refI.substr(windows[query][k], (it->second-windows[query][k])); + refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k])); + } + + //find differences + int diff = getDiff(refIWindowk, refJWindowk); +//cout << i << '\t' << j << '\t' << k << '\t' << diff << endl; + //save differences in [i][j][k] and [j][i][k] since they are the same + diffs[i][j][k] = diff; + diffs[j][i][k] = diff; + + }//k + + }//j + + }//i + + //initialize sumRef for this query + sumRef[query].resize(windows[query].size(), 0); + sumSquaredRef[query].resize(windows[query].size(), 0); + averageRef[query].resize(windows[query].size(), 0); + + //find the sum of the differences for hte reference sequences + for (int i = 0; i < diffs.size(); i++) { + for (int j = 0; j < i; j++) { + + //increment this querys reference sequences combos + refCombo[query]++; + + for (int k = 0; k < diffs[i][j].size(); k++) { + sumRef[query][k] += diffs[i][j][k]; + sumSquaredRef[query][k] += (diffs[i][j][k]*diffs[i][j][k]); + }//k + + }//j + }//i + + + //find the average of the differences for the references for each window + for (int i = 0; i < windows[query].size(); i++) { + averageRef[query][i] = sumRef[query][i] / (float) refCombo[query]; + } + } catch(exception& e) { errorOut(e, "Ccode", "getAverageRef"); @@ -453,9 +734,69 @@ vector Ccode::getAverageRef(vector ref) { } } /**************************************************************************************************/ -vector Ccode::getAverageQuery (vector ref, int query) { +void Ccode::getAverageQuery (vector ref, int query) { try { + vector< vector > diffs; //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2. + + //initialize diffs vector + diffs.resize(ref.size()); + for (int j = 0; j < diffs.size(); j++) { + diffs[j].resize(windows[query].size(), 0); + } + + it = trim[query].begin(); + + string refQuery = querySeqs[query]->getAligned(); + + //jgetAligned(); + + for (int k = 0; k < windows[query].size(); k++) { + + string QueryWindowk, refJWindowk; + + if (k < windows[query].size()-1) { + //get window strings + QueryWindowk = refQuery.substr(windows[query][k], windowSizes[query]); + refJWindowk = refJ.substr(windows[query][k], windowSizes[query]); + }else { //last window may be smaller than rest - see findwindows + //get window strings + QueryWindowk = refQuery.substr(windows[query][k], (it->second-windows[query][k])); + refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k])); + } + + //find differences + int diff = getDiff(QueryWindowk, refJWindowk); +//cout << j << '\t' << k << '\t' << diff << endl; + //save differences + diffs[j][k] = diff; + + }//k + }//j + + + //initialize sumRef for this query + sumQuery[query].resize(windows[query].size(), 0); + sumSquaredQuery[query].resize(windows[query].size(), 0); + averageQuery[query].resize(windows[query].size(), 0); + + //find the sum of the differences + for (int j = 0; j < diffs.size(); j++) { + for (int k = 0; k < diffs[j].size(); k++) { + sumQuery[query][k] += diffs[j][k]; + sumSquaredQuery[query][k] += (diffs[j][k]*diffs[j][k]); + }//k + }//j + + + //find the average of the differences for the references for each window + for (int i = 0; i < windows[query].size(); i++) { + averageQuery[query][i] = sumQuery[query][i] / (float) ref.size(); + } + } catch(exception& e) { @@ -463,6 +804,221 @@ vector Ccode::getAverageQuery (vector ref, int query) { exit(1); } } +/**************************************************************************************************/ +void Ccode::findVarianceRef (int query) { + try { + + varRef[query].resize(windows[query].size(), 0); + sdRef[query].resize(windows[query].size(), 0); + + //for each window + for (int i = 0; i < windows[query].size(); i++) { + varRef[query][i] = (sumSquaredRef[query][i] - ((sumRef[query][i]*sumRef[query][i])/(float)refCombo[query])) / (float)(refCombo[query]-1); + sdRef[query][i] = sqrt(varRef[query][i]); + + //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation + if (averageRef[query][i] < 0.001) { averageRef[query][i] = 0.001; } + if (sumRef[query][i] < 0.001) { sumRef[query][i] = 0.001; } + if (varRef[query][i] < 0.001) { varRef[query][i] = 0.001; } + if (sumSquaredRef[query][i] < 0.001) { sumSquaredRef[query][i] = 0.001; } + if (sdRef[query][i] < 0.001) { sdRef[query][i] = 0.001; } + + } + } + catch(exception& e) { + errorOut(e, "Ccode", "findVarianceRef"); + exit(1); + } +} +/**************************************************************************************************/ +void Ccode::findVarianceQuery (int query) { + try { + varQuery[query].resize(windows[query].size(), 0); + sdQuery[query].resize(windows[query].size(), 0); + + //for each window + for (int i = 0; i < windows[query].size(); i++) { + varQuery[query][i] = (sumSquaredQuery[query][i] - ((sumQuery[query][i]*sumQuery[query][i])/(float) closest[query].size())) / (float) (closest[query].size()-1); + sdQuery[query][i] = sqrt(varQuery[query][i]); + + //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation + if (averageQuery[query][i] < 0.001) { averageQuery[query][i] = 0.001; } + if (sumQuery[query][i] < 0.001) { sumQuery[query][i] = 0.001; } + if (varQuery[query][i] < 0.001) { varQuery[query][i] = 0.001; } + if (sumSquaredQuery[query][i] < 0.001) { sumSquaredQuery[query][i] = 0.001; } + if (sdQuery[query][i] < 0.001) { sdQuery[query][i] = 0.001; } + } + + } + catch(exception& e) { + errorOut(e, "Ccode", "findVarianceQuery"); + exit(1); + } +} +/**************************************************************************************************/ +void Ccode::determineChimeras (int query) { + try { + + isChimericConfidence[query].resize(windows[query].size(), false); + isChimericTStudent[query].resize(windows[query].size(), false); + isChimericANOVA[query].resize(windows[query].size(), false); + anova[query].resize(windows[query].size()); + + + //for each window + for (int i = 0; i < windows[query].size(); i++) { + + //get confidence limits + float t = getT(closest[query].size()-1); //how many seqs you are comparing to this querySeq + float dsUpper = (averageQuery[query][i] + (t * sdQuery[query][i])) / averageRef[query][i]; + float dsLower = (averageQuery[query][i] - (t * sdQuery[query][i])) / averageRef[query][i]; +//cout << t << '\t' << "ds upper = " << dsUpper << " dsLower = " << dsLower << endl; + if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[query][i] > averageRef[query][i])) { /* range does not include 1 */ + isChimericConfidence[query][i] = true; /* significantly higher at P<0.05 */ +//cout << i << " made it here" << endl; + } + + //student t test + int degreeOfFreedom = refCombo[query] + closest[query].size() - 2; + float denomForT = (((refCombo[query]-1) * varQuery[query][i] + (closest[query].size() - 1) * varRef[query][i]) / (float) degreeOfFreedom) * ((refCombo[query] + closest[query].size()) / (float) (refCombo[query] * closest[query].size())); /* denominator, without sqrt(), for ts calculations */ + + float ts = fabs((averageQuery[query][i] - averageRef[query][i]) / (sqrt(denomForT))); /* value of ts for t-student test */ + t = getT(degreeOfFreedom); +//cout << i << '\t' << t << '\t' << ts << endl; + if ((ts >= t) && (averageQuery[query][i] > averageRef[query][i])) { + isChimericTStudent[query][i] = true; /* significantly higher at P<0.05 */ + } + + //anova test + float value1 = sumQuery[query][i] + sumRef[query][i]; + float value2 = sumSquaredQuery[query][i] + sumSquaredRef[query][i]; + float value3 = ((sumQuery[query][i]*sumQuery[query][i]) / (float) (closest[query].size())) + ((sumRef[query][i] * sumRef[query][i]) / (float) refCombo[query]); + float value4 = (value1 * value1) / ( (float) (closest[query].size() + refCombo[query]) ); + float value5 = value2 - value4; + float value6 = value3 - value4; + float value7 = value5 - value6; + float value8 = value7 / ((float) degreeOfFreedom); + float anovaValue = value6 / value8; + + float f = getF(degreeOfFreedom); + + if ((anovaValue >= f) && (averageQuery[query][i] > averageRef[query][i])) { + isChimericANOVA[query][i] = true; /* significant P<0.05 */ + } + + anova[query][i] = anovaValue; + } + + } + catch(exception& e) { + errorOut(e, "Ccode", "determineChimeras"); + exit(1); + } +} +/**************************************************************************************************/ +float Ccode::getT(int numseq) { + try { + + float tvalue = 0; + + /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */ + if (numseq > 120) tvalue = 1.645; + else if (numseq > 60) tvalue = 1.658; + else if (numseq > 40) tvalue = 1.671; + else if (numseq > 30) tvalue = 1.684; + else if (numseq > 29) tvalue = 1.697; + else if (numseq > 28) tvalue = 1.699; + else if (numseq > 27) tvalue = 1.701; + else if (numseq > 26) tvalue = 1.703; + else if (numseq > 25) tvalue = 1.706; + else if (numseq > 24) tvalue = 1.708; + else if (numseq > 23) tvalue = 1.711; + else if (numseq > 22) tvalue = 1.714; + else if (numseq > 21) tvalue = 1.717; + else if (numseq > 20) tvalue = 1.721; + else if (numseq > 19) tvalue = 1.725; + else if (numseq > 18) tvalue = 1.729; + else if (numseq > 17) tvalue = 1.734; + else if (numseq > 16) tvalue = 1.740; + else if (numseq > 15) tvalue = 1.746; + else if (numseq > 14) tvalue = 1.753; + else if (numseq > 13) tvalue = 1.761; + else if (numseq > 12) tvalue = 1.771; + else if (numseq > 11) tvalue = 1.782; + else if (numseq > 10) tvalue = 1.796; + else if (numseq > 9) tvalue = 1.812; + else if (numseq > 8) tvalue = 1.833; + else if (numseq > 7) tvalue = 1.860; + else if (numseq > 6) tvalue = 1.895; + else if (numseq > 5) tvalue = 1.943; + else if (numseq > 4) tvalue = 2.015; + else if (numseq > 3) tvalue = 2.132; + else if (numseq > 2) tvalue = 2.353; + else if (numseq > 1) tvalue = 2.920; + else if (numseq <= 1) { + mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine(); + } + + return tvalue; + } + catch(exception& e) { + errorOut(e, "Ccode", "getT"); + exit(1); + } +} +/**************************************************************************************************/ +float Ccode::getF(int numseq) { + try { + + float fvalue = 0; + + /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */ + if (numseq > 120) fvalue = 3.84; + else if (numseq > 60) fvalue = 3.92; + else if (numseq > 40) fvalue = 4.00; + else if (numseq > 30) fvalue = 4.08; + else if (numseq > 29) fvalue = 4.17; + else if (numseq > 28) fvalue = 4.18; + else if (numseq > 27) fvalue = 4.20; + else if (numseq > 26) fvalue = 4.21; + else if (numseq > 25) fvalue = 4.23; + else if (numseq > 24) fvalue = 4.24; + else if (numseq > 23) fvalue = 4.26; + else if (numseq > 22) fvalue = 4.28; + else if (numseq > 21) fvalue = 4.30; + else if (numseq > 20) fvalue = 4.32; + else if (numseq > 19) fvalue = 4.35; + else if (numseq > 18) fvalue = 4.38; + else if (numseq > 17) fvalue = 4.41; + else if (numseq > 16) fvalue = 4.45; + else if (numseq > 15) fvalue = 4.49; + else if (numseq > 14) fvalue = 4.54; + else if (numseq > 13) fvalue = 4.60; + else if (numseq > 12) fvalue = 4.67; + else if (numseq > 11) fvalue = 4.75; + else if (numseq > 10) fvalue = 4.84; + else if (numseq > 9) fvalue = 4.96; + else if (numseq > 8) fvalue = 5.12; + else if (numseq > 7) fvalue = 5.32; + else if (numseq > 6) fvalue = 5.59; + else if (numseq > 5) fvalue = 5.99; + else if (numseq > 4) fvalue = 6.61; + else if (numseq > 3) fvalue = 7.71; + else if (numseq > 2) fvalue = 10.1; + else if (numseq > 1) fvalue = 18.5; + else if (numseq > 0) fvalue = 161; + else if (numseq <= 0) { + mothurOut("Two or more reference sequences are required, your data will be flawed.\n"); mothurOutEndLine(); + } + + return fvalue; + } + catch(exception& e) { + errorOut(e, "Ccode", "getF"); + exit(1); + } +} + /**************************************************************************************************/ void Ccode::createProcessesClosest() { try { @@ -491,9 +1047,18 @@ void Ccode::createProcessesClosest() { //output pairs for (int i = lines[process]->start; i < lines[process]->end; i++) { for (int j = 0; j < closest[i].size(); j++) { - closest[i][j]->printSequence(out); + closest[i][j].seq->printSequence(out); } } + out << ">" << endl; //to stop sequence read + + for (int i = lines[process]->start; i < lines[process]->end; i++) { + for (int j = 0; j < closest[i].size(); j++) { + out << closest[i][j].dist << '\t'; + } + out << endl; + } + out.close(); exit(0); @@ -512,9 +1077,9 @@ void Ccode::createProcessesClosest() { string s = toString(processIDS[i]) + ".temp"; openInputFile(s, in); + vector< vector > tempClosest; tempClosest.resize(querySeqs.size()); //get pairs for (int k = lines[i]->start; k < lines[i]->end; k++) { - vector tempVector; for (int j = 0; j < numWanted; j++) { @@ -525,9 +1090,29 @@ void Ccode::createProcessesClosest() { tempVector.push_back(temp); } - closest[k] = tempVector; + tempClosest[k] = tempVector; } + string junk; + in >> junk; gobble(in); // to get ">" + + vector< vector > dists; dists.resize(querySeqs.size()); + + for (int i = lines[process]->start; i < lines[process]->end; i++) { + dists[i].resize(closest[i].size()); + for (int j = 0; j < closest[i].size(); j++) { + in >> dists[i][j]; + } + gobble(in); + } + + for (int i = lines[process]->start; i < lines[process]->end; i++) { + for (int j = 0; j < closest[i].size(); j++) { + closest[i][j].seq = tempClosest[i][j]; + closest[i][j].dist = dists[i][j]; + } + } + in.close(); remove(s.c_str()); } diff --git a/ccode.h b/ccode.h index cced0b7..6322292 100644 --- a/ccode.h +++ b/ccode.h @@ -45,21 +45,42 @@ class Ccode : public Chimera { vector templateLines; vector querySeqs; vector templateSeqs; + vector< map > spotMap; + map::iterator it; - vector windows; - vector< vector > closest; //closest[0] is a vector of sequence at are closest to queryseqs[0]... + vector< vector > windows; //windows[0] is the vector of window breaks for querySeqs[0] + vector windowSizes; //windowSizes[0] is the size of the windows for querySeqs[0] + vector< map > trim; //trim[0] is the map containing the starting and ending positions for querySeqs[0] set of seqs + vector< vector > closest; //closest[0] is a vector of sequence at are closest to queryseqs[0]... vector< vector > averageRef; //averageRef[0] is the average distance at each window for the references for querySeqs[0] vector< vector > averageQuery; //averageQuery[0] is the average distance at each winow for the query for querySeqs[0] + vector< vector > sumRef; //sumRef[0] is the sum of distances at each window for the references for querySeqs[0] + vector< vector > sumSquaredRef; //sumSquaredRef[0] is the sum of squared distances at each window for the references for querySeqs[0] + vector< vector > sumQuery; //sumQuery[0] is the sum of distances at each window for the comparison of query to references for querySeqs[0] + vector< vector > sumSquaredQuery; //sumSquaredQuery[0] is the sum of squared distances at each window for the comparison of query to references for querySeqs[0] + vector< vector > varRef; //varRef[0] is the variance among references seqs at each window for querySeqs[0] + vector< vector > varQuery; //varQuery[0] is the variance among references and querySeqs[0] at each window + vector< vector > sdRef; //sdRef[0] is the standard deviation of references seqs at each window for querySeqs[0] + vector< vector > sdQuery; //sdQuery[0] is the standard deviation of references and querySeqs[0] at each window + vector< vector > anova; //anova[0] is the vector of anova scores for each window for querySeqs[0] + vector refCombo; //refCombo[0] is the number of reference sequences combinations for querySeqs[0] + vector< vector > isChimericConfidence; //isChimericConfidence[0] indicates whether querySeqs[0] is chimeric at a given window according to the confidence limits + vector< vector > isChimericTStudent; //isChimericConfidence[0] indicates whether querySeqs[0] is chimeric at a given window according to the confidence limits + vector< vector > isChimericANOVA; //isChimericConfidence[0] indicates whether querySeqs[0] is chimeric at a given window according to the confidence limits - vector< vector > findClosest(int, int, int); - void removeBadReferenceSeqs(vector&, int); //removes sequences from closest that are to different of too similar to eachother. - void trimSequences(); - vector findWindows(); - vector getAverageRef(vector); - vector getAverageQuery (vector, int); - + vector< vector > findClosest(int, int, int); + void removeBadReferenceSeqs(vector&, int); //removes sequences from closest that are to different of too similar to eachother. + void trimSequences(int); + vector findWindows(int); + void getAverageRef(vector, int); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i]. + void getAverageQuery (vector, int); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i]. + void findVarianceRef (int); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0. + void findVarianceQuery (int); //fills varQuery[i] and sdQuery[i] + void determineChimeras (int); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. int getDiff(string, string); //return number of mismatched bases, a gap to base is not counted as a mismatch + float getT(int); + float getF(int); void createProcessesClosest(); diff --git a/decalc.cpp b/decalc.cpp index 0589751..2d60899 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -13,13 +13,17 @@ void DeCalculator::setMask(string m) { try { seqMask = m; + int count = 0; + maskMap.clear(); if (seqMask.length() != 0) { //whereever there is a base in the mask, save that value is query and subject for (int i = 0; i < seqMask.length(); i++) { if (isalpha(seqMask[i])) { h.insert(i); + maskMap[i] = count; } + count++; } }else { for (int i = 0; i < alignLength; i++) { h.insert(i); } diff --git a/decalc.h b/decalc.h index da0f96c..dca562a 100644 --- a/decalc.h +++ b/decalc.h @@ -57,6 +57,8 @@ class DeCalculator { vector returnObviousOutliers(vector< vector >, int); + map getMaskMap() { return maskMap; } + private: //vector sortContrib(map); //used by mallard float findAverage(vector); @@ -65,6 +67,7 @@ class DeCalculator { string seqMask; set h; int alignLength; + map maskMap; }; -- 2.39.2