X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=ccode.cpp;h=bfcd44d8239a30f8ad18db73f0c06268dc0e7474;hb=5a1e62397b91f57d0d3aff635891df04b8999a88;hp=7e6154b6c5af1d69259223025d9da9f7f3f5d4df;hpb=4f8291bc0a482c68dc739ef5aa7a311fc74f5f43;p=mothur.git diff --git a/ccode.cpp b/ccode.cpp index 7e6154b..bfcd44d 100644 --- a/ccode.cpp +++ b/ccode.cpp @@ -13,307 +13,218 @@ //*************************************************************************************************************** -Ccode::Ccode(string filename, string temp) { fastafile = filename; templateFile = temp; } +Ccode::Ccode(string filename, string o) { + fastafile = filename; outputDir = o; + distCalc = new eachGapDist(); + decalc = new DeCalculator(); + + mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo"; + ofstream out2; + openOutputFile(mapInfo, out2); + + out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl; + out2.close(); +} //*************************************************************************************************************** - Ccode::~Ccode() { - try { - for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } - for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } - delete distCalc; - } - catch(exception& e) { - errorOut(e, "Ccode", "~Ccode"); - exit(1); - } + delete distCalc; + delete decalc; } //*************************************************************************************************************** +void Ccode::printHeader(ostream& out) { + out << "For full window mapping info refer to " << mapInfo << endl << endl; +} +//*************************************************************************************************************** void Ccode::print(ostream& out) { try { mothurOutEndLine(); - string mapInfo = getRootName(fastafile) + "mapinfo"; ofstream out2; - openOutputFile(mapInfo, out2); - - out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl; + openOutputFileAppend(mapInfo, out2); - for (int j = 0; j < querySeqs.size(); j++) { - out2 << querySeqs[j]->getName() << endl; - for (it = spotMap[j].begin(); it!= spotMap[j].end(); it++) { - out2 << it->first << '\t' << it->second << endl; - } + out2 << querySeq->getName() << endl; + for (it = spotMap.begin(); it!= spotMap.end(); it++) { + out2 << it->first << '\t' << it->second << endl; } out2.close(); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); - out << "For full window mapping info refer to " << mapInfo << endl << endl; - - for (int i = 0; i < querySeqs.size(); i++) { - - out << querySeqs[i]->getName() << endl << endl << "Reference sequences used and distance to query:" << endl; + out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl; - for (int j = 0; j < closest[i].size(); j++) { - out << setprecision(3) << closest[i][j].seq->getName() << '\t' << closest[i][j].dist << endl; - } - out << endl << endl; + for (int j = 0; j < closest.size(); j++) { + out << setprecision(3) << closest[j].seq->getName() << '\t' << closest[j].dist << endl; + } + out << endl << endl; - //for each window - //window mapping info. - out << "Mapping information: "; - //you mask and did not filter - if ((seqMask != "") && (!filter)) { out << "mask and trim."; } + //for each window + //window mapping info. + out << "Mapping information: "; + //you mask and did not filter + if ((seqMask != "") && (!filter)) { out << "mask and trim."; } - //you filtered and did not mask - if ((seqMask == "") && (filter)) { out << "filter and trim."; } + //you filtered and did not mask + if ((seqMask == "") && (filter)) { out << "filter and trim."; } - //you masked and filtered - if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; } + //you masked and filtered + if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; } - out << endl << "Window\tStartPos\tEndPos" << endl; - it = trim[i].begin(); + out << endl << "Window\tStartPos\tEndPos" << endl; + it = trim.begin(); - for (int k = 0; k < windows[i].size()-1; k++) { - out << k+1 << '\t' << spotMap[i][windows[i][k]-it->first] << '\t' << spotMap[i][windows[i][k]-it->first+windowSizes[i]] << endl; - } + for (int k = 0; k < windows.size()-1; k++) { + out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl; + } - out << windows[i].size() << '\t' << spotMap[i][windows[i][windows[i].size()-1]-it->first] << '\t' << spotMap[i][it->second-it->first-1] << endl; - out << endl; + out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl; + out << endl; - out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl; - for (int k = 0; k < windows[i].size(); k++) { - float ds = averageQuery[i][k] / averageRef[i][k]; - out << k+1 << '\t' << averageQuery[i][k] << '\t' << sdQuery[i][k] << '\t' << averageRef[i][k] << '\t'<< sdRef[i][k] << '\t' << ds << '\t' << anova[i][k] << endl; - } - out << endl; + out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl; + for (int k = 0; k < windows.size(); k++) { + float ds = averageQuery[k] / averageRef[k]; + out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl; + } + out << endl; - //varRef - //varQuery - /* F test for differences among variances. - * varQuery is expected to be higher or similar than varRef */ - //float fs = varQuery[query][i] / varRef[query][i]; /* F-Snedecor, test for differences of variances */ + //varRef + //varQuery + /* F test for differences among variances. + * varQuery is expected to be higher or similar than varRef */ + //float fs = varQuery[query] / varRef[query]; /* F-Snedecor, test for differences of variances */ - bool results = false; + bool results = false; - //confidence limit, t - Student, anova - out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl; + //confidence limit, t - Student, anova + out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl; - for (int k = 0; k < windows[i].size(); k++) { - string temp = ""; - if (isChimericConfidence[i][k]) { temp += "*\t"; } - else { temp += "\t"; } + for (int k = 0; k < windows.size(); k++) { + string temp = ""; + if (isChimericConfidence[k]) { temp += "*\t"; } + else { temp += "\t"; } - if (isChimericTStudent[i][k]) { temp += "*\t"; } - else { temp += "\t"; } + if (isChimericTStudent[k]) { temp += "*\t"; } + else { temp += "\t"; } - if (isChimericANOVA[i][k]) { temp += "*\t"; } - else { temp += "\t"; } + if (isChimericANOVA[k]) { temp += "*\t"; } + else { temp += "\t"; } - out << k+1 << '\t' << temp << endl; + out << k+1 << '\t' << temp << endl; - if (temp == "*\t*\t*\t") { results = true; } - } - out << endl; + if (temp == "*\t*\t*\t") { results = true; } + } + out << endl; - if (results) { - mothurOut(querySeqs[i]->getName() + " was found have at least one chimeric window."); mothurOutEndLine(); - } + if (results) { + mothurOut(querySeq->getName() + " was found have at least one chimeric window."); mothurOutEndLine(); } + } catch(exception& e) { errorOut(e, "Ccode", "print"); exit(1); } } - //*************************************************************************************************************** -void Ccode::getChimeras() { +int Ccode::getChimeras(Sequence* query) { try { - - //read in query sequences and subject sequences - mothurOut("Reading sequences and template file... "); cout.flush(); - querySeqs = readSeqs(fastafile); - templateSeqs = readSeqs(templateFile); - mothurOut("Done."); mothurOutEndLine(); - - int numSeqs = querySeqs.size(); - - closest.resize(numSeqs); - - refCombo.resize(numSeqs, 0); - sumRef.resize(numSeqs); - varRef.resize(numSeqs); - varQuery.resize(numSeqs); - sdRef.resize(numSeqs); - sdQuery.resize(numSeqs); - sumQuery.resize(numSeqs); - sumSquaredRef.resize(numSeqs); - sumSquaredQuery.resize(numSeqs); - averageRef.resize(numSeqs); - averageQuery.resize(numSeqs); - anova.resize(numSeqs); - isChimericConfidence.resize(numSeqs); - isChimericTStudent.resize(numSeqs); - isChimericANOVA.resize(numSeqs); - trim.resize(numSeqs); - spotMap.resize(numSeqs); - windowSizes.resize(numSeqs, window); - windows.resize(numSeqs); - - //break up file if needed - int linesPerProcess = numSeqs / processors ; - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - //find breakup of sequences for all times we will Parallelize - if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } - else { - //fill line pairs - for (int i = 0; i < (processors-1); i++) { - lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); - } - //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end - int i = processors - 1; - lines.push_back(new linePair((i*linesPerProcess), numSeqs)); - } - - //find breakup of templatefile for quantiles - if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); } - else { - for (int i = 0; i < processors; i++) { - templateLines.push_back(new linePair()); - templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size()); - templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size()); - } - } - #else - lines.push_back(new linePair(0, numSeqs)); - templateLines.push_back(new linePair(0, templateSeqs.size())); - #endif - distCalc = new eachGapDist(); - decalc = new DeCalculator(); - - //find closest - if (processors == 1) { - mothurOut("Finding top matches for sequences... "); cout.flush(); - closest = findClosest(lines[0]->start, lines[0]->end, numWanted); - mothurOut("Done."); mothurOutEndLine(); - }else { createProcessesClosest(); } + closest.clear(); + refCombo = 0; + sumRef.clear(); + varRef.clear(); + varQuery.clear(); + sdRef.clear(); + sdQuery.clear(); + sumQuery.clear(); + sumSquaredRef.clear(); + sumSquaredQuery.clear(); + averageRef.clear(); + averageQuery.clear(); + anova.clear(); + isChimericConfidence.clear(); + isChimericTStudent.clear(); + isChimericANOVA.clear(); + trim.clear(); + spotMap.clear(); + windowSizes = window; + windows.clear(); - -for (int i = 0; i < closest.size(); i++) { - //cout << querySeqs[i]->getName() << ": "; - string file = querySeqs[i]->getName() + ".input"; - ofstream o; - openOutputFile(file, o); - querySeqs[i]->printSequence(o); - for (int j = 0; j < closest[i].size(); j++) { - //cout << closest[i][j].seq->getName() << '\t'; - closest[i][j].seq->printSequence(o); - } - //cout << endl; - o.close(); -} - for (int j = 0; j < numSeqs; j++) { - for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) { - spotMap[j][i] = i; - } - } + querySeq = query; + //find closest matches to query + closest = findClosest(query, numWanted); + + //initialize spotMap + for (int i = 0; i < query->getAligned().length(); i++) { spotMap[i] = i; } + //mask sequences if the user wants to if (seqMask != "") { decalc->setMask(seqMask); - //mask querys - for (int i = 0; i < querySeqs.size(); i++) { - decalc->runMask(querySeqs[i]); - } - - //mask templates - for (int i = 0; i < templateSeqs.size(); i++) { - decalc->runMask(templateSeqs[i]); - } + decalc->runMask(query); - for (int i = 0; i < numSeqs; i++) { - spotMap[i] = decalc->getMaskMap(); - } + //mask closest + for (int i = 0; i < closest.size(); i++) { decalc->runMask(closest[i].seq); } + + spotMap = decalc->getMaskMap(); } if (filter) { - vector temp = templateSeqs; - for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); } + vector temp; + for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq); } + temp.push_back(query); createFilter(temp); - runFilter(querySeqs); - runFilter(templateSeqs); + for (int i = 0; i < temp.size(); i++) { runFilter(temp[i]); } //update spotMap map newMap; int spot = 0; - int j = 0; for (int i = 0; i < filterString.length(); i++) { if (filterString[i] == '1') { //add to newMap - newMap[spot] = spotMap[j][i]; + newMap[spot] = spotMap[i]; spot++; } } - - for (int i = 0; i < numSeqs; i++) { - spotMap[i] = newMap; - } + spotMap = newMap; } //trim sequences - this follows ccodes remove_extra_gaps - for (int i = 0; i < querySeqs.size(); i++) { - trimSequences(i); - } + trimSequences(query); + //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length(). //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations - for (int i = 0; i < querySeqs.size(); i++) { - windows[i] = findWindows(i); - } + windows = findWindows(); + - //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later - should be paralellized - for (int i = 0; i < closest.size(); i++) { - removeBadReferenceSeqs(closest[i], i); - } + //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later + removeBadReferenceSeqs(closest); - - //find the averages for each querys references - should be paralellized - for (int i = 0; i < numSeqs; i++) { - getAverageRef(closest[i], i); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i]. - } - - //find the averages for the query - should be paralellized - for (int i = 0; i < numSeqs; i++) { - getAverageQuery(closest[i], i); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i]. - } - //find the averages for each querys references - should be paralellized - for (int i = 0; i < numSeqs; i++) { - findVarianceRef(i); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0. - } + //find the averages for each querys references + getAverageRef(closest); //fills sumRef, averageRef, sumSquaredRef and refCombo. + getAverageQuery(closest, query); //fills sumQuery, averageQuery, sumSquaredQuery. + - //find the averages for the query - should be paralellized - for (int i = 0; i < numSeqs; i++) { - findVarianceQuery(i); //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0. - } - - for (int i = 0; i < numSeqs; i++) { - determineChimeras(i); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. - should be paralellized - } - - //free memory - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } - for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } + //find the averages for each querys references + findVarianceRef(); //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0. + + //find the averages for the query + findVarianceQuery(); //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0. + + determineChimeras(); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. + + //free memory + for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; } + + return 0; } catch(exception& e) { errorOut(e, "Ccode", "getChimeras"); @@ -322,17 +233,17 @@ for (int i = 0; i < closest.size(); i++) { } /***************************************************************************************************************/ //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences." -void Ccode::trimSequences(int query) { +void Ccode::trimSequences(Sequence* query) { try { int frontPos = 0; //should contain first position in all seqs that is not a gap character - int rearPos = querySeqs[query]->getAligned().length(); + int rearPos = query->getAligned().length(); //********find first position in closest seqs that is a non gap character***********// //find first position all query seqs that is a non gap character - for (int i = 0; i < closest[query].size(); i++) { + for (int i = 0; i < closest.size(); i++) { - string aligned = closest[query][i].seq->getAligned(); + string aligned = closest[i].seq->getAligned(); int pos = 0; //find first spot in this seq @@ -348,7 +259,7 @@ void Ccode::trimSequences(int query) { } //find first position all querySeq[query] that is a non gap character - string aligned = querySeqs[query]->getAligned(); + string aligned = query->getAligned(); int pos = 0; //find first spot in this seq @@ -364,9 +275,9 @@ void Ccode::trimSequences(int query) { //********find last position in closest seqs that is a non gap character***********// - for (int i = 0; i < closest[query].size(); i++) { + for (int i = 0; i < closest.size(); i++) { - string aligned = closest[query][i].seq->getAligned(); + string aligned = closest[i].seq->getAligned(); int pos = aligned.length(); //find first spot in this seq @@ -382,7 +293,7 @@ void Ccode::trimSequences(int query) { } //find last position all querySeqs[query] that is a non gap character - aligned = querySeqs[query]->getAligned(); + aligned = query->getAligned(); pos = aligned.length(); //find first spot in this seq @@ -404,7 +315,7 @@ void Ccode::trimSequences(int query) { tempTrim[frontPos] = rearPos; //save trimmed locations - trim[query] = tempTrim; + trim = tempTrim; //update spotMask map newMap; @@ -412,56 +323,45 @@ void Ccode::trimSequences(int query) { for (int i = frontPos; i < rearPos; i++) { //add to newMap -//cout << query << '\t' << i << '\t' << spotMap[query][i] << endl; - newMap[spot] = spotMap[query][i]; + newMap[spot] = spotMap[i]; spot++; } - - //for (it = newMap.begin(); it!=newMap.end(); it++) { - //cout << query << '\t' << it->first << '\t' << it->second << endl; - //} - - spotMap[query] = newMap; - - + spotMap = newMap; } catch(exception& e) { errorOut(e, "Ccode", "trimSequences"); exit(1); } - } /***************************************************************************************************************/ -vector Ccode::findWindows(int query) { +vector Ccode::findWindows() { try { vector win; - it = trim[query].begin(); + it = trim.begin(); int length = it->second - it->first; - + //default is wanted = 10% of total length - if (windowSizes[query] > length) { + if (windowSizes > length) { mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length."); - windowSizes[query] = length / 10; - }else if (windowSizes[query] == 0) { windowSizes[query] = length / 10; } - else if (windowSizes[query] > (length / 20)) { + windowSizes = length / 10; + }else if (windowSizes == 0) { windowSizes = length / 10; } + else if (windowSizes > (length * 0.20)) { mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine(); - }else if (windowSizes[query] < (length / 5)) { + }else if (windowSizes < (length * 0.05)) { mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine(); } //save starting points of each window - for (int m = it->first; m < (it->second-windowSizes[query]); m+=windowSizes[query]) { win.push_back(m); } + for (int m = it->first; m < (it->second-windowSizes); m+=windowSizes) { win.push_back(m); } //save last window if (win[win.size()-1] < (it->first+length)) { - win.push_back(win[win.size()-1]+windowSizes[query]); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75 + win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75 } //with this you would get 1,25,50,75,100 - return win; - } catch(exception& e) { errorOut(e, "Ccode", "findWindows"); @@ -525,7 +425,7 @@ int Ccode::getDiff(string seqA, string seqB) { } //*************************************************************************************************************** //tried to make this look most like ccode original implementation -void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { +void Ccode::removeBadReferenceSeqs(vector& seqs) { try { vector< vector > numDiffBases; @@ -533,7 +433,7 @@ void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { //initialize to 0 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); } - it = trim[query].begin(); + it = trim.begin(); int length = it->second - it->first; //calc differences from each sequence to everyother seq in the set @@ -589,9 +489,8 @@ void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { seqs = goodSeqs; }else { //warn, but dont remove any - mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); mothurOutEndLine(); + mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); mothurOutEndLine(); } - } catch(exception& e) { @@ -600,46 +499,33 @@ void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { } } //*************************************************************************************************************** -vector< vector > Ccode::findClosest(int start, int end, int numWanted) { +//makes copy of templateseq for filter +vector Ccode::findClosest(Sequence* q, int numWanted) { try{ - vector< vector > topMatches; topMatches.resize(querySeqs.size()); - - float smallestOverall, smallestLeft, smallestRight; - smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000; + vector topMatches; - //for each sequence in querySeqs - find top matches to use as reference - for(int j = start; j < end; j++){ - - Sequence query = *(querySeqs[j]); + Sequence query = *(q); - vector distances; + //calc distance to each sequence in template seqs + for (int i = 0; i < templateSeqs.size(); i++) { - //calc distance to each sequence in template seqs - for (int i = 0; i < templateSeqs.size(); i++) { - - Sequence ref = *(templateSeqs[i]); + Sequence ref = *(templateSeqs[i]); - //find overall dist - distCalc->calcDist(query, ref); - float dist = distCalc->getDist(); + //find overall dist + distCalc->calcDist(query, ref); + float dist = distCalc->getDist(); - //save distance - SeqDist temp; - temp.seq = templateSeqs[i]; - temp.dist = dist; + //save distance + SeqDist temp; + temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned()); + temp.dist = dist; - distances.push_back(temp); - } - - sort(distances.begin(), distances.end(), compareSeqDist); - - //save the number of top matches wanted - for (int h = 0; h < numWanted; h++) { - topMatches[j].push_back(distances[h]); - } + topMatches.push_back(temp); } + sort(topMatches.begin(), topMatches.end(), compareSeqDist); + return topMatches; } @@ -650,21 +536,21 @@ vector< vector > Ccode::findClosest(int start, int end, int numWanted) } /**************************************************************************************************/ //find the distances from each reference sequence to every other reference sequence for each window for this query -void Ccode::getAverageRef(vector ref, int query) { +void Ccode::getAverageRef(vector ref) { try { - vector< vector< vector > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2. + vector< vector< vector > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2. //initialize diffs vector diffs.resize(ref.size()); for (int i = 0; i < diffs.size(); i++) { diffs[i].resize(ref.size()); for (int j = 0; j < diffs[i].size(); j++) { - diffs[i][j].resize(windows[query].size(), 0); + diffs[i][j].resize(windows.size(), 0); } } - it = trim[query].begin(); + it = trim.begin(); //find the distances from each reference sequence to every other reference sequence for each window for this query for (int i = 0; i < ref.size(); i++) { @@ -676,23 +562,23 @@ void Ccode::getAverageRef(vector ref, int query) { string refJ = ref[j].seq->getAligned(); - for (int k = 0; k < windows[query].size(); k++) { + for (int k = 0; k < windows.size(); k++) { string refIWindowk, refJWindowk; - if (k < windows[query].size()-1) { + if (k < windows.size()-1) { //get window strings - refIWindowk = refI.substr(windows[query][k], windowSizes[query]); - refJWindowk = refJ.substr(windows[query][k], windowSizes[query]); + refIWindowk = refI.substr(windows[k], windowSizes); + refJWindowk = refJ.substr(windows[k], windowSizes); }else { //last window may be smaller than rest - see findwindows //get window strings - refIWindowk = refI.substr(windows[query][k], (it->second-windows[query][k])); - refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k])); + refIWindowk = refI.substr(windows[k], (it->second-windows[k])); + refJWindowk = refJ.substr(windows[k], (it->second-windows[k])); } //find differences int diff = getDiff(refIWindowk, refJWindowk); -//cout << i << '\t' << j << '\t' << k << '\t' << diff << endl; + //save differences in [i][j][k] and [j][i][k] since they are the same diffs[i][j][k] = diff; diffs[j][i][k] = diff; @@ -704,20 +590,20 @@ void Ccode::getAverageRef(vector ref, int query) { }//i //initialize sumRef for this query - sumRef[query].resize(windows[query].size(), 0); - sumSquaredRef[query].resize(windows[query].size(), 0); - averageRef[query].resize(windows[query].size(), 0); + sumRef.resize(windows.size(), 0); + sumSquaredRef.resize(windows.size(), 0); + averageRef.resize(windows.size(), 0); //find the sum of the differences for hte reference sequences for (int i = 0; i < diffs.size(); i++) { for (int j = 0; j < i; j++) { //increment this querys reference sequences combos - refCombo[query]++; + refCombo++; for (int k = 0; k < diffs[i][j].size(); k++) { - sumRef[query][k] += diffs[i][j][k]; - sumSquaredRef[query][k] += (diffs[i][j][k]*diffs[i][j][k]); + sumRef[k] += diffs[i][j][k]; + sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]); }//k }//j @@ -725,8 +611,8 @@ void Ccode::getAverageRef(vector ref, int query) { //find the average of the differences for the references for each window - for (int i = 0; i < windows[query].size(); i++) { - averageRef[query][i] = sumRef[query][i] / (float) refCombo[query]; + for (int i = 0; i < windows.size(); i++) { + averageRef[i] = sumRef[i] / (float) refCombo; } } @@ -736,7 +622,7 @@ void Ccode::getAverageRef(vector ref, int query) { } } /**************************************************************************************************/ -void Ccode::getAverageQuery (vector ref, int query) { +void Ccode::getAverageQuery (vector ref, Sequence* query) { try { vector< vector > diffs; //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2. @@ -744,35 +630,35 @@ void Ccode::getAverageQuery (vector ref, int query) { //initialize diffs vector diffs.resize(ref.size()); for (int j = 0; j < diffs.size(); j++) { - diffs[j].resize(windows[query].size(), 0); + diffs[j].resize(windows.size(), 0); } - it = trim[query].begin(); + it = trim.begin(); - string refQuery = querySeqs[query]->getAligned(); + string refQuery = query->getAligned(); //jgetAligned(); - for (int k = 0; k < windows[query].size(); k++) { + for (int k = 0; k < windows.size(); k++) { string QueryWindowk, refJWindowk; - if (k < windows[query].size()-1) { + if (k < windows.size()-1) { //get window strings - QueryWindowk = refQuery.substr(windows[query][k], windowSizes[query]); - refJWindowk = refJ.substr(windows[query][k], windowSizes[query]); + QueryWindowk = refQuery.substr(windows[k], windowSizes); + refJWindowk = refJ.substr(windows[k], windowSizes); }else { //last window may be smaller than rest - see findwindows //get window strings - QueryWindowk = refQuery.substr(windows[query][k], (it->second-windows[query][k])); - refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k])); + QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k])); + refJWindowk = refJ.substr(windows[k], (it->second-windows[k])); } //find differences int diff = getDiff(QueryWindowk, refJWindowk); -//cout << j << '\t' << k << '\t' << diff << endl; + //save differences diffs[j][k] = diff; @@ -781,25 +667,23 @@ void Ccode::getAverageQuery (vector ref, int query) { //initialize sumRef for this query - sumQuery[query].resize(windows[query].size(), 0); - sumSquaredQuery[query].resize(windows[query].size(), 0); - averageQuery[query].resize(windows[query].size(), 0); + sumQuery.resize(windows.size(), 0); + sumSquaredQuery.resize(windows.size(), 0); + averageQuery.resize(windows.size(), 0); //find the sum of the differences for (int j = 0; j < diffs.size(); j++) { for (int k = 0; k < diffs[j].size(); k++) { - sumQuery[query][k] += diffs[j][k]; - sumSquaredQuery[query][k] += (diffs[j][k]*diffs[j][k]); + sumQuery[k] += diffs[j][k]; + sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]); }//k }//j //find the average of the differences for the references for each window - for (int i = 0; i < windows[query].size(); i++) { - averageQuery[query][i] = sumQuery[query][i] / (float) ref.size(); + for (int i = 0; i < windows.size(); i++) { + averageQuery[i] = sumQuery[i] / (float) ref.size(); } - - } catch(exception& e) { errorOut(e, "Ccode", "getAverageQuery"); @@ -807,23 +691,23 @@ void Ccode::getAverageQuery (vector ref, int query) { } } /**************************************************************************************************/ -void Ccode::findVarianceRef (int query) { +void Ccode::findVarianceRef() { try { - varRef[query].resize(windows[query].size(), 0); - sdRef[query].resize(windows[query].size(), 0); + varRef.resize(windows.size(), 0); + sdRef.resize(windows.size(), 0); //for each window - for (int i = 0; i < windows[query].size(); i++) { - varRef[query][i] = (sumSquaredRef[query][i] - ((sumRef[query][i]*sumRef[query][i])/(float)refCombo[query])) / (float)(refCombo[query]-1); - sdRef[query][i] = sqrt(varRef[query][i]); + for (int i = 0; i < windows.size(); i++) { + varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1); + sdRef[i] = sqrt(varRef[i]); //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation - if (averageRef[query][i] < 0.001) { averageRef[query][i] = 0.001; } - if (sumRef[query][i] < 0.001) { sumRef[query][i] = 0.001; } - if (varRef[query][i] < 0.001) { varRef[query][i] = 0.001; } - if (sumSquaredRef[query][i] < 0.001) { sumSquaredRef[query][i] = 0.001; } - if (sdRef[query][i] < 0.001) { sdRef[query][i] = 0.001; } + if (averageRef[i] < 0.001) { averageRef[i] = 0.001; } + if (sumRef[i] < 0.001) { sumRef[i] = 0.001; } + if (varRef[i] < 0.001) { varRef[i] = 0.001; } + if (sumSquaredRef[i] < 0.001) { sumSquaredRef[i] = 0.001; } + if (sdRef[i] < 0.001) { sdRef[i] = 0.001; } } } @@ -833,22 +717,22 @@ void Ccode::findVarianceRef (int query) { } } /**************************************************************************************************/ -void Ccode::findVarianceQuery (int query) { +void Ccode::findVarianceQuery() { try { - varQuery[query].resize(windows[query].size(), 0); - sdQuery[query].resize(windows[query].size(), 0); + varQuery.resize(windows.size(), 0); + sdQuery.resize(windows.size(), 0); //for each window - for (int i = 0; i < windows[query].size(); i++) { - varQuery[query][i] = (sumSquaredQuery[query][i] - ((sumQuery[query][i]*sumQuery[query][i])/(float) closest[query].size())) / (float) (closest[query].size()-1); - sdQuery[query][i] = sqrt(varQuery[query][i]); + for (int i = 0; i < windows.size(); i++) { + varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1); + sdQuery[i] = sqrt(varQuery[i]); //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation - if (averageQuery[query][i] < 0.001) { averageQuery[query][i] = 0.001; } - if (sumQuery[query][i] < 0.001) { sumQuery[query][i] = 0.001; } - if (varQuery[query][i] < 0.001) { varQuery[query][i] = 0.001; } - if (sumSquaredQuery[query][i] < 0.001) { sumSquaredQuery[query][i] = 0.001; } - if (sdQuery[query][i] < 0.001) { sdQuery[query][i] = 0.001; } + if (averageQuery[i] < 0.001) { averageQuery[i] = 0.001; } + if (sumQuery[i] < 0.001) { sumQuery[i] = 0.001; } + if (varQuery[i] < 0.001) { varQuery[i] = 0.001; } + if (sumSquaredQuery[i] < 0.001) { sumSquaredQuery[i] = 0.001; } + if (sdQuery[i] < 0.001) { sdQuery[i] = 0.001; } } } @@ -858,44 +742,44 @@ void Ccode::findVarianceQuery (int query) { } } /**************************************************************************************************/ -void Ccode::determineChimeras (int query) { +void Ccode::determineChimeras() { try { - isChimericConfidence[query].resize(windows[query].size(), false); - isChimericTStudent[query].resize(windows[query].size(), false); - isChimericANOVA[query].resize(windows[query].size(), false); - anova[query].resize(windows[query].size()); + isChimericConfidence.resize(windows.size(), false); + isChimericTStudent.resize(windows.size(), false); + isChimericANOVA.resize(windows.size(), false); + anova.resize(windows.size()); //for each window - for (int i = 0; i < windows[query].size(); i++) { + for (int i = 0; i < windows.size(); i++) { //get confidence limits - float t = getT(closest[query].size()-1); //how many seqs you are comparing to this querySeq - float dsUpper = (averageQuery[query][i] + (t * sdQuery[query][i])) / averageRef[query][i]; - float dsLower = (averageQuery[query][i] - (t * sdQuery[query][i])) / averageRef[query][i]; -//cout << t << '\t' << "ds upper = " << dsUpper << " dsLower = " << dsLower << endl; - if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[query][i] > averageRef[query][i])) { /* range does not include 1 */ - isChimericConfidence[query][i] = true; /* significantly higher at P<0.05 */ -//cout << i << " made it here" << endl; + float t = getT(closest.size()-1); //how many seqs you are comparing to this querySeq + float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i]; + float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i]; + + if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) { /* range does not include 1 */ + isChimericConfidence[i] = true; /* significantly higher at P<0.05 */ + } //student t test - int degreeOfFreedom = refCombo[query] + closest[query].size() - 2; - float denomForT = (((refCombo[query]-1) * varQuery[query][i] + (closest[query].size() - 1) * varRef[query][i]) / (float) degreeOfFreedom) * ((refCombo[query] + closest[query].size()) / (float) (refCombo[query] * closest[query].size())); /* denominator, without sqrt(), for ts calculations */ + int degreeOfFreedom = refCombo + closest.size() - 2; + float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size())); /* denominator, without sqrt(), for ts calculations */ - float ts = fabs((averageQuery[query][i] - averageRef[query][i]) / (sqrt(denomForT))); /* value of ts for t-student test */ + float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT))); /* value of ts for t-student test */ t = getT(degreeOfFreedom); -//cout << i << '\t' << t << '\t' << ts << endl; - if ((ts >= t) && (averageQuery[query][i] > averageRef[query][i])) { - isChimericTStudent[query][i] = true; /* significantly higher at P<0.05 */ + + if ((ts >= t) && (averageQuery[i] > averageRef[i])) { + isChimericTStudent[i] = true; /* significantly higher at P<0.05 */ } //anova test - float value1 = sumQuery[query][i] + sumRef[query][i]; - float value2 = sumSquaredQuery[query][i] + sumSquaredRef[query][i]; - float value3 = ((sumQuery[query][i]*sumQuery[query][i]) / (float) (closest[query].size())) + ((sumRef[query][i] * sumRef[query][i]) / (float) refCombo[query]); - float value4 = (value1 * value1) / ( (float) (closest[query].size() + refCombo[query]) ); + float value1 = sumQuery[i] + sumRef[i]; + float value2 = sumSquaredQuery[i] + sumSquaredRef[i]; + float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo); + float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) ); float value5 = value2 - value4; float value6 = value3 - value4; float value7 = value5 - value6; @@ -904,11 +788,13 @@ void Ccode::determineChimeras (int query) { float f = getF(degreeOfFreedom); - if ((anovaValue >= f) && (averageQuery[query][i] > averageRef[query][i])) { - isChimericANOVA[query][i] = true; /* significant P<0.05 */ + if ((anovaValue >= f) && (averageQuery[i] > averageRef[i])) { + isChimericANOVA[i] = true; /* significant P<0.05 */ } - anova[query][i] = anovaValue; + if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; } + + anova[i] = anovaValue; } } @@ -1020,116 +906,7 @@ float Ccode::getF(int numseq) { exit(1); } } +//*************************************************************************************************************** -/**************************************************************************************************/ -void Ccode::createProcessesClosest() { - try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - closest = findClosest(lines[process]->start, lines[process]->end, numWanted); - mothurOut("Done finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - //output pairs - for (int i = lines[process]->start; i < lines[process]->end; i++) { - for (int j = 0; j < closest[i].size(); j++) { - closest[i][j].seq->printSequence(out); - } - } - out << ">" << endl; //to stop sequence read - - for (int i = lines[process]->start; i < lines[process]->end; i++) { - for (int j = 0; j < closest[i].size(); j++) { - out << closest[i][j].dist << '\t'; - } - out << endl; - } - - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;i > tempClosest; tempClosest.resize(querySeqs.size()); - //get pairs - for (int k = lines[i]->start; k < lines[i]->end; k++) { - vector tempVector; - - for (int j = 0; j < numWanted; j++) { - - Sequence* temp = new Sequence(in); - gobble(in); - - tempVector.push_back(temp); - } - - tempClosest[k] = tempVector; - } - - string junk; - in >> junk; gobble(in); // to get ">" - - vector< vector > dists; dists.resize(querySeqs.size()); - - for (int i = lines[process]->start; i < lines[process]->end; i++) { - dists[i].resize(closest[i].size()); - for (int j = 0; j < closest[i].size(); j++) { - in >> dists[i][j]; - } - gobble(in); - } - - for (int i = lines[process]->start; i < lines[process]->end; i++) { - for (int j = 0; j < closest[i].size(); j++) { - closest[i][j].seq = tempClosest[i][j]; - closest[i][j].dist = dists[i][j]; - } - } - - in.close(); - remove(s.c_str()); - } - - -#else - closest = findClosest(lines[0]->start, lines[0]->end, numWanted); -#endif - - } - catch(exception& e) { - errorOut(e, "Ccode", "createProcessesClosest"); - exit(1); - } -} -//***************************************************************************************************************