From 5a1e62397b91f57d0d3aff635891df04b8999a88 Mon Sep 17 00:00:00 2001 From: westcott Date: Fri, 12 Feb 2010 19:50:54 +0000 Subject: [PATCH] working on chimeras --- Mothur.xcodeproj/project.pbxproj | 6 + aligncommand.cpp | 10 +- bellerophon.cpp | 2 +- blastdb.cpp | 53 +- blastdb.hpp | 1 + ccode.cpp | 1204 +++++++----------------------- ccode.h | 81 +- chimera.cpp | 53 +- chimera.h | 71 +- chimeracheckrdp.cpp | 256 ++----- chimeracheckrdp.h | 37 +- chimerarealigner.cpp | 108 +++ chimerarealigner.h | 37 + chimeraseqscommand.cpp | 271 ++++++- chimeraseqscommand.h | 14 +- chimeraslayer.cpp | 299 +++----- chimeraslayer.h | 27 +- commandfactory.cpp | 1 + database.hpp | 2 + decalc.cpp | 35 +- decalc.h | 2 +- engine.cpp | 10 +- globaldata.cpp | 1 + maligner.cpp | 163 +++- maligner.h | 35 +- mergefilecommand.cpp | 6 +- mothur.cpp | 6 +- mothur.h | 15 + nast.cpp | 3 +- otuhierarchycommand.cpp | 17 +- otuhierarchycommand.h | 2 +- pintail.cpp | 867 ++++----------------- pintail.h | 41 +- readtree.cpp | 4 +- readtreecommand.cpp | 3 +- slayer.cpp | 143 +++- slayer.h | 7 +- tree.cpp | 26 +- tree.h | 2 +- treemap.cpp | 4 +- treenode.cpp | 10 +- treenode.h | 6 +- unweighted.cpp | 122 +-- weighted.cpp | 47 +- 44 files changed, 1619 insertions(+), 2491 deletions(-) create mode 100644 chimerarealigner.cpp create mode 100644 chimerarealigner.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 21dbf8d..51a592b 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -188,6 +188,7 @@ A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; }; A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; }; A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; }; + A7E8A22F1125939F0011D39C /* chimerarealigner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E8A22E1125939F0011D39C /* chimerarealigner.cpp */; }; A7F5759710CEBC0600E20E47 /* libreadline.a in Frameworks */ = {isa = PBXBuildFile; fileRef = A7F5759610CEBC0600E20E47 /* libreadline.a */; }; EB1216880F619B83004A865F /* bergerparker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216870F619B83004A865F /* bergerparker.cpp */; }; EB1216E50F61ACFB004A865F /* bstick.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216E40F61ACFB004A865F /* bstick.cpp */; }; @@ -593,6 +594,8 @@ A7E4A782106913F900688F62 /* getsharedotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsharedotucommand.cpp; sourceTree = SOURCE_ROOT; }; A7E4A822106A9AD700688F62 /* maligner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = maligner.h; sourceTree = SOURCE_ROOT; }; A7E4A823106A9AD700688F62 /* maligner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = maligner.cpp; sourceTree = SOURCE_ROOT; }; + A7E8A22D1125939F0011D39C /* chimerarealigner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerarealigner.h; sourceTree = SOURCE_ROOT; }; + A7E8A22E1125939F0011D39C /* chimerarealigner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerarealigner.cpp; sourceTree = SOURCE_ROOT; }; A7F5759610CEBC0600E20E47 /* libreadline.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libreadline.a; path = "../readline-6.0/libreadline.a"; sourceTree = SOURCE_ROOT; }; C6859E8B029090EE04C91782 /* Mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = Mothur.1; sourceTree = ""; }; EB1216860F619B83004A865F /* bergerparker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bergerparker.h; sourceTree = SOURCE_ROOT; }; @@ -733,6 +736,8 @@ A75B887B104C16860083C454 /* ccode.cpp */, A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */, A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */, + A7E8A22D1125939F0011D39C /* chimerarealigner.h */, + A7E8A22E1125939F0011D39C /* chimerarealigner.cpp */, A7B04491106CC3E60046FC83 /* chimeraslayer.h */, A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */, 372A55531017922B00C5194B /* decalc.h */, @@ -1337,6 +1342,7 @@ A704E8A31106045D00870157 /* otuhierarchycommand.cpp in Sources */, A70B00C8110885EB002F453A /* setdircommand.cpp in Sources */, A794201111107897003AECCD /* distancedb.cpp in Sources */, + A7E8A22F1125939F0011D39C /* chimerarealigner.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp index 7782d1e..708a854 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -218,6 +218,7 @@ int AlignCommand::execute(){ string alignFileName = outputDir + getRootName(getSimpleName(candidateFileNames[s])) + "align"; string reportFileName = outputDir + getRootName(getSimpleName(candidateFileNames[s])) + "align.report"; string accnosFileName = outputDir + getRootName(getSimpleName(candidateFileNames[s])) + "flip.accnos"; + bool hasAccnos = true; int numFastaSeqs = 0; for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); @@ -307,7 +308,7 @@ int AlignCommand::execute(){ mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse compliment as well."); }else{ mothurOut(" If the reverse compliment proved to be better it was reported."); } mothurOutEndLine(); - } + }else{ hasAccnos = false; } } #else ifstream inFASTA; @@ -320,7 +321,7 @@ int AlignCommand::execute(){ driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); //delete accnos file if its blank else report to user - if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); } + if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; } else { mothurOut("Some of you sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + "."); if (!flip) { @@ -331,7 +332,10 @@ int AlignCommand::execute(){ #endif - + mothurOut("Output File Names: " + alignFileName + ", " + reportFileName); + if (hasAccnos) { mothurOut(", " + accnosFileName + "."); } + else { mothurOut("."); } + mothurOutEndLine(); mothurOut("It took " + toString(time(NULL) - start) + " secs to align " + toString(numFastaSeqs) + " sequences."); mothurOutEndLine(); diff --git a/bellerophon.cpp b/bellerophon.cpp index 7c64d47..afb88d2 100644 --- a/bellerophon.cpp +++ b/bellerophon.cpp @@ -230,7 +230,7 @@ int Bellerophon::getChimeras() { sort(pref.begin(), pref.end(), comparePref); return 0; - + } catch(exception& e) { errorOut(e, "Bellerophon", "getChimeras"); diff --git a/blastdb.cpp b/blastdb.cpp index 6b45a48..50a0401 100644 --- a/blastdb.cpp +++ b/blastdb.cpp @@ -81,6 +81,55 @@ vector BlastDB::findClosestSequences(Sequence* seq, int n) { } /**************************************************************************************************/ +//assumes you have added all the template sequences using the addSequence function and run generateDB. +map BlastDB::findClosest(Sequence* seq, int n) { + try{ + map topMatches; + + ofstream queryFile; + openOutputFile(queryFileName, queryFile); + queryFile << '>' << seq->getName() << endl; + queryFile << seq->getUnaligned() << endl; + queryFile.close(); + + // the goal here is to quickly survey the database to find the closest match. To do this we are using the default + // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too + // long. With this setting, it seems comparable in speed to the suffix tree approach. + + string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -b " + toString(n) + " -v " + toString(n); + blastCommand += (" -i " + queryFileName + " -o " + blastFileName); + system(blastCommand.c_str()); + + ifstream m8FileHandle; + openInputFile(blastFileName, m8FileHandle); + + string dummy; + int templateAccession; + gobble(m8FileHandle); +//string name = seq->getName(); +//ofstream out; +//openOutputFileAppend(name, out); + while(!m8FileHandle.eof()){ + m8FileHandle >> dummy >> templateAccession >> searchScore; +//out << dummy << '\t' << templateAccession << '\t' << searchScore << endl; + //get rest of junk in line + while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); + //out << c; + if (c == 10 || c == 13){ break; } } + + gobble(m8FileHandle); + topMatches[templateAccession] = searchScore; + } + m8FileHandle.close(); +//out.close(); + return topMatches; + } + catch(exception& e) { + errorOut(e, "BlastDB", "findClosest"); + exit(1); + } +} +/**************************************************************************************************/ void BlastDB::addSequence(Sequence seq) { try { @@ -103,7 +152,7 @@ void BlastDB::addSequence(Sequence seq) { void BlastDB::generateDB() { try { - mothurOut("Generating the temporary BLAST database...\t"); cout.flush(); + //mothurOut("Generating the temporary BLAST database...\t"); cout.flush(); path = globaldata->argv; path = path.substr(0, (path.find_last_of('m'))); @@ -111,7 +160,7 @@ void BlastDB::generateDB() { string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F // option tells formatdb that seqs are DNA, not prot - mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush(); + //mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush(); } catch(exception& e) { errorOut(e, "BlastDB", "generateDB"); diff --git a/blastdb.hpp b/blastdb.hpp index c718bb4..fea55c8 100644 --- a/blastdb.hpp +++ b/blastdb.hpp @@ -23,6 +23,7 @@ public: void generateDB(); void addSequence(Sequence); vector findClosestSequences(Sequence*, int); + map findClosest(Sequence*, int); //template index -> searchscore private: string dbFileName; diff --git a/ccode.cpp b/ccode.cpp index c474d7a..bfcd44d 100644 --- a/ccode.cpp +++ b/ccode.cpp @@ -13,292 +13,216 @@ //*************************************************************************************************************** -Ccode::Ccode(string filename, string temp, string o) { fastafile = filename; templateFile = temp; outputDir = o; } +Ccode::Ccode(string filename, string o) { + fastafile = filename; outputDir = o; + distCalc = new eachGapDist(); + decalc = new DeCalculator(); + + mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo"; + ofstream out2; + openOutputFile(mapInfo, out2); + + out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl; + out2.close(); +} //*************************************************************************************************************** - Ccode::~Ccode() { - try { - for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } - for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } - } - catch(exception& e) { - errorOut(e, "Ccode", "~Ccode"); - exit(1); - } + delete distCalc; + delete decalc; } //*************************************************************************************************************** +void Ccode::printHeader(ostream& out) { + out << "For full window mapping info refer to " << mapInfo << endl << endl; +} +//*************************************************************************************************************** void Ccode::print(ostream& out) { try { mothurOutEndLine(); - string mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo"; ofstream out2; - openOutputFile(mapInfo, out2); - - out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl; + openOutputFileAppend(mapInfo, out2); - for (int j = 0; j < querySeqs.size(); j++) { - out2 << querySeqs[j]->getName() << endl; - for (it = spotMap[j].begin(); it!= spotMap[j].end(); it++) { - out2 << it->first << '\t' << it->second << endl; - } + out2 << querySeq->getName() << endl; + for (it = spotMap.begin(); it!= spotMap.end(); it++) { + out2 << it->first << '\t' << it->second << endl; } out2.close(); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); - out << "For full window mapping info refer to " << mapInfo << endl << endl; - - for (int i = 0; i < querySeqs.size(); i++) { - - out << querySeqs[i]->getName() << endl << endl << "Reference sequences used and distance to query:" << endl; + out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl; - for (int j = 0; j < closest[i].size(); j++) { - out << setprecision(3) << closest[i][j].seq->getName() << '\t' << closest[i][j].dist << endl; - } - out << endl << endl; + for (int j = 0; j < closest.size(); j++) { + out << setprecision(3) << closest[j].seq->getName() << '\t' << closest[j].dist << endl; + } + out << endl << endl; - //for each window - //window mapping info. - out << "Mapping information: "; - //you mask and did not filter - if ((seqMask != "") && (!filter)) { out << "mask and trim."; } + //for each window + //window mapping info. + out << "Mapping information: "; + //you mask and did not filter + if ((seqMask != "") && (!filter)) { out << "mask and trim."; } - //you filtered and did not mask - if ((seqMask == "") && (filter)) { out << "filter and trim."; } + //you filtered and did not mask + if ((seqMask == "") && (filter)) { out << "filter and trim."; } - //you masked and filtered - if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; } + //you masked and filtered + if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; } - out << endl << "Window\tStartPos\tEndPos" << endl; - it = trim[i].begin(); + out << endl << "Window\tStartPos\tEndPos" << endl; + it = trim.begin(); - for (int k = 0; k < windows[i].size()-1; k++) { - out << k+1 << '\t' << spotMap[i][windows[i][k]-it->first] << '\t' << spotMap[i][windows[i][k]-it->first+windowSizes[i]] << endl; - } + for (int k = 0; k < windows.size()-1; k++) { + out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl; + } - out << windows[i].size() << '\t' << spotMap[i][windows[i][windows[i].size()-1]-it->first] << '\t' << spotMap[i][it->second-it->first-1] << endl; - out << endl; + out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl; + out << endl; - out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl; - for (int k = 0; k < windows[i].size(); k++) { - float ds = averageQuery[i][k] / averageRef[i][k]; - out << k+1 << '\t' << averageQuery[i][k] << '\t' << sdQuery[i][k] << '\t' << averageRef[i][k] << '\t'<< sdRef[i][k] << '\t' << ds << '\t' << anova[i][k] << endl; - } - out << endl; + out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl; + for (int k = 0; k < windows.size(); k++) { + float ds = averageQuery[k] / averageRef[k]; + out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl; + } + out << endl; - //varRef - //varQuery - /* F test for differences among variances. - * varQuery is expected to be higher or similar than varRef */ - //float fs = varQuery[query][i] / varRef[query][i]; /* F-Snedecor, test for differences of variances */ + //varRef + //varQuery + /* F test for differences among variances. + * varQuery is expected to be higher or similar than varRef */ + //float fs = varQuery[query] / varRef[query]; /* F-Snedecor, test for differences of variances */ - bool results = false; + bool results = false; - //confidence limit, t - Student, anova - out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl; + //confidence limit, t - Student, anova + out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl; - for (int k = 0; k < windows[i].size(); k++) { - string temp = ""; - if (isChimericConfidence[i][k]) { temp += "*\t"; } - else { temp += "\t"; } + for (int k = 0; k < windows.size(); k++) { + string temp = ""; + if (isChimericConfidence[k]) { temp += "*\t"; } + else { temp += "\t"; } - if (isChimericTStudent[i][k]) { temp += "*\t"; } - else { temp += "\t"; } + if (isChimericTStudent[k]) { temp += "*\t"; } + else { temp += "\t"; } - if (isChimericANOVA[i][k]) { temp += "*\t"; } - else { temp += "\t"; } + if (isChimericANOVA[k]) { temp += "*\t"; } + else { temp += "\t"; } - out << k+1 << '\t' << temp << endl; + out << k+1 << '\t' << temp << endl; - if (temp == "*\t*\t*\t") { results = true; } - } - out << endl; + if (temp == "*\t*\t*\t") { results = true; } + } + out << endl; - if (results) { - mothurOut(querySeqs[i]->getName() + " was found have at least one chimeric window."); mothurOutEndLine(); - } + if (results) { + mothurOut(querySeq->getName() + " was found have at least one chimeric window."); mothurOutEndLine(); } + } catch(exception& e) { errorOut(e, "Ccode", "print"); exit(1); } } - //*************************************************************************************************************** -int Ccode::getChimeras() { +int Ccode::getChimeras(Sequence* query) { try { - - //read in query sequences and subject sequences - mothurOut("Reading sequences and template file... "); cout.flush(); - querySeqs = readSeqs(fastafile); - templateSeqs = readSeqs(templateFile); - mothurOut("Done."); mothurOutEndLine(); - - int numSeqs = querySeqs.size(); - - if (unaligned) { mothurOut("Your sequences need to be aligned when you use the bellerophon ccode."); mothurOutEndLine(); return 1; } - - closest.resize(numSeqs); - - refCombo.resize(numSeqs, 0); - sumRef.resize(numSeqs); - varRef.resize(numSeqs); - varQuery.resize(numSeqs); - sdRef.resize(numSeqs); - sdQuery.resize(numSeqs); - sumQuery.resize(numSeqs); - sumSquaredRef.resize(numSeqs); - sumSquaredQuery.resize(numSeqs); - averageRef.resize(numSeqs); - averageQuery.resize(numSeqs); - anova.resize(numSeqs); - isChimericConfidence.resize(numSeqs); - isChimericTStudent.resize(numSeqs); - isChimericANOVA.resize(numSeqs); - trim.resize(numSeqs); - spotMap.resize(numSeqs); - windowSizes.resize(numSeqs, window); - windows.resize(numSeqs); - - //break up file if needed - int linesPerProcess = numSeqs / processors ; - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - //find breakup of sequences for all times we will Parallelize - if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } - else { - //fill line pairs - for (int i = 0; i < (processors-1); i++) { - lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); - } - //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end - int i = processors - 1; - lines.push_back(new linePair((i*linesPerProcess), numSeqs)); - } - - #else - lines.push_back(new linePair(0, numSeqs)); - #endif - distCalc = new eachGapDist(); - decalc = new DeCalculator(); - - //find closest - if (processors == 1) { - mothurOut("Finding top matches for sequences... "); cout.flush(); - closest = findClosest(lines[0]->start, lines[0]->end, numWanted); - mothurOut("Done."); mothurOutEndLine(); - }else { createProcessesClosest(); } + closest.clear(); + refCombo = 0; + sumRef.clear(); + varRef.clear(); + varQuery.clear(); + sdRef.clear(); + sdQuery.clear(); + sumQuery.clear(); + sumSquaredRef.clear(); + sumSquaredQuery.clear(); + averageRef.clear(); + averageQuery.clear(); + anova.clear(); + isChimericConfidence.clear(); + isChimericTStudent.clear(); + isChimericANOVA.clear(); + trim.clear(); + spotMap.clear(); + windowSizes = window; + windows.clear(); - //initialize spotMap - for (int j = 0; j < numSeqs; j++) { - for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) { - spotMap[j][i] = i; - } - } + + querySeq = query; + + //find closest matches to query + closest = findClosest(query, numWanted); + //initialize spotMap + for (int i = 0; i < query->getAligned().length(); i++) { spotMap[i] = i; } + //mask sequences if the user wants to if (seqMask != "") { decalc->setMask(seqMask); - //mask querys - for (int i = 0; i < querySeqs.size(); i++) { - decalc->runMask(querySeqs[i]); - } - - //mask templates - for (int i = 0; i < templateSeqs.size(); i++) { - decalc->runMask(templateSeqs[i]); - } + decalc->runMask(query); - for (int i = 0; i < numSeqs; i++) { - spotMap[i] = decalc->getMaskMap(); - } + //mask closest + for (int i = 0; i < closest.size(); i++) { decalc->runMask(closest[i].seq); } + + spotMap = decalc->getMaskMap(); } if (filter) { - vector temp = templateSeqs; - for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); } + vector temp; + for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq); } + temp.push_back(query); createFilter(temp); - runFilter(querySeqs); - runFilter(templateSeqs); + for (int i = 0; i < temp.size(); i++) { runFilter(temp[i]); } //update spotMap map newMap; int spot = 0; - int j = 0; for (int i = 0; i < filterString.length(); i++) { if (filterString[i] == '1') { //add to newMap - newMap[spot] = spotMap[j][i]; + newMap[spot] = spotMap[i]; spot++; } } - - for (int i = 0; i < numSeqs; i++) { - spotMap[i] = newMap; - } + spotMap = newMap; } //trim sequences - this follows ccodes remove_extra_gaps - for (int i = 0; i < querySeqs.size(); i++) { - trimSequences(i); - } + trimSequences(query); + //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length(). //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations - for (int i = 0; i < querySeqs.size(); i++) { - windows[i] = findWindows(i); - } + windows = findWindows(); + //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later - if (processors == 1) { - for (int i = 0; i < closest.size(); i++) { - removeBadReferenceSeqs(closest[i], i); - } - }else { createProcessesRemoveBad(); } - + removeBadReferenceSeqs(closest); - if (processors == 1) { - //find the averages for each querys references - for (int i = 0; i < numSeqs; i++) { - getAverageRef(closest[i], i); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i]. - } - - //find the averages for the query - for (int i = 0; i < numSeqs; i++) { - getAverageQuery(closest[i], i); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i]. - } - }else { createProcessesAverages(); } - if (processors == 1) { - //find the averages for each querys references - for (int i = 0; i < numSeqs; i++) { - findVarianceRef(i); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0. - } - - //find the averages for the query - for (int i = 0; i < numSeqs; i++) { - findVarianceQuery(i); //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0. - } - }else { createProcessesVariances(); } + //find the averages for each querys references + getAverageRef(closest); //fills sumRef, averageRef, sumSquaredRef and refCombo. + getAverageQuery(closest, query); //fills sumQuery, averageQuery, sumSquaredQuery. + - if (processors == 1) { - for (int i = 0; i < numSeqs; i++) { - determineChimeras(i); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. - } - }else { createProcessesDetermine(); } + //find the averages for each querys references + findVarianceRef(); //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0. + + + //find the averages for the query + findVarianceQuery(); //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0. + + determineChimeras(); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. //free memory - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } - delete distCalc; - delete decalc; + for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; } return 0; } @@ -309,17 +233,17 @@ int Ccode::getChimeras() { } /***************************************************************************************************************/ //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences." -void Ccode::trimSequences(int query) { +void Ccode::trimSequences(Sequence* query) { try { int frontPos = 0; //should contain first position in all seqs that is not a gap character - int rearPos = querySeqs[query]->getAligned().length(); + int rearPos = query->getAligned().length(); //********find first position in closest seqs that is a non gap character***********// //find first position all query seqs that is a non gap character - for (int i = 0; i < closest[query].size(); i++) { + for (int i = 0; i < closest.size(); i++) { - string aligned = closest[query][i].seq->getAligned(); + string aligned = closest[i].seq->getAligned(); int pos = 0; //find first spot in this seq @@ -335,7 +259,7 @@ void Ccode::trimSequences(int query) { } //find first position all querySeq[query] that is a non gap character - string aligned = querySeqs[query]->getAligned(); + string aligned = query->getAligned(); int pos = 0; //find first spot in this seq @@ -351,9 +275,9 @@ void Ccode::trimSequences(int query) { //********find last position in closest seqs that is a non gap character***********// - for (int i = 0; i < closest[query].size(); i++) { + for (int i = 0; i < closest.size(); i++) { - string aligned = closest[query][i].seq->getAligned(); + string aligned = closest[i].seq->getAligned(); int pos = aligned.length(); //find first spot in this seq @@ -369,7 +293,7 @@ void Ccode::trimSequences(int query) { } //find last position all querySeqs[query] that is a non gap character - aligned = querySeqs[query]->getAligned(); + aligned = query->getAligned(); pos = aligned.length(); //find first spot in this seq @@ -391,7 +315,7 @@ void Ccode::trimSequences(int query) { tempTrim[frontPos] = rearPos; //save trimmed locations - trim[query] = tempTrim; + trim = tempTrim; //update spotMask map newMap; @@ -399,56 +323,45 @@ void Ccode::trimSequences(int query) { for (int i = frontPos; i < rearPos; i++) { //add to newMap -//cout << query << '\t' << i << '\t' << spotMap[query][i] << endl; - newMap[spot] = spotMap[query][i]; + newMap[spot] = spotMap[i]; spot++; } - - //for (it = newMap.begin(); it!=newMap.end(); it++) { - //cout << query << '\t' << it->first << '\t' << it->second << endl; - //} - - spotMap[query] = newMap; - - + spotMap = newMap; } catch(exception& e) { errorOut(e, "Ccode", "trimSequences"); exit(1); } - } /***************************************************************************************************************/ -vector Ccode::findWindows(int query) { +vector Ccode::findWindows() { try { vector win; - it = trim[query].begin(); + it = trim.begin(); int length = it->second - it->first; //default is wanted = 10% of total length - if (windowSizes[query] > length) { + if (windowSizes > length) { mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length."); - windowSizes[query] = length / 10; - }else if (windowSizes[query] == 0) { windowSizes[query] = length / 10; } - else if (windowSizes[query] > (length * 0.20)) { + windowSizes = length / 10; + }else if (windowSizes == 0) { windowSizes = length / 10; } + else if (windowSizes > (length * 0.20)) { mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine(); - }else if (windowSizes[query] < (length * 0.05)) { + }else if (windowSizes < (length * 0.05)) { mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine(); } //save starting points of each window - for (int m = it->first; m < (it->second-windowSizes[query]); m+=windowSizes[query]) { win.push_back(m); } + for (int m = it->first; m < (it->second-windowSizes); m+=windowSizes) { win.push_back(m); } //save last window if (win[win.size()-1] < (it->first+length)) { - win.push_back(win[win.size()-1]+windowSizes[query]); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75 + win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75 } //with this you would get 1,25,50,75,100 - return win; - } catch(exception& e) { errorOut(e, "Ccode", "findWindows"); @@ -512,7 +425,7 @@ int Ccode::getDiff(string seqA, string seqB) { } //*************************************************************************************************************** //tried to make this look most like ccode original implementation -void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { +void Ccode::removeBadReferenceSeqs(vector& seqs) { try { vector< vector > numDiffBases; @@ -520,7 +433,7 @@ void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { //initialize to 0 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); } - it = trim[query].begin(); + it = trim.begin(); int length = it->second - it->first; //calc differences from each sequence to everyother seq in the set @@ -576,9 +489,8 @@ void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { seqs = goodSeqs; }else { //warn, but dont remove any - mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); mothurOutEndLine(); + mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); mothurOutEndLine(); } - } catch(exception& e) { @@ -587,46 +499,33 @@ void Ccode::removeBadReferenceSeqs(vector& seqs, int query) { } } //*************************************************************************************************************** -vector< vector > Ccode::findClosest(int start, int end, int numWanted) { +//makes copy of templateseq for filter +vector Ccode::findClosest(Sequence* q, int numWanted) { try{ - vector< vector > topMatches; topMatches.resize(querySeqs.size()); - - float smallestOverall, smallestLeft, smallestRight; - smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000; + vector topMatches; - //for each sequence in querySeqs - find top matches to use as reference - for(int j = start; j < end; j++){ - - Sequence query = *(querySeqs[j]); + Sequence query = *(q); - vector distances; + //calc distance to each sequence in template seqs + for (int i = 0; i < templateSeqs.size(); i++) { - //calc distance to each sequence in template seqs - for (int i = 0; i < templateSeqs.size(); i++) { - - Sequence ref = *(templateSeqs[i]); + Sequence ref = *(templateSeqs[i]); - //find overall dist - distCalc->calcDist(query, ref); - float dist = distCalc->getDist(); + //find overall dist + distCalc->calcDist(query, ref); + float dist = distCalc->getDist(); - //save distance - SeqDist temp; - temp.seq = templateSeqs[i]; - temp.dist = dist; + //save distance + SeqDist temp; + temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned()); + temp.dist = dist; - distances.push_back(temp); - } - - sort(distances.begin(), distances.end(), compareSeqDist); - - //save the number of top matches wanted - for (int h = 0; h < numWanted; h++) { - topMatches[j].push_back(distances[h]); - } + topMatches.push_back(temp); } + sort(topMatches.begin(), topMatches.end(), compareSeqDist); + return topMatches; } @@ -637,21 +536,21 @@ vector< vector > Ccode::findClosest(int start, int end, int numWanted) } /**************************************************************************************************/ //find the distances from each reference sequence to every other reference sequence for each window for this query -void Ccode::getAverageRef(vector ref, int query) { +void Ccode::getAverageRef(vector ref) { try { - vector< vector< vector > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2. + vector< vector< vector > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2. //initialize diffs vector diffs.resize(ref.size()); for (int i = 0; i < diffs.size(); i++) { diffs[i].resize(ref.size()); for (int j = 0; j < diffs[i].size(); j++) { - diffs[i][j].resize(windows[query].size(), 0); + diffs[i][j].resize(windows.size(), 0); } } - it = trim[query].begin(); + it = trim.begin(); //find the distances from each reference sequence to every other reference sequence for each window for this query for (int i = 0; i < ref.size(); i++) { @@ -663,23 +562,23 @@ void Ccode::getAverageRef(vector ref, int query) { string refJ = ref[j].seq->getAligned(); - for (int k = 0; k < windows[query].size(); k++) { + for (int k = 0; k < windows.size(); k++) { string refIWindowk, refJWindowk; - if (k < windows[query].size()-1) { + if (k < windows.size()-1) { //get window strings - refIWindowk = refI.substr(windows[query][k], windowSizes[query]); - refJWindowk = refJ.substr(windows[query][k], windowSizes[query]); + refIWindowk = refI.substr(windows[k], windowSizes); + refJWindowk = refJ.substr(windows[k], windowSizes); }else { //last window may be smaller than rest - see findwindows //get window strings - refIWindowk = refI.substr(windows[query][k], (it->second-windows[query][k])); - refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k])); + refIWindowk = refI.substr(windows[k], (it->second-windows[k])); + refJWindowk = refJ.substr(windows[k], (it->second-windows[k])); } //find differences int diff = getDiff(refIWindowk, refJWindowk); -//cout << i << '\t' << j << '\t' << k << '\t' << diff << endl; + //save differences in [i][j][k] and [j][i][k] since they are the same diffs[i][j][k] = diff; diffs[j][i][k] = diff; @@ -691,20 +590,20 @@ void Ccode::getAverageRef(vector ref, int query) { }//i //initialize sumRef for this query - sumRef[query].resize(windows[query].size(), 0); - sumSquaredRef[query].resize(windows[query].size(), 0); - averageRef[query].resize(windows[query].size(), 0); + sumRef.resize(windows.size(), 0); + sumSquaredRef.resize(windows.size(), 0); + averageRef.resize(windows.size(), 0); //find the sum of the differences for hte reference sequences for (int i = 0; i < diffs.size(); i++) { for (int j = 0; j < i; j++) { //increment this querys reference sequences combos - refCombo[query]++; + refCombo++; for (int k = 0; k < diffs[i][j].size(); k++) { - sumRef[query][k] += diffs[i][j][k]; - sumSquaredRef[query][k] += (diffs[i][j][k]*diffs[i][j][k]); + sumRef[k] += diffs[i][j][k]; + sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]); }//k }//j @@ -712,8 +611,8 @@ void Ccode::getAverageRef(vector ref, int query) { //find the average of the differences for the references for each window - for (int i = 0; i < windows[query].size(); i++) { - averageRef[query][i] = sumRef[query][i] / (float) refCombo[query]; + for (int i = 0; i < windows.size(); i++) { + averageRef[i] = sumRef[i] / (float) refCombo; } } @@ -723,7 +622,7 @@ void Ccode::getAverageRef(vector ref, int query) { } } /**************************************************************************************************/ -void Ccode::getAverageQuery (vector ref, int query) { +void Ccode::getAverageQuery (vector ref, Sequence* query) { try { vector< vector > diffs; //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2. @@ -731,35 +630,35 @@ void Ccode::getAverageQuery (vector ref, int query) { //initialize diffs vector diffs.resize(ref.size()); for (int j = 0; j < diffs.size(); j++) { - diffs[j].resize(windows[query].size(), 0); + diffs[j].resize(windows.size(), 0); } - it = trim[query].begin(); + it = trim.begin(); - string refQuery = querySeqs[query]->getAligned(); + string refQuery = query->getAligned(); //jgetAligned(); - for (int k = 0; k < windows[query].size(); k++) { + for (int k = 0; k < windows.size(); k++) { string QueryWindowk, refJWindowk; - if (k < windows[query].size()-1) { + if (k < windows.size()-1) { //get window strings - QueryWindowk = refQuery.substr(windows[query][k], windowSizes[query]); - refJWindowk = refJ.substr(windows[query][k], windowSizes[query]); + QueryWindowk = refQuery.substr(windows[k], windowSizes); + refJWindowk = refJ.substr(windows[k], windowSizes); }else { //last window may be smaller than rest - see findwindows //get window strings - QueryWindowk = refQuery.substr(windows[query][k], (it->second-windows[query][k])); - refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k])); + QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k])); + refJWindowk = refJ.substr(windows[k], (it->second-windows[k])); } //find differences int diff = getDiff(QueryWindowk, refJWindowk); -//cout << j << '\t' << k << '\t' << diff << endl; + //save differences diffs[j][k] = diff; @@ -768,25 +667,23 @@ void Ccode::getAverageQuery (vector ref, int query) { //initialize sumRef for this query - sumQuery[query].resize(windows[query].size(), 0); - sumSquaredQuery[query].resize(windows[query].size(), 0); - averageQuery[query].resize(windows[query].size(), 0); + sumQuery.resize(windows.size(), 0); + sumSquaredQuery.resize(windows.size(), 0); + averageQuery.resize(windows.size(), 0); //find the sum of the differences for (int j = 0; j < diffs.size(); j++) { for (int k = 0; k < diffs[j].size(); k++) { - sumQuery[query][k] += diffs[j][k]; - sumSquaredQuery[query][k] += (diffs[j][k]*diffs[j][k]); + sumQuery[k] += diffs[j][k]; + sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]); }//k }//j //find the average of the differences for the references for each window - for (int i = 0; i < windows[query].size(); i++) { - averageQuery[query][i] = sumQuery[query][i] / (float) ref.size(); + for (int i = 0; i < windows.size(); i++) { + averageQuery[i] = sumQuery[i] / (float) ref.size(); } - - } catch(exception& e) { errorOut(e, "Ccode", "getAverageQuery"); @@ -794,23 +691,23 @@ void Ccode::getAverageQuery (vector ref, int query) { } } /**************************************************************************************************/ -void Ccode::findVarianceRef (int query) { +void Ccode::findVarianceRef() { try { - varRef[query].resize(windows[query].size(), 0); - sdRef[query].resize(windows[query].size(), 0); + varRef.resize(windows.size(), 0); + sdRef.resize(windows.size(), 0); //for each window - for (int i = 0; i < windows[query].size(); i++) { - varRef[query][i] = (sumSquaredRef[query][i] - ((sumRef[query][i]*sumRef[query][i])/(float)refCombo[query])) / (float)(refCombo[query]-1); - sdRef[query][i] = sqrt(varRef[query][i]); + for (int i = 0; i < windows.size(); i++) { + varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1); + sdRef[i] = sqrt(varRef[i]); //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation - if (averageRef[query][i] < 0.001) { averageRef[query][i] = 0.001; } - if (sumRef[query][i] < 0.001) { sumRef[query][i] = 0.001; } - if (varRef[query][i] < 0.001) { varRef[query][i] = 0.001; } - if (sumSquaredRef[query][i] < 0.001) { sumSquaredRef[query][i] = 0.001; } - if (sdRef[query][i] < 0.001) { sdRef[query][i] = 0.001; } + if (averageRef[i] < 0.001) { averageRef[i] = 0.001; } + if (sumRef[i] < 0.001) { sumRef[i] = 0.001; } + if (varRef[i] < 0.001) { varRef[i] = 0.001; } + if (sumSquaredRef[i] < 0.001) { sumSquaredRef[i] = 0.001; } + if (sdRef[i] < 0.001) { sdRef[i] = 0.001; } } } @@ -820,22 +717,22 @@ void Ccode::findVarianceRef (int query) { } } /**************************************************************************************************/ -void Ccode::findVarianceQuery (int query) { +void Ccode::findVarianceQuery() { try { - varQuery[query].resize(windows[query].size(), 0); - sdQuery[query].resize(windows[query].size(), 0); + varQuery.resize(windows.size(), 0); + sdQuery.resize(windows.size(), 0); //for each window - for (int i = 0; i < windows[query].size(); i++) { - varQuery[query][i] = (sumSquaredQuery[query][i] - ((sumQuery[query][i]*sumQuery[query][i])/(float) closest[query].size())) / (float) (closest[query].size()-1); - sdQuery[query][i] = sqrt(varQuery[query][i]); + for (int i = 0; i < windows.size(); i++) { + varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1); + sdQuery[i] = sqrt(varQuery[i]); //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation - if (averageQuery[query][i] < 0.001) { averageQuery[query][i] = 0.001; } - if (sumQuery[query][i] < 0.001) { sumQuery[query][i] = 0.001; } - if (varQuery[query][i] < 0.001) { varQuery[query][i] = 0.001; } - if (sumSquaredQuery[query][i] < 0.001) { sumSquaredQuery[query][i] = 0.001; } - if (sdQuery[query][i] < 0.001) { sdQuery[query][i] = 0.001; } + if (averageQuery[i] < 0.001) { averageQuery[i] = 0.001; } + if (sumQuery[i] < 0.001) { sumQuery[i] = 0.001; } + if (varQuery[i] < 0.001) { varQuery[i] = 0.001; } + if (sumSquaredQuery[i] < 0.001) { sumSquaredQuery[i] = 0.001; } + if (sdQuery[i] < 0.001) { sdQuery[i] = 0.001; } } } @@ -845,44 +742,44 @@ void Ccode::findVarianceQuery (int query) { } } /**************************************************************************************************/ -void Ccode::determineChimeras (int query) { +void Ccode::determineChimeras() { try { - isChimericConfidence[query].resize(windows[query].size(), false); - isChimericTStudent[query].resize(windows[query].size(), false); - isChimericANOVA[query].resize(windows[query].size(), false); - anova[query].resize(windows[query].size()); + isChimericConfidence.resize(windows.size(), false); + isChimericTStudent.resize(windows.size(), false); + isChimericANOVA.resize(windows.size(), false); + anova.resize(windows.size()); //for each window - for (int i = 0; i < windows[query].size(); i++) { + for (int i = 0; i < windows.size(); i++) { //get confidence limits - float t = getT(closest[query].size()-1); //how many seqs you are comparing to this querySeq - float dsUpper = (averageQuery[query][i] + (t * sdQuery[query][i])) / averageRef[query][i]; - float dsLower = (averageQuery[query][i] - (t * sdQuery[query][i])) / averageRef[query][i]; -//cout << t << '\t' << "ds upper = " << dsUpper << " dsLower = " << dsLower << endl; - if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[query][i] > averageRef[query][i])) { /* range does not include 1 */ - isChimericConfidence[query][i] = true; /* significantly higher at P<0.05 */ -//cout << i << " made it here" << endl; + float t = getT(closest.size()-1); //how many seqs you are comparing to this querySeq + float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i]; + float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i]; + + if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) { /* range does not include 1 */ + isChimericConfidence[i] = true; /* significantly higher at P<0.05 */ + } //student t test - int degreeOfFreedom = refCombo[query] + closest[query].size() - 2; - float denomForT = (((refCombo[query]-1) * varQuery[query][i] + (closest[query].size() - 1) * varRef[query][i]) / (float) degreeOfFreedom) * ((refCombo[query] + closest[query].size()) / (float) (refCombo[query] * closest[query].size())); /* denominator, without sqrt(), for ts calculations */ + int degreeOfFreedom = refCombo + closest.size() - 2; + float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size())); /* denominator, without sqrt(), for ts calculations */ - float ts = fabs((averageQuery[query][i] - averageRef[query][i]) / (sqrt(denomForT))); /* value of ts for t-student test */ + float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT))); /* value of ts for t-student test */ t = getT(degreeOfFreedom); -//cout << i << '\t' << t << '\t' << ts << endl; - if ((ts >= t) && (averageQuery[query][i] > averageRef[query][i])) { - isChimericTStudent[query][i] = true; /* significantly higher at P<0.05 */ + + if ((ts >= t) && (averageQuery[i] > averageRef[i])) { + isChimericTStudent[i] = true; /* significantly higher at P<0.05 */ } //anova test - float value1 = sumQuery[query][i] + sumRef[query][i]; - float value2 = sumSquaredQuery[query][i] + sumSquaredRef[query][i]; - float value3 = ((sumQuery[query][i]*sumQuery[query][i]) / (float) (closest[query].size())) + ((sumRef[query][i] * sumRef[query][i]) / (float) refCombo[query]); - float value4 = (value1 * value1) / ( (float) (closest[query].size() + refCombo[query]) ); + float value1 = sumQuery[i] + sumRef[i]; + float value2 = sumSquaredQuery[i] + sumSquaredRef[i]; + float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo); + float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) ); float value5 = value2 - value4; float value6 = value3 - value4; float value7 = value5 - value6; @@ -891,13 +788,13 @@ void Ccode::determineChimeras (int query) { float f = getF(degreeOfFreedom); - if ((anovaValue >= f) && (averageQuery[query][i] > averageRef[query][i])) { - isChimericANOVA[query][i] = true; /* significant P<0.05 */ + if ((anovaValue >= f) && (averageQuery[i] > averageRef[i])) { + isChimericANOVA[i] = true; /* significant P<0.05 */ } if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; } - anova[query][i] = anovaValue; + anova[i] = anovaValue; } } @@ -1009,597 +906,6 @@ float Ccode::getF(int numseq) { exit(1); } } - -/**************************************************************************************************/ -void Ccode::createProcessesClosest() { - try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - closest = findClosest(lines[process]->start, lines[process]->end, numWanted); - mothurOut("Done finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - //output pairs - for (int i = lines[process]->start; i < lines[process]->end; i++) { - for (int j = 0; j < closest[i].size(); j++) { - closest[i][j].seq->printSequence(out); - } - } - out << ">" << endl; //to stop sequence read - - for (int i = lines[process]->start; i < lines[process]->end; i++) { - for (int j = 0; j < closest[i].size(); j++) { - out << closest[i][j].dist << '\t'; - } - out << endl; - } - out.close(); - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;i > tempClosest; tempClosest.resize(querySeqs.size()); - //get pairs - for (int k = lines[i]->start; k < lines[i]->end; k++) { - vector tempVector; - - for (int j = 0; j < numWanted; j++) { - - Sequence* temp = new Sequence(in); - gobble(in); - - tempVector.push_back(temp); - } - - tempClosest[k] = tempVector; - } - - string junk; - in >> junk; gobble(in); // to get ">" - - vector< vector > dists; dists.resize(querySeqs.size()); - - for (int k = lines[i]->start; k < lines[i]->end; k++) { - dists[k].resize(numWanted); - for (int j = 0; j < numWanted; j++) { - in >> dists[k][j]; - } - gobble(in); - - } - - for (int k = lines[i]->start; k < lines[i]->end; k++) { - closest[k].resize(numWanted); - for (int j = 0; j < closest[k].size(); j++) { - closest[k][j].seq = tempClosest[k][j]; - closest[k][j].dist = dists[k][j]; - } - } - - in.close(); - remove(s.c_str()); - } - - -#else - closest = findClosest(lines[0]->start, lines[0]->end, numWanted); -#endif - - } - catch(exception& e) { - errorOut(e, "Ccode", "createProcessesClosest"); - exit(1); - } -} - -//*************************************************************************************************************** -void Ccode::createProcessesRemoveBad() { - try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - for (int i = lines[process]->start; i < lines[process]->end; i++) { - removeBadReferenceSeqs(closest[i], i); - } - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - //output pairs - for (int i = lines[process]->start; i < lines[process]->end; i++) { - out << closest[i].size() << endl; - for (int j = 0; j < closest[i].size(); j++) { - closest[i][j].seq->printSequence(out); - } - out << ">" << endl; //to stop sequence read - } - - for (int i = lines[process]->start; i < lines[process]->end; i++) { - for (int j = 0; j < closest[i].size(); j++) { - out << closest[i][j].dist << '\t'; - } - out << endl; - } - - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;i > tempClosest; tempClosest.resize(querySeqs.size()); - vector sizes; - //get pairs - for (int k = lines[i]->start; k < lines[i]->end; k++) { - - int num; - in >> num; - sizes.push_back(num); - gobble(in); - - vector tempVector; - - for (int j = 0; j < num; j++) { - - Sequence* temp = new Sequence(in); - gobble(in); - - tempVector.push_back(temp); - } - string junk; - in >> junk; gobble(in); // to get ">" - - tempClosest[k] = tempVector; - } - - vector< vector > dists; dists.resize(querySeqs.size()); - int count = 0; - for (int k = lines[i]->start; k < lines[i]->end; k++) { - dists[k].resize(sizes[count]); - for (int j = 0; j < sizes[count]; j++) { - in >> dists[k][j]; - } - gobble(in); - count++; - } - - count = 0; - for (int k = lines[i]->start; k < lines[i]->end; k++) { - for (int j = 0; j < sizes[count]; j++) { - closest[k][j].seq = tempClosest[k][j]; - closest[k][j].dist = dists[k][j]; - } - count++; - } - - in.close(); - remove(s.c_str()); - } -#else - for (int i = 0; i < closest.size(); i++) { - removeBadReferenceSeqs(closest[i], i); - } -#endif - - } - catch(exception& e) { - errorOut(e, "Ccode", "createProcessesRemoveBad"); - exit(1); - } -} -//*************************************************************************************************************** -void Ccode::createProcessesAverages() { - try { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - //find the averages for each querys references - for (int i = lines[process]->start; i < lines[process]->end; i++) { - getAverageRef(closest[i], i); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i]. - } - - //find the averages for the query - for (int i = lines[process]->start; i < lines[process]->end; i++) { - getAverageQuery(closest[i], i); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i]. - } - - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - //output pairs - for (int i = lines[process]->start; i < lines[process]->end; i++) { - for (int j = 0; j < windows[i].size(); j++) { - out << sumRef[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << averageRef[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << sumSquaredRef[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << sumQuery[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << averageQuery[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << sumSquaredQuery[i][j] << '\t'; - } - out << endl; - out << refCombo[i] << endl; - } - - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;istart; k < lines[i]->end; k++) { - sumRef[k].resize(windows[k].size()); - averageRef[k].resize(windows[k].size()); - sumSquaredRef[k].resize(windows[k].size()); - averageQuery[k].resize(windows[k].size()); - sumQuery[k].resize(windows[k].size()); - sumSquaredQuery[k].resize(windows[k].size()); - - for (int j = 0; j < windows[k].size(); j++) { - in >> sumRef[k][j]; - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> averageRef[k][j]; - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> sumSquaredRef[k][j]; - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> sumQuery[k][j]; - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> averageQuery[k][j]; - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> sumSquaredQuery[k][j]; - } - gobble(in); - in >> refCombo[k]; - gobble(in); - } - - in.close(); - remove(s.c_str()); - } -#else - //find the averages for each querys references - for (int i = 0; i < querySeqs.size(); i++) { - getAverageRef(closest[i], i); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i]. - } - - //find the averages for the query - for (int i = 0; i < querySeqs.size(); i++) { - getAverageQuery(closest[i], i); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i]. - } - -#endif - - } - catch(exception& e) { - errorOut(e, "Ccode", "createProcessesAverages"); - exit(1); - } -} -//*************************************************************************************************************** -void Ccode::createProcessesVariances() { - try { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - //find the averages for each querys references - for (int i = lines[process]->start; i < lines[process]->end; i++) { - findVarianceRef(i); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0. - } - - //find the averages for the query - for (int i = lines[process]->start; i < lines[process]->end; i++) { - findVarianceQuery(i); //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0. - } - - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - //output pairs - for (int i = lines[process]->start; i < lines[process]->end; i++) { - for (int j = 0; j < windows[i].size(); j++) { - out << varRef[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << sdRef[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << varQuery[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << sdQuery[i][j] << '\t'; - } - out << endl; - } - - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;istart; k < lines[i]->end; k++) { - varRef[k].resize(windows[k].size()); - sdRef[k].resize(windows[k].size()); - varQuery[k].resize(windows[k].size()); - sdQuery[k].resize(windows[k].size()); - - for (int j = 0; j < windows[k].size(); j++) { - in >> varRef[k][j]; - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> sdRef[k][j]; - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> varQuery[k][j]; - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> sdQuery[k][j]; - } - gobble(in); - } - - in.close(); - remove(s.c_str()); - } -#else - //find the averages for each querys references - for (int i = 0; i < querySeqs.size(); i++) { - findVarianceRef(i); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0. - } - - //find the averages for the query - for (int i = 0; i < querySeqs.size(); i++) { - findVarianceQuery(i); //fills v arQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0. - } -#endif - } - catch(exception& e) { - errorOut(e, "Ccode", "createProcessesVariances"); - exit(1); - } -} -//*************************************************************************************************************** -void Ccode::createProcessesDetermine() { - try { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - //find the averages for each querys references - for (int i = lines[process]->start; i < lines[process]->end; i++) { - determineChimeras(i); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. - } - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - //output pairs - for (int i = lines[process]->start; i < lines[process]->end; i++) { - for (int j = 0; j < windows[i].size(); j++) { - out << anova[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << isChimericConfidence[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << isChimericTStudent[i][j] << '\t'; - } - out << endl; - for (int j = 0; j < windows[i].size(); j++) { - out << isChimericANOVA[i][j] << '\t'; - } - out << endl; - } - - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;istart; k < lines[i]->end; k++) { - anova[k].resize(windows[k].size()); - isChimericConfidence[k].resize(windows[k].size(), false); - isChimericTStudent[k].resize(windows[k].size(), false); - isChimericANOVA[k].resize(windows[k].size(), false); - int tempBool; - - for (int j = 0; j < windows[k].size(); j++) { - in >> anova[k][j]; - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> tempBool; - if (tempBool == 1) { isChimericConfidence[k][j] = true; } - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> tempBool; - if (tempBool == 1) { isChimericTStudent[k][j] = true; } - } - gobble(in); - for (int j = 0; j < windows[k].size(); j++) { - in >> tempBool; - if (tempBool == 1) { isChimericANOVA[k][j] = true; } - } - gobble(in); - } - - in.close(); - remove(s.c_str()); - } - #else - for (int i = 0; i < querySeqs.size(); i++) { - determineChimeras(i); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. - } - #endif - - } - catch(exception& e) { - errorOut(e, "Ccode", "createProcessesDetermine"); - exit(1); - } -} //*************************************************************************************************************** diff --git a/ccode.h b/ccode.h index 52592f9..c60b130 100644 --- a/ccode.h +++ b/ccode.h @@ -24,70 +24,57 @@ class Ccode : public Chimera { public: - Ccode(string, string, string); + Ccode(string, string); ~Ccode(); - int getChimeras(); + int getChimeras(Sequence* query); void print(ostream&); - - void setCons(string c) {} - void setQuantiles(string q) {} - - + void printHeader(ostream&); private: Dist* distCalc; DeCalculator* decalc; int iters; - string fastafile, templateFile; + string fastafile, mapInfo; + Sequence* querySeq; - vector lines; - vector querySeqs; - vector templateSeqs; - vector< map > spotMap; + map spotMap; map::iterator it; - vector< vector > windows; //windows[0] is the vector of window breaks for querySeqs[0] - vector windowSizes; //windowSizes[0] is the size of the windows for querySeqs[0] - vector< map > trim; //trim[0] is the map containing the starting and ending positions for querySeqs[0] set of seqs - vector< vector > closest; //closest[0] is a vector of sequence at are closest to queryseqs[0]... - vector< vector > averageRef; //averageRef[0] is the average distance at each window for the references for querySeqs[0] - vector< vector > averageQuery; //averageQuery[0] is the average distance at each winow for the query for querySeqs[0] - vector< vector > sumRef; //sumRef[0] is the sum of distances at each window for the references for querySeqs[0] - vector< vector > sumSquaredRef; //sumSquaredRef[0] is the sum of squared distances at each window for the references for querySeqs[0] - vector< vector > sumQuery; //sumQuery[0] is the sum of distances at each window for the comparison of query to references for querySeqs[0] - vector< vector > sumSquaredQuery; //sumSquaredQuery[0] is the sum of squared distances at each window for the comparison of query to references for querySeqs[0] - vector< vector > varRef; //varRef[0] is the variance among references seqs at each window for querySeqs[0] - vector< vector > varQuery; //varQuery[0] is the variance among references and querySeqs[0] at each window - vector< vector > sdRef; //sdRef[0] is the standard deviation of references seqs at each window for querySeqs[0] - vector< vector > sdQuery; //sdQuery[0] is the standard deviation of references and querySeqs[0] at each window - vector< vector > anova; //anova[0] is the vector of anova scores for each window for querySeqs[0] - vector refCombo; //refCombo[0] is the number of reference sequences combinations for querySeqs[0] - vector< vector > isChimericConfidence; //isChimericConfidence[0] indicates whether querySeqs[0] is chimeric at a given window according to the confidence limits - vector< vector > isChimericTStudent; //isChimericConfidence[0] indicates whether querySeqs[0] is chimeric at a given window according to the confidence limits - vector< vector > isChimericANOVA; //isChimericConfidence[0] indicates whether querySeqs[0] is chimeric at a given window according to the confidence limits + vector windows; //windows is the vector of window breaks for query + int windowSizes; //windowSizes is the size of the windows for query + map trim; //trim is the map containing the starting and ending positions for query + vector closest; //closest is a vector of sequence at are closest to query + vector averageRef; //averageRef is the average distance at each window for the references for query + vector averageQuery; //averageQuery is the average distance at each winow for the query for query + vector sumRef; //sumRef is the sum of distances at each window for the references for query + vector sumSquaredRef; //sumSquaredRef is the sum of squared distances at each window for the references for query + vector sumQuery; //sumQuery is the sum of distances at each window for the comparison of query to references for query + vector sumSquaredQuery; //sumSquaredQuery is the sum of squared distances at each window for the comparison of query to references for query + vector varRef; //varRef is the variance among references seqs at each window for query + vector varQuery; //varQuery is the variance among references and query at each window + vector sdRef; //sdRef is the standard deviation of references seqs at each window for query + vector sdQuery; //sdQuery is the standard deviation of references and query at each window + vector anova; //anova is the vector of anova scores for each window for query + int refCombo; //refCombo is the number of reference sequences combinations for query + vector isChimericConfidence; //isChimericConfidence indicates whether query is chimeric at a given window according to the confidence limits + vector isChimericTStudent; //isChimericConfidence indicates whether query is chimeric at a given window according to the confidence limits + vector isChimericANOVA; //isChimericConfidence indicates whether query is chimeric at a given window according to the confidence limits - vector< vector > findClosest(int, int, int); - void removeBadReferenceSeqs(vector&, int); //removes sequences from closest that are to different of too similar to eachother. - void trimSequences(int); - vector findWindows(int); - void getAverageRef(vector, int); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i]. - void getAverageQuery (vector, int); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i]. - void findVarianceRef (int); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0. - void findVarianceQuery (int); //fills varQuery[i] and sdQuery[i] - void determineChimeras (int); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i]. + vector findClosest(Sequence*, int); + void removeBadReferenceSeqs(vector&); //removes sequences from closest that are to different of too similar to eachother. + void trimSequences(Sequence*); + vector findWindows(); + void getAverageRef(vector); //fills sumRef, averageRef, sumSquaredRef and refCombo. + void getAverageQuery (vector, Sequence*); //fills sumQuery, averageQuery, sumSquaredQuery. + void findVarianceRef (); //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0. + void findVarianceQuery (); //fills varQuery and sdQuery + void determineChimeras (); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. int getDiff(string, string); //return number of mismatched bases, a gap to base is not counted as a mismatch float getT(int); float getF(int); - - void createProcessesClosest(); - void createProcessesRemoveBad(); - void createProcessesAverages(); - void createProcessesVariances(); - void createProcessesDetermine(); - }; /***********************************************************/ diff --git a/chimera.cpp b/chimera.cpp index 4ae4991..bf16de4 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -13,7 +13,7 @@ //this is a vertical soft filter void Chimera::createFilter(vector seqs) { try { - + filterString = ""; int threshold = int (0.5 * seqs.size()); //cout << "threshhold = " << threshold << endl; @@ -60,33 +60,29 @@ void Chimera::createFilter(vector seqs) { } } //*************************************************************************************************************** -void Chimera::runFilter(vector seqs) { +void Chimera::runFilter(Sequence* seq) { try { - //for each sequence - for (int i = 0; i < seqs.size(); i++) { - - string seqAligned = seqs[i]->getAligned(); - string newAligned = ""; + string seqAligned = seq->getAligned(); + string newAligned = ""; - for (int j = 0; j < seqAligned.length(); j++) { - //if this spot is a gap - if (filterString[j] == '1') { newAligned += seqAligned[j]; } - } - - seqs[i]->setAligned(newAligned); + for (int j = 0; j < seqAligned.length(); j++) { + //if this spot is a gap + if (filterString[j] == '1') { newAligned += seqAligned[j]; } } - + + seq->setAligned(newAligned); } catch(exception& e) { errorOut(e, "Chimera", "runFilter"); exit(1); } } - //*************************************************************************************************************** vector Chimera::readSeqs(string file) { try { + + mothurOut("Reading sequences... "); cout.flush(); ifstream in; openInputFile(file, in); vector container; @@ -108,6 +104,8 @@ vector Chimera::readSeqs(string file) { } in.close(); + mothurOut("Done."); mothurOutEndLine(); + return container; } catch(exception& e) { @@ -187,6 +185,31 @@ vector< vector > Chimera::readQuantiles() { } } //*************************************************************************************************************** +Sequence* Chimera::getSequence(string name) { + try{ + Sequence* temp; + + //look through templateSeqs til you find it + int spot = -1; + for (int i = 0; i < templateSeqs.size(); i++) { + if (name == templateSeqs[i]->getName()) { + spot = i; + break; + } + } + + if(spot == -1) { mothurOut("Error: Could not find sequence."); mothurOutEndLine(); return NULL; } + + temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned()); + + return temp; + } + catch(exception& e) { + errorOut(e, "Chimera", "getSequence"); + exit(1); + } +} +//*************************************************************************************************************** diff --git a/chimera.h b/chimera.h index 8b3b0eb..a968a02 100644 --- a/chimera.h +++ b/chimera.h @@ -14,7 +14,7 @@ #include "mothur.h" #include "sequence.hpp" - +/***********************************************************************/ struct Preference { string name; vector leftParent; //keep the name of closest left associated with the two scores @@ -25,12 +25,52 @@ struct Preference { int midpoint; }; +/***********************************************************************/ +struct score_struct { + int prev; + int score; + int row; + int col; +}; +/***********************************************************************/ +struct trace_struct { + int col; + int oldCol; + int row; +}; +/***********************************************************************/ +struct results { + int regionStart; + int regionEnd; + int nastRegionStart; + int nastRegionEnd; + string parent; + float queryToParent; + float queryToParentLocal; + float divR; +}; +/***********************************************************************/ +struct rank { + int num; + float score; + rank(int n, float s) : num(n), score(s) {} +}; +/***********************************************************************/ struct SeqDist { Sequence* seq; float dist; }; - +//******************************************************************************************************************** +//sorts highest to lowest +inline bool compareMembers(rank left, rank right){ + return (left.score > right.score); +} +//******************************************************************************************************************** +//sorts lowest to highest +inline bool compareRegionStart(results left, results right){ + return (left.nastRegionStart < right.nastRegionStart); +} //******************************************************************************************************************** //sorts lowest to highest inline bool compareSeqDist(SeqDist left, SeqDist right){ @@ -59,8 +99,8 @@ class Chimera { public: Chimera(){}; + Chimera(string); Chimera(string, string); - Chimera(string, string, string); virtual ~Chimera(){}; virtual void setFilter(bool f) { filter = f; } virtual void setCorrection(bool c) { correction = c; } @@ -76,30 +116,37 @@ class Chimera { virtual void setDivR(float d) { divR = d; } virtual void setParents(int p) { parents = p; } virtual void setMinSim(int s) { minSim = s; } + virtual void setMinCoverage(int c) { minCov = c; } + virtual void setMinBS(int b) { minBS = b; } + virtual void setMinSNP(int s) { minSNP = s; } virtual void setIters(int i) { iters = i; } - + virtual void setTemplateSeqs(vector t) { templateSeqs = t; } + virtual bool getUnaligned() { return unaligned; } + virtual void setTemplateFile(string t) { templateFileName = t; } virtual void setCons(string){}; virtual void setQuantiles(string){}; + virtual void doPrep(){}; virtual vector readSeqs(string); virtual vector< vector > readQuantiles(); virtual void setMask(string); - virtual void runFilter(vector); + virtual void runFilter(Sequence*); virtual void createFilter(vector); + virtual void printHeader(ostream&){}; + virtual int getChimeras(Sequence*){ return 0; } + virtual int getChimeras(){ return 0; } + virtual void print(ostream&){}; - //pure functions - virtual int getChimeras() = 0; - virtual void print(ostream&) = 0; protected: + vector templateSeqs; bool filter, correction, svg, unaligned; - int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, parents, iters; + int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters; float divR; - string seqMask, quanfile, filterString, name, outputDir; - - + string seqMask, quanfile, filterString, name, outputDir, templateFileName; + Sequence* getSequence(string); //find sequence from name }; /***********************************************************************/ diff --git a/chimeracheckrdp.cpp b/chimeracheckrdp.cpp index 04c143f..39ed487 100644 --- a/chimeracheckrdp.cpp +++ b/chimeracheckrdp.cpp @@ -10,12 +10,11 @@ #include "chimeracheckrdp.h" //*************************************************************************************************************** -ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp, string o) { fastafile = filename; templateFile = temp; outputDir = o; } +ChimeraCheckRDP::ChimeraCheckRDP(string filename, string o) { fastafile = filename; outputDir = o; } //*************************************************************************************************************** ChimeraCheckRDP::~ChimeraCheckRDP() { try { - for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } delete templateDB; delete kmer; } @@ -28,113 +27,65 @@ ChimeraCheckRDP::~ChimeraCheckRDP() { void ChimeraCheckRDP::print(ostream& out) { try { - mothurOutEndLine(); - - //vector isChimeric; isChimeric.resize(querySeqs.size(), false); + mothurOut("Processing: " + querySeq->getName()); mothurOutEndLine(); - for (int i = 0; i < querySeqs.size(); i++) { + out << querySeq->getName() << endl; + out << "IS scores: " << '\t'; - out << querySeqs[i]->getName() << endl; - out << "IS scores: " << '\t'; - - //int lastChimericWindowFound = 0; - - for (int k = 0; k < IS[i].size(); k++) { - out << IS[i][k].score << '\t'; - //if (IS[i][k].score > chimeraCutoff) { isChimeric[i] = true; lastChimericWindowFound = k; } - } - out << endl; - //if (isChimeric[i]) { - //mothurOut(querySeqs[i]->getName() + "\tIS: " + toString(IS[i][lastChimericWindowFound].score) + "\tbreakpoint: " + toString(IS[i][lastChimericWindowFound].midpoint) + "\tleft parent: " + IS[i][lastChimericWindowFound].leftParent + "\tright parent: " + IS[i][lastChimericWindowFound].rightParent); mothurOutEndLine(); - //out << endl << "chimera: YES" << endl; - //}else{ - //out << endl << "chimera: NO" << endl; - //} + for (int k = 0; k < IS.size(); k++) { + out << IS[k].score << '\t'; + } + out << endl; + + if (svg) { + if (name != "") { //if user has specific names + map::iterator it = names.find(querySeq->getName()); - if (svg) { - - if (name != "") { //if user has specific names - map::iterator it = names.find(querySeqs[i]->getName()); - - if (it != names.end()) { //user wants pic of this - makeSVGpic(IS[i], i); //zeros out negative results - } - }else{//output them all - makeSVGpic(IS[i], i); //zeros out negative results - } + if (it != names.end()) { //user wants pic of this + makeSVGpic(IS); //zeros out negative results } + }else{//output them all + makeSVGpic(IS); //zeros out negative results + } } - - mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); mothurOutEndLine(); } catch(exception& e) { errorOut(e, "ChimeraCheckRDP", "print"); exit(1); } } - //*************************************************************************************************************** -int ChimeraCheckRDP::getChimeras() { +void ChimeraCheckRDP::doPrep() { try { - - //read in query sequences and subject sequences - mothurOutEndLine(); - mothurOut("Reading query sequences... "); cout.flush(); - querySeqs = readSeqs(fastafile); - mothurOut("Done."); - //templateSeqs = readSeqs(templateFile); - templateDB = new AlignmentDB(templateFile, "kmer", kmerSize, 0.0,0.0,0.0,0.0); + templateDB = new AlignmentDB(templateFileName, "kmer", kmerSize, 0.0,0.0,0.0,0.0); mothurOutEndLine(); - int numSeqs = querySeqs.size(); - - IS.resize(numSeqs); - closest.resize(numSeqs); - - //break up file if needed - int linesPerProcess = numSeqs / processors ; - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - //find breakup of sequences for all times we will Parallelize - if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } - else { - //fill line pairs - for (int i = 0; i < (processors-1); i++) { - lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); - } - //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end - int i = processors - 1; - lines.push_back(new linePair((i*linesPerProcess), numSeqs)); - } - - #else - lines.push_back(new linePair(0, numSeqs)); - #endif - kmer = new Kmer(kmerSize); if (name != "") { readName(name); //fills name map with names of seqs the user wants to have .svg for. } + } + catch(exception& e) { + errorOut(e, "ChimeraCheckRDP", "doPrep"); + exit(1); + } +} +//*************************************************************************************************************** +int ChimeraCheckRDP::getChimeras(Sequence* query) { + try { - //find closest seq to each querySeq - for (int i = 0; i < querySeqs.size(); i++) { - closest[i] = templateDB->findClosestSequence(querySeqs[i]); - } - - //for each query find IS value - if (processors == 1) { - for (int i = 0; i < querySeqs.size(); i++) { - IS[i] = findIS(i); - } - }else { createProcessesIS(); } - + IS.clear(); + + querySeq = query; + + closest = templateDB->findClosestSequence(query); + + IS = findIS(); + //determine chimera report cutoff - window score above 95% //getCutoff(); - not very acurate predictor - //free memory - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } - return 0; } catch(exception& e) { @@ -143,7 +94,7 @@ int ChimeraCheckRDP::getChimeras() { } } //*************************************************************************************************************** -vector ChimeraCheckRDP::findIS(int query) { +vector ChimeraCheckRDP::findIS() { try { @@ -154,13 +105,11 @@ vector ChimeraCheckRDP::findIS(int query) { vector< map > subjectKmerInfo; vector isValues; - string queryName = querySeqs[query]->getName(); - string seq = querySeqs[query]->getUnaligned(); - - mothurOut("Finding IS values for sequence " + toString(query+1)); mothurOutEndLine(); + string queryName = querySeq->getName(); + string seq = querySeq->getUnaligned(); queryKmerInfo = kmer->getKmerCounts(seq); - subjectKmerInfo = kmer->getKmerCounts(closest[query].getUnaligned()); + subjectKmerInfo = kmer->getKmerCounts(closest.getUnaligned()); //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each int nTotal = calcKmers(queryKmerInfo[(queryKmerInfo.size()-1)], subjectKmerInfo[(subjectKmerInfo.size()-1)]); @@ -238,7 +187,7 @@ vector ChimeraCheckRDP::findIS(int query) { delete left; delete right; - } + } return isValues; @@ -309,36 +258,10 @@ int ChimeraCheckRDP::calcKmers(map query, map subject) { } //*************************************************************************************************************** -void ChimeraCheckRDP::getCutoff() { - try{ - - vector temp; - - //store all is scores for all windows - for (int i = 0; i < IS.size(); i++) { - for (int j = 0; j < IS[i].size(); j++) { - temp.push_back(IS[i][j].score); - } - } - - //sort them - sort(temp.begin(), temp.end()); - - //get 95% - chimeraCutoff = temp[int(temp.size() * 0.95)]; - - } - catch(exception& e) { - errorOut(e, "ChimeraCheckRDP", "getCutoff"); - exit(1); - } -} - -//*************************************************************************************************************** -void ChimeraCheckRDP::makeSVGpic(vector info, int query) { +void ChimeraCheckRDP::makeSVGpic(vector info) { try{ - string file = outputDir + querySeqs[query]->getName() + ".chimeracheck.svg"; + string file = outputDir + querySeq->getName() + ".chimeracheck.svg"; ofstream outsvg; openOutputFile(file, outsvg); @@ -346,7 +269,7 @@ void ChimeraCheckRDP::makeSVGpic(vector info, int query) { outsvg << "\n"; outsvg << "\n"; - outsvg << "Plotted IS values for " + querySeqs[query]->getName() + "\n"; + outsvg << "Plotted IS values for " + querySeq->getName() + "\n"; outsvg << "\n"; outsvg << "\n"; @@ -392,96 +315,5 @@ void ChimeraCheckRDP::makeSVGpic(vector info, int query) { } } //*************************************************************************************************************** -void ChimeraCheckRDP::createProcessesIS() { - try { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - for (int i = lines[process]->start; i < lines[process]->end; i++) { - IS[i] = findIS(i); - } - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - //output pairs - for (int i = lines[process]->start; i < lines[process]->end; i++) { - out << IS[i].size() << endl; - for (int j = 0; j < IS[i].size(); j++) { - out << IS[i][j].leftParent << '\t'<< IS[i][j].rightParent << '\t' << IS[i][j].midpoint << '\t' << IS[i][j].score << endl; - } - } - - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;istart; k < lines[i]->end; k++) { - - int size; - in >> size; gobble(in); - - string left, right; - int mid; - float score; - - IS[k].clear(); - - for (int j = 0; j < size; j++) { - in >> left >> right >> mid >> score; gobble(in); - - sim temp; - temp.leftParent = left; - temp.rightParent = right; - temp.midpoint = mid; - temp.score = score; - - IS[k].push_back(temp); - } - } - - in.close(); - remove(s.c_str()); - } -#else - for (int i = 0; i < querySeqs.size(); i++) { - IS[i] = findIS(i); - } -#endif - } - catch(exception& e) { - errorOut(e, "ChimeraCheckRDP", "createProcessesIS"); - exit(1); - } -} - -//*************************************************************************************************************** diff --git a/chimeracheckrdp.h b/chimeracheckrdp.h index 6193d6d..e12e7d3 100644 --- a/chimeracheckrdp.h +++ b/chimeracheckrdp.h @@ -25,45 +25,30 @@ class ChimeraCheckRDP : public Chimera { public: - ChimeraCheckRDP(string, string, string); + ChimeraCheckRDP(string, string); ~ChimeraCheckRDP(); - int getChimeras(); + int getChimeras(Sequence*); void print(ostream&); - - void setCons(string){}; - void setQuantiles(string q) {}; - + void doPrep(); private: - vector lines; - vector querySeqs; + + Sequence* querySeq; AlignmentDB* templateDB; Kmer* kmer; - - vector< vector > IS; //IS[0] is the vector of IS values for each window for querySeqs[0] - float chimeraCutoff; - - - //map > >:: iterator it; - //map::iterator it2; - - vector closest; //closest[0] is the closest overall seq to querySeqs[0]. - - string fastafile, templateFile; + Sequence closest; //closest is the closest overall seq to query + + vector IS; //IS is the vector of IS values for each window for query + string fastafile; map names; - vector findIS(int); + vector findIS(); int calcKmers(map, map); - void getCutoff(); - void makeSVGpic(vector, int); + void makeSVGpic(vector); void readName(string); - - void createProcessesIS(); - }; - /***********************************************************/ #endif diff --git a/chimerarealigner.cpp b/chimerarealigner.cpp new file mode 100644 index 0000000..57c3641 --- /dev/null +++ b/chimerarealigner.cpp @@ -0,0 +1,108 @@ +/* + * chimerarealigner.cpp + * Mothur + * + * Created by westcott on 2/12/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "chimerarealigner.h" +#include "needlemanoverlap.hpp" +#include "nast.hpp" + +//*************************************************************************************************************** +ChimeraReAligner::ChimeraReAligner(vector t, int m, int mm) : match(m), misMatch(mm) { templateSeqs = t; } +//*************************************************************************************************************** +ChimeraReAligner::~ChimeraReAligner() {} +//*************************************************************************************************************** +void ChimeraReAligner::reAlign(Sequence* query, vector parents) { + try { + if (parents.size() != 0) { + vector queryParts; + vector parentParts; //queryParts[0] relates to parentParts[0] + + string qAligned = query->getAligned(); + string newQuery = ""; + + //sort parents by region start + sort(parents.begin(), parents.end(), compareRegionStart); + + //make sure you don't cutoff beginning of query + if (parents[0].nastRegionStart > 0) { newQuery += qAligned.substr(0, parents[0].nastRegionStart+1); } + int longest = 0; + //take query and break apart into pieces using breakpoints given by results of parents + for (int i = 0; i < parents.size(); i++) { + int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1; + string q = qAligned.substr(parents[i].nastRegionStart, length); + Sequence* queryFrag = new Sequence(query->getName(), q); + + queryParts.push_back(queryFrag); + + Sequence* parent = getSequence(parents[i].parent); + string p = parent->getAligned(); + p = p.substr(parents[i].nastRegionStart, length); + parent->setAligned(p); + + parentParts.push_back(parent); + + if (q.length() > longest) { longest = q.length(); } + if (p.length() > longest) { longest = p.length(); } + } + + //align each peice to correct parent from results + for (int i = 0; i < queryParts.size(); i++) { + alignment = new NeedlemanOverlap(-2.0, match, misMatch, longest+1); //default gapopen, match, mismatch, longestbase + Nast nast(alignment, queryParts[i], parentParts[i]); + delete alignment; + } + + //recombine pieces to form new query sequence + for (int i = 0; i < queryParts.size(); i++) { + newQuery += queryParts[i]->getAligned(); + } + + //make sure you don't cutoff end of query + if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) { newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd-1); } + + //set query to new aligned string + query->setAligned(newQuery); + + //free memory + for (int i = 0; i < queryParts.size(); i++) { delete queryParts[i]; } + for (int i = 0; i < parentParts.size(); i++) { delete parentParts[i]; } + + } //else leave query alone, you have bigger problems... + + } + catch(exception& e) { + errorOut(e, "ChimeraReAligner", "reAlign"); + exit(1); + } +} +//*************************************************************************************************************** +Sequence* ChimeraReAligner::getSequence(string name) { + try{ + Sequence* temp; + + //look through templateSeqs til you find it + int spot = -1; + for (int i = 0; i < templateSeqs.size(); i++) { + if (name == templateSeqs[i]->getName()) { + spot = i; + break; + } + } + + if(spot == -1) { mothurOut("Error: Could not find sequence."); mothurOutEndLine(); return NULL; } + + temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned()); + + return temp; + } + catch(exception& e) { + errorOut(e, "ChimeraReAligner", "getSequence"); + exit(1); + } +} +//*************************************************************************************************************** diff --git a/chimerarealigner.h b/chimerarealigner.h new file mode 100644 index 0000000..2b53841 --- /dev/null +++ b/chimerarealigner.h @@ -0,0 +1,37 @@ +#ifndef CHIMERAREALIGNER_H +#define CHIMERAREALIGNER_H + +/* + * chimerarealigner.h + * Mothur + * + * Created by westcott on 2/12/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "chimera.h" +#include "alignment.hpp" + +/***********************************************************/ + +class ChimeraReAligner { + + public: + ChimeraReAligner(vector, int, int); + ~ChimeraReAligner(); + + void reAlign(Sequence*, vector); + + private: + Sequence* querySeq; + Alignment* alignment; + vector templateSeqs; + int match, misMatch; + + Sequence* getSequence(string); //find sequence from name +}; +/***********************************************************/ + +#endif + diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index b06d369..e1eebf4 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -26,7 +26,8 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim", "parents", "iters","outputdir","inputdir" }; + string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", + "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -148,7 +149,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ svg = isTrue(temp); temp = validParameter.validFile(parameters, "window", false); - if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; } + if ((temp == "not found") && (method == "chimeraslayer")) { temp = "50"; } else if (temp == "not found") { temp = "0"; } convert(temp, window); @@ -158,25 +159,37 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found") { temp = "-4"; } convert(temp, mismatch); - temp = validParameter.validFile(parameters, "divergence", false); if (temp == "not found") { temp = "1.0"; } + temp = validParameter.validFile(parameters, "divergence", false); if (temp == "not found") { temp = "1.007"; } convert(temp, divR); temp = validParameter.validFile(parameters, "minsim", false); if (temp == "not found") { temp = "90"; } convert(temp, minSimilarity); - temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "5"; } + temp = validParameter.validFile(parameters, "mincov", false); if (temp == "not found") { temp = "70"; } + convert(temp, minCoverage); + + temp = validParameter.validFile(parameters, "minbs", false); if (temp == "not found") { temp = "90"; } + convert(temp, minBS); + + temp = validParameter.validFile(parameters, "minsnp", false); if (temp == "not found") { temp = "10"; } + convert(temp, minSNP); + + temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "3"; } convert(temp, parents); - temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } + temp = validParameter.validFile(parameters, "iters", false); + if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; } + else if (temp == "not found") { temp = "1000"; } convert(temp, iters); temp = validParameter.validFile(parameters, "increment", false); - if ((temp == "not found") && ((method == "chimeracheck") || (method == "chimeraslayer"))) { temp = "10"; } + if ((temp == "not found") && (method == "chimeracheck")) { temp = "10"; } + else if ((temp == "not found") && (method == "chimeraslayer")) { temp = "5"; } else if (temp == "not found") { temp = "25"; } convert(temp, increment); temp = validParameter.validFile(parameters, "numwanted", false); - if ((temp == "not found") && (method == "chimeraslayer")) { temp = "10"; } + if ((temp == "not found") && (method == "chimeraslayer")) { temp = "15"; } else if (temp == "not found") { temp = "20"; } convert(temp, numwanted); @@ -216,7 +229,11 @@ void ChimeraSeqsCommand::help(){ mothurOut("The ksize parameter allows you to input kmersize. \n"); mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence.\n"); mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n"); - //mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method.\n"); + mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method.\n"); + mothurOut("The minsim parameter allows you .... \n"); + mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n"); + mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n"); + mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n"); mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n"); mothurOut("Details for each method: \n"); mothurOut("\tpintail: \n"); @@ -252,22 +269,20 @@ int ChimeraSeqsCommand::execute(){ if (abort == true) { return 0; } + int start = time(NULL); + if (method == "bellerophon") { chimera = new Bellerophon(fastafile, outputDir); } - else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile, outputDir); } - else if (method == "ccode") { chimera = new Ccode(fastafile, templatefile, outputDir); } - else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, templatefile, outputDir); } - else if (method == "chimeraslayer") { chimera = new ChimeraSlayer(fastafile, templatefile); } + else if (method == "pintail") { chimera = new Pintail(fastafile, outputDir); } + else if (method == "ccode") { chimera = new Ccode(fastafile, outputDir); } + else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, outputDir); } + else if (method == "chimeraslayer") { chimera = new ChimeraSlayer("blast"); } else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } //set user options if (maskfile == "default") { mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); mothurOutEndLine(); } - //saves time to avoid generating it chimera->setCons(consfile); - - //saves time to avoid generating it chimera->setQuantiles(quanfile); - chimera->setMask(maskfile); chimera->setFilter(filter); chimera->setCorrection(correction); @@ -283,25 +298,126 @@ int ChimeraSeqsCommand::execute(){ chimera->setDivR(divR); chimera->setParents(parents); chimera->setMinSim(minSimilarity); + chimera->setMinCoverage(minCoverage); + chimera->setMinBS(minBS); + chimera->setMinSNP(minSNP); chimera->setIters(iters); + chimera->setTemplateFile(templatefile); + - - //find chimeras - int error = chimera->getChimeras(); - //there was a problem - if (error == 1) { return 0; } + vector templateSeqs; + if ((method != "bellerophon") && (method != "chimeracheck")) { + templateSeqs = chimera->readSeqs(templatefile); + if (chimera->getUnaligned()) { + mothurOut("Your sequences need to be aligned when you use the chimeraslayer method."); mothurOutEndLine(); + //free memory + for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } + return 0; + } + + //set options + chimera->setTemplateSeqs(templateSeqs); + }else if (method == "bellerophon") {//run bellerophon separately since you need to read entire fastafile to run it + chimera->getChimeras(); + + string outputFName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras"; + ofstream out; + openOutputFile(outputFName, out); + + chimera->print(out); + out.close(); + return 0; + } + + //some methods need to do prep work before processing the chimeras + chimera->doPrep(); + + ofstream outHeader; + string tempHeader = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras.tempHeader"; + openOutputFile(tempHeader, outHeader); + + chimera->printHeader(outHeader); + outHeader.close(); + string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras"; - ofstream out; - openOutputFile(outputFileName, out); + + //break up file + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numSeqs)); + + driver(lines[0], outputFileName, fastafile); + + }else{ + vector positions; + processIDS.resize(0); + + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); + if (input.length() != 0) { + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + numSeqs = positions.size(); + + int numSeqsPerProcessor = numSeqs / processors; + + for (int i = 0; i < processors; i++) { + long int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; + } + lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } + + + createProcesses(outputFileName, fastafile); + + rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); + + //append alignment and report files + for(int i=1;i(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + lines.push_back(new linePair(0, numSeqs)); + + driver(lines[0], outputFileName, fastafile); + #endif - //print results - chimera->print(out); + //mothurOut("Output File Names: "); + //if ((filter) && (method == "bellerophon")) { mothurOut( + //if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; } + // else { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; } - out.close(); + appendOutputFiles(tempHeader, outputFileName); + remove(tempHeader.c_str()); + + for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } + + if (method == "chimeracheck") { mothurOutEndLine(); mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); mothurOutEndLine(); } - delete chimera; + mothurOutEndLine(); mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); mothurOutEndLine(); return 0; @@ -310,6 +426,107 @@ int ChimeraSeqsCommand::execute(){ errorOut(e, "ChimeraSeqsCommand", "execute"); exit(1); } +}//********************************************************************************************************************** + +int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filename){ + try { + ofstream out; + openOutputFile(outputFName, out); + + ifstream inFASTA; + openInputFile(filename, inFASTA); + + inFASTA.seekg(line->start); + + for(int i=0;inumSeqs;i++){ + + Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); + + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file + + //find chimeras + chimera->getChimeras(candidateSeq); + + //print results + chimera->print(out); + } + delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ mothurOut("Processing sequence: " + toString(i+1)); mothurOutEndLine(); } + } + //report progress + if((line->numSeqs) % 100 != 0){ mothurOut("Processing sequence: " + toString(line->numSeqs)); mothurOutEndLine(); } + + out.close(); + inFASTA.close(); + + return 1; + } + catch(exception& e) { + errorOut(e, "ChimeraSeqsCommand", "driver"); + exit(1); + } } + /**************************************************************************************************/ +void ChimeraSeqsCommand::createProcesses(string outputFileName, string filename) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + // processIDS.resize(0); + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename); + exit(0); + }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;i processIDS; //processid + vector lines; + int driver(linePair*, string, string); + void createProcesses(string, string); + void appendOutputFiles(string, string); + bool abort; string method, fastafile, templatefile, consfile, quanfile, maskfile, namefile, outputDir; bool filter, correction, svg, printAll; - int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity; + int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs; float divR; Chimera* chimera; diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 5d3d670..b2c0a05 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -8,161 +8,109 @@ */ #include "chimeraslayer.h" +#include "chimerarealigner.h" //*************************************************************************************************************** -ChimeraSlayer::ChimeraSlayer(string filename, string temp) { fastafile = filename; templateFile = temp; } +ChimeraSlayer::ChimeraSlayer(string mode) : searchMethod(mode) { decalc = new DeCalculator(); } //*************************************************************************************************************** - -ChimeraSlayer::~ChimeraSlayer() { - try { - for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } - for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } - } - catch(exception& e) { - errorOut(e, "ChimeraSlayer", "~ChimeraSlayer"); - exit(1); - } -} +ChimeraSlayer::~ChimeraSlayer() { delete decalc; } +//*************************************************************************************************************** +void ChimeraSlayer::printHeader(ostream& out) { + mothurOutEndLine(); + mothurOut("Only reporting sequence supported by 90% of bootstrapped results."); + mothurOutEndLine(); +} //*************************************************************************************************************** void ChimeraSlayer::print(ostream& out) { try { - mothurOutEndLine(); - mothurOut("Only reporting sequence supported by 90% of bootstrapped results."); - mothurOutEndLine(); - - for (int i = 0; i < querySeqs.size(); i++) { - - if (chimeraFlags[i] == "yes") { -// cout << i << endl; - if ((chimeraResults[i][0].bsa >= 90.0) || (chimeraResults[i][0].bsb >= 90.0)) { - mothurOut(querySeqs[i]->getName() + "\tyes"); mothurOutEndLine(); - out << querySeqs[i]->getName() << "\tyes" << endl; - }else { - out << querySeqs[i]->getName() << "\tyes" << endl; - //mothurOut(querySeqs[i]->getName() + "\tno"); mothurOutEndLine(); + if (chimeraFlags == "yes") { + string chimeraFlag = "no"; + if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR) + || + (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; } + + + if (chimeraFlag == "yes") { + if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) { + mothurOut(querySeq->getName() + "\tyes"); mothurOutEndLine(); } - - printBlock(chimeraResults[i][0], out, i); - - out << endl; - }else{ - out << querySeqs[i]->getName() << "\tno" << endl; - //mothurOut(querySeqs[i]->getName() + "\tno"); mothurOutEndLine(); } - } - + out << querySeq->getName() << "\tyes" << endl; + printBlock(chimeraResults[0], out); + out << endl; + }else { out << querySeq->getName() << "\tno" << endl; } + } catch(exception& e) { errorOut(e, "ChimeraSlayer", "print"); exit(1); } } - //*************************************************************************************************************** -int ChimeraSlayer::getChimeras() { +int ChimeraSlayer::getChimeras(Sequence* query) { try { + chimeraFlags = "no"; + querySeq = query; - //read in query sequences and subject sequences - mothurOut("Reading sequences and template file... "); cout.flush(); - querySeqs = readSeqs(fastafile); - templateSeqs = readSeqs(templateFile); - mothurOut("Done."); mothurOutEndLine(); - - int numSeqs = querySeqs.size(); - - if (unaligned) { mothurOut("Your sequences need to be aligned when you use the chimeraslayer method."); mothurOutEndLine(); return 1; } - - chimeraResults.resize(numSeqs); - chimeraFlags.resize(numSeqs, "no"); - spotMap.resize(numSeqs); - - //break up file if needed - int linesPerProcess = numSeqs / processors ; - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - //find breakup of sequences for all times we will Parallelize - if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } - else { - //fill line pairs - for (int i = 0; i < (processors-1); i++) { - lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); - } - //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end - int i = processors - 1; - lines.push_back(new linePair((i*linesPerProcess), numSeqs)); - } - #else - lines.push_back(new linePair(0, numSeqs)); - #endif - - if (seqMask != "") { decalc = new DeCalculator(); } //to use below - - //initialize spotMap - for (int j = 0; j < numSeqs; j++) { - for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) { - spotMap[j][i] = i; - } + for (int i = 0; i < query->getAligned().length(); i++) { + spotMap[i] = i; } //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity - maligner = new Maligner(templateSeqs, numWanted, match, misMatch, 1.01, minSim); - slayer = new Slayer(window, increment, minSim, divR, iters); - - for (int i = 0; i < querySeqs.size(); i++) { + maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod); + slayer = new Slayer(window, increment, minSim, divR, iters, minSNP); - string chimeraFlag = maligner->getResults(querySeqs[i]); - //float percentIdentical = maligner->getPercentID(); - vector Results = maligner->getOutput(); + string chimeraFlag = maligner->getResults(query, decalc); + vector Results = maligner->getOutput(); + + //realign query to parents to improve slayers detection rate + //ChimeraReAligner realigner(templateSeqs, match, misMatch); + //realigner.reAlign(query, Results); - cout << "Processing sequence: " << i+1 << endl; + //if (chimeraFlag == "yes") { - //for (int j = 0; j < Results.size(); j++) { - //cout << "regionStart = " << Results[j].regionStart << "\tRegionEnd = " << Results[j].regionEnd << "\tName = " << Results[j].parent << "\tPerQP = " << Results[j].queryToParent << "\tLocalPerQP = " << Results[j].queryToParentLocal << "\tdivR = " << Results[j].divR << endl; - //} + //get sequence that were given from maligner results + vector seqs; + map removeDups; + map::iterator itDup; + for (int j = 0; j < Results.size(); j++) { + float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal; + //only add if you are not a duplicate + itDup = removeDups.find(Results[j].parent); + if (itDup == removeDups.end()) { //this is not duplicate + removeDups[Results[j].parent] = dist; + }else if (dist > itDup->second) { //is this a stronger number for this parent + removeDups[Results[j].parent] = dist; + } + } + + for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) { + Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template - //if (chimeraFlag == "yes") { + SeqDist member; + member.seq = seq; + member.dist = itDup->second; - //get sequence that were given from maligner results - vector seqs; - map removeDups; - map::iterator itDup; - for (int j = 0; j < Results.size(); j++) { - float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal; - //only add if you are not a duplicate - itDup = removeDups.find(Results[j].parent); - if (itDup == removeDups.end()) { //this is not duplicate - removeDups[Results[j].parent] = dist; - }else if (dist > itDup->second) { //is this a stronger number for this parent - removeDups[Results[j].parent] = dist; - } - } - - for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) { - Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template - - SeqDist member; - member.seq = seq; - member.dist = itDup->second; - - seqs.push_back(member); - } - - //limit number of parents to explore - default 5 - if (Results.size() > parents) { - //sort by distance - sort(seqs.begin(), seqs.end(), compareSeqDist); - //prioritize larger more similiar sequence fragments - reverse(seqs.begin(), seqs.end()); - - for (int k = seqs.size()-1; k > (parents-1); k--) { - delete seqs[k].seq; - seqs.pop_back(); - } - } + seqs.push_back(member); + } + + //limit number of parents to explore - default 3 + if (Results.size() > parents) { + //sort by distance + sort(seqs.begin(), seqs.end(), compareSeqDist); + //prioritize larger more similiar sequence fragments + reverse(seqs.begin(), seqs.end()); + + for (int k = seqs.size()-1; k > (parents-1); k--) { + delete seqs[k].seq; + seqs.pop_back(); + } + } - //put seqs into vector to send to slayer - vector seqsForSlayer; - for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); } + //put seqs into vector to send to slayer + vector seqsForSlayer; + for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); } + //cout << i+1 << "num parents = " << seqsForSlayer.size() << '\t' << chimeraFlag << endl; //ofstream out; //string name = querySeqs[i]->getName(); //cout << name << endl; @@ -172,50 +120,42 @@ int ChimeraSlayer::getChimeras() { //cout << name << endl; //name = name.substr(0, name.find_last_of("|")); //cout << name << endl; -//string filename = name + ".seqsforslayer"; +//string filename = toString(i+1) + ".seqsforslayer"; //openOutputFile(filename, out); -//for (int u = 0; u < seqsForSlayer.size(); u++) { seqsForSlayer[u]->printSequence(out); } +//cout << querySeqs[i]->getName() << endl; +//for (int u = 0; u < seqsForSlayer.size(); u++) { cout << seqsForSlayer[u]->getName() << '\t'; seqsForSlayer[u]->printSequence(out); } +//cout << endl; //out.close(); -//filename = name + ".fasta"; +//filename = toString(i+1) + ".fasta"; //openOutputFile(filename, out); -//q->printSequence(out); +//querySeqs[i]->printSequence(out); //out.close(); - - //mask then send to slayer... - if (seqMask != "") { - decalc->setMask(seqMask); - - //mask querys - decalc->runMask(querySeqs[i]); - - //mask parents - for (int k = 0; k < seqsForSlayer.size(); k++) { - decalc->runMask(seqsForSlayer[k]); - } - - for (int i = 0; i < numSeqs; i++) { - spotMap[i] = decalc->getMaskMap(); - } - } - - //send to slayer - chimeraFlags[i] = slayer->getResults(querySeqs[i], seqsForSlayer); - chimeraResults[i] = slayer->getOutput(); - - //free memory - for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } - //} - } - //free memory - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } - + //mask then send to slayer... if (seqMask != "") { - delete decalc; + decalc->setMask(seqMask); + + //mask querys + decalc->runMask(query); + + //mask parents + for (int k = 0; k < seqsForSlayer.size(); k++) { + decalc->runMask(seqsForSlayer[k]); + } + + spotMap = decalc->getMaskMap(); } + //send to slayer + chimeraFlags = slayer->getResults(query, seqsForSlayer); + chimeraResults = slayer->getOutput(); + + //free memory + for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } + //} + return 0; } catch(exception& e) { @@ -224,37 +164,12 @@ int ChimeraSlayer::getChimeras() { } } //*************************************************************************************************************** -Sequence* ChimeraSlayer::getSequence(string name) { - try{ - Sequence* temp; - - //look through templateSeqs til you find it - int spot = -1; - for (int i = 0; i < templateSeqs.size(); i++) { - if (name == templateSeqs[i]->getName()) { - spot = i; - break; - } - } - - if(spot == -1) { mothurOut("Error: Could not find sequence in chimeraSlayer."); mothurOutEndLine(); return NULL; } - - temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned()); - - return temp; - } - catch(exception& e) { - errorOut(e, "ChimeraSlayer", "getSequence"); - exit(1); - } -} -//*************************************************************************************************************** -void ChimeraSlayer::printBlock(data_struct data, ostream& out, int i){ +void ChimeraSlayer::printBlock(data_struct data, ostream& out){ try { out << "parentA: " << data.parentA.getName() << " parentB: " << data.parentB.getName() << endl; - out << "Left Window: " << spotMap[i][data.winLStart] << " " << spotMap[i][data.winLEnd] << endl; - out << "Right Window: " << spotMap[i][data.winRStart] << " " << spotMap[i][data.winREnd] << endl; + out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl; + out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl; out << "Divergence of Query to Leftside ParentA and Rightside ParentB: " << data.divr_qla_qrb << '\t' << "Bootstrap: " << data.bsa << endl; out << "Divergence of Query to Rightside ParentA and Leftside ParentB: " << data.divr_qlb_qra << '\t' << "Bootstrap: " << data.bsb << endl; @@ -263,6 +178,8 @@ void ChimeraSlayer::printBlock(data_struct data, ostream& out, int i){ out << "Similarity of query to parentA: " << data.qa << endl; out << "Similarity of query to parentB: " << data.qb << endl; + out << "Percent_ID QLA_QRB: " << data.qla_qrb << endl; + out << "Percent_ID QLB_QRA: " << data.qlb_qra << endl; //out << "Per_id(QL,A): " << data.qla << endl; //out << "Per_id(QL,B): " << data.qlb << endl; //out << "Per_id(QR,A): " << data.qra << endl; diff --git a/chimeraslayer.h b/chimeraslayer.h index 5343ec2..4095b86 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -23,32 +23,25 @@ class ChimeraSlayer : public Chimera { public: - ChimeraSlayer(string, string); + ChimeraSlayer(string); ~ChimeraSlayer(); - int getChimeras(); + int getChimeras(Sequence*); void print(ostream&); - - void setCons(string){}; - void setQuantiles(string q) {}; - + void printHeader(ostream&); private: + Sequence* querySeq; DeCalculator* decalc; Maligner* maligner; Slayer* slayer; - vector lines; - vector querySeqs; - vector templateSeqs; - vector< map > spotMap; + map spotMap; - vector< vector > chimeraResults; - vector chimeraFlags; - - string fastafile, templateFile; - - Sequence* getSequence(string); //find sequence from name - void printBlock(data_struct, ostream&, int i); + vector chimeraResults; + string chimeraFlags, searchMethod; + string fastafile; + + void printBlock(data_struct, ostream&); }; /************************************************************************/ diff --git a/commandfactory.cpp b/commandfactory.cpp index c787d07..9648474 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -137,6 +137,7 @@ CommandFactory::CommandFactory(){ commands["pcoa"] = "pcoa"; commands["otu.hierarchy"] = "otu.hierarchy"; commands["set.dir"] = "set.dir"; + commands["merge.files"] = "merge.files"; } /***********************************************************/ diff --git a/database.hpp b/database.hpp index f52cdd5..7905077 100644 --- a/database.hpp +++ b/database.hpp @@ -48,6 +48,7 @@ public: virtual void generateDB() = 0; virtual void addSequence(Sequence) = 0; //add sequence to search engine virtual vector findClosestSequences(Sequence*, int) = 0; // returns indexes of n closest sequences to query + virtual map findClosest(Sequence*, int){ return results; } // returns of n closest sequences to query and their search scores virtual float getSearchScore(); virtual int getLongestBase(); virtual void readKmerDB(ifstream&){}; @@ -59,6 +60,7 @@ public: protected: int numSeqs, longest; float searchScore; + map results; }; /**************************************************************************************************/ #endif diff --git a/decalc.cpp b/decalc.cpp index 177636c..911efe2 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -105,7 +105,7 @@ vector DeCalculator::findWindows(Sequence* query, int front, int back, int if (size == 0) { if (cutoff > 1200) { size = 300; } else{ size = (cutoff / 4); } } else if (size > (cutoff / 4)) { - mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine(); + mothurOut("You have selected too large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine(); size = (cutoff / 4); } @@ -679,6 +679,7 @@ float DeCalculator::getCoef(vector obs, vector qav) { } } //*************************************************************************************************************** +//gets closest matches to each end, since chimeras will most likely have different parents on each end vector DeCalculator::findClosest(Sequence* querySeq, vector db, int numWanted) { try { @@ -687,14 +688,29 @@ vector DeCalculator::findClosest(Sequence* querySeq, vectorgetAligned(); + string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence + string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence + Sequence queryLeft(querySeq->getName(), leftQuery); + Sequence queryRight(querySeq->getName(), rightQuery); + for(int j = 0; j < db.size(); j++){ - Sequence temp = *(db[j]); + string dbAligned = db[j]->getAligned(); + string leftDB = dbAligned.substr(0, (dbAligned.length() / 3)); //first 1/3 of the sequence + string rightDB = dbAligned.substr(((dbAligned.length() / 3)*2)); //last 1/3 of the sequence + + Sequence dbLeft(db[j]->getName(), leftDB); + Sequence dbRight(db[j]->getName(), rightDB); + + distcalculator->calcDist(queryLeft, dbLeft); + float distLeft = distcalculator->getDist(); + + distcalculator->calcDist(queryRight, dbRight); + float distRight = distcalculator->getDist(); - distcalculator->calcDist(query, temp); - float dist = distcalculator->getDist(); + float dist = min(distLeft, distRight); //keep smallest distance SeqDist subject; subject.seq = db[j]; @@ -720,7 +736,7 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector topMatches) { +map DeCalculator::trimSeqs(Sequence* query, vector topMatches) { try { int frontPos = 0; //should contain first position in all seqs that is not a gap character @@ -809,6 +825,13 @@ void DeCalculator::trimSeqs(Sequence* query, vector topMatches) { topMatches[i]->setAligned(newAligned); } + map trimmedPos; + + for (int i = 0; i < newAligned.length(); i++) { + trimmedPos[i] = i+frontPos; + } + + return trimmedPos; } catch(exception& e) { errorOut(e, "DeCalculator", "trimSequences"); diff --git a/decalc.h b/decalc.h index 5f0a820..0119936 100644 --- a/decalc.h +++ b/decalc.h @@ -45,7 +45,7 @@ class DeCalculator { void setAlignmentLength(int l) { alignLength = l; } void runMask(Sequence*); void trimSeqs(Sequence*, Sequence*, map&); - void trimSeqs(Sequence*, vector); + map trimSeqs(Sequence*, vector); void removeObviousOutliers(vector< vector >&, int); vector calcFreq(vector, string); vector findWindows(Sequence*, int, int, int&, int); diff --git a/engine.cpp b/engine.cpp index 593c2c3..18e1a59 100644 --- a/engine.cpp +++ b/engine.cpp @@ -86,19 +86,27 @@ string Engine::getCommand() { #ifdef USE_READLINE char* nextCommand = NULL; nextCommand = readline("mothur > "); - if(nextCommand != NULL) { add_history(nextCommand); } + + if(nextCommand != NULL) { add_history(nextCommand); } + else{ //^D causes null string and we want it to quit mothur + nextCommand = "quit"; + cout << nextCommand << endl; + } + mothurOutJustToLog("mothur > " + toString(nextCommand)); return nextCommand; #else string nextCommand = ""; mothurOut("mothur > "); getline(cin, nextCommand); + mothurOutJustToLog("mothur > " + toString(nextCommand)); return nextCommand; #endif #else string nextCommand = ""; mothurOut("mothur > "); getline(cin, nextCommand); + mothurOutJustToLog("mothur > " + toString(nextCommand)); return nextCommand; #endif diff --git a/globaldata.cpp b/globaldata.cpp index c287169..8f8d1ad 100644 --- a/globaldata.cpp +++ b/globaldata.cpp @@ -123,6 +123,7 @@ void GlobalData::newRead() { labels.clear(); Groups.clear(); allLines = 1; runParse = true; + names.clear(); } catch(exception& e) { errorOut(e, "GlobalData", "newRead"); diff --git a/maligner.cpp b/maligner.cpp index 26ac563..de6004d 100644 --- a/maligner.cpp +++ b/maligner.cpp @@ -8,12 +8,14 @@ */ #include "maligner.h" +#include "database.hpp" +#include "blastdb.hpp" /***********************************************************************/ -Maligner::Maligner(vector temp, int num, int match, int misMatch, float div, int minCov) : - db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minCoverage(minCov) {} +Maligner::Maligner(vector temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) : + db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {} /***********************************************************************/ -string Maligner::getResults(Sequence* q) { +string Maligner::getResults(Sequence* q, DeCalculator* decalc) { try { outputResults.clear(); @@ -22,14 +24,36 @@ string Maligner::getResults(Sequence* q) { query = new Sequence(q->getName(), q->getAligned()); string chimera; - - decalc = new DeCalculator(); - //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate - refSeqs = decalc->findClosest(query, db, numWanted); - - refSeqs = minCoverageFilter(refSeqs); + if (searchMethod != "blast") { + //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate + refSeqs = decalc->findClosest(query, db, numWanted); + }else{ + refSeqs = getBlastSeqs(query, numWanted); + } +//ofstream out; +//string name = toString(numi+1); +//cout << name << endl; +//name = name.substr(name.find_first_of("|")+1); +//cout << name << endl; +//name = name.substr(name.find_first_of("|")+1); +//cout << name << endl; +//name = name.substr(0, name.find_last_of("|")); +//cout << name << endl; +//string filename = name + ".seqsformaligner"; +//openOutputFile(filename, out); +//for (int u = 0; u < refSeqs.size(); u++) { refSeqs[u]->printSequence(out); } +//out.close(); +//filename = name + ".fasta"; +//openOutputFile(filename, out); +//query->printSequence(out); +//out.close(); + +//for (int i = 0; i < refSeqs.size(); i++) { cout << refSeqs[i]->getName() << endl; } +//cout << "before = " << refSeqs.size() << endl; + refSeqs = minCoverageFilter(refSeqs); +//cout << "after = " << refSeqs.size() << endl; if (refSeqs.size() < 2) { for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } percentIdenticalQueryChimera = 0.0; @@ -37,9 +61,30 @@ string Maligner::getResults(Sequence* q) { } int chimeraPenalty = computeChimeraPenalty(); - +//cout << chimeraPenalty << endl; + //fills outputResults + chimera = chimeraMaligner(chimeraPenalty, decalc); + + + //free memory + delete query; + for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } + + return chimera; + } + catch(exception& e) { + errorOut(e, "Maligner", "getResults"); + exit(1); + } +} +/***********************************************************************/ +string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { + try { + + string chimera; + //trims seqs to first non gap char in all seqs and last non gap char in all seqs - decalc->trimSeqs(query, refSeqs); + spotMap = decalc->trimSeqs(query, refSeqs); vector temp = refSeqs; temp.push_back(query); @@ -67,8 +112,6 @@ string Maligner::getResults(Sequence* q) { percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq); - delete decalc; - //save output results for (int i = 0; i < trace.size(); i++) { int regionStart = trace[i].col; @@ -78,6 +121,8 @@ string Maligner::getResults(Sequence* q) { results temp; temp.parent = refSeqs[seqIndex]->getName(); + temp.nastRegionStart = spotMap[regionStart]; + temp.nastRegionEnd = spotMap[regionEnd]; temp.regionStart = regionStart; temp.regionEnd = regionEnd; @@ -97,15 +142,11 @@ string Maligner::getResults(Sequence* q) { outputResults.push_back(temp); } - - //free memory - delete query; - for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } - + return chimera; } catch(exception& e) { - errorOut(e, "Maligner", "getResults"); + errorOut(e, "Maligner", "chimeraMaligner"); exit(1); } } @@ -193,21 +234,27 @@ void Maligner::verticalFilter(vector seqs) { if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; } } + map newMap; //for each sequence for (int i = 0; i < seqs.size(); i++) { string seqAligned = seqs[i]->getAligned(); string newAligned = ""; + int count = 0; for (int j = 0; j < seqAligned.length(); j++) { //if this spot is not a gap - if (filterString[j] == '1') { newAligned += seqAligned[j]; } + if (filterString[j] == '1') { + newAligned += seqAligned[j]; + newMap[count] = spotMap[j]; + count++; + } } seqs[i]->setAligned(newAligned); } - + spotMap = newMap; } catch(exception& e) { errorOut(e, "Maligner", "verticalFilter"); @@ -452,4 +499,78 @@ float Maligner::computePercentID(string queryAlign, string chimera) { } } //*************************************************************************************************************** +vector Maligner::getBlastSeqs(Sequence* q, int num) { + try { +cout << q->getName() << endl; + //generate blastdb + Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty); + for (int i = 0; i < db.size(); i++) { database->addSequence(*db[i]); } + database->generateDB(); + database->setNumSeqs(db.size()); + + //get parts of query + string queryAligned = q->getAligned(); + string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence + string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence + + Sequence* queryLeft = new Sequence(q->getName(), leftQuery); + Sequence* queryRight = new Sequence(q->getName(), rightQuery); + + map tempIndexesRight = database->findClosest(queryRight, num); + map tempIndexesLeft = database->findClosest(queryLeft, num); + + //merge results + vector mergedResults; + + map::iterator it; + map::iterator it2; + + //add in right guys merging common finds + for (it = tempIndexesRight.begin(); it != tempIndexesRight.end(); it++) { + it2 = tempIndexesLeft.find(it->first); + + if (it2 == tempIndexesLeft.end()) { //result only present in right + rank temp(it->first, it->second); + mergedResults.push_back(temp); + + }else { //result present in both save best score + float bestscore; + if (it->second > it2->second) { bestscore = it->second; } + else { bestscore = it2->second; } + + rank temp(it->first, bestscore); + mergedResults.push_back(temp); + + tempIndexesLeft.erase(it2); + } + } + + //add in unique left guys + for (it = tempIndexesLeft.begin(); it != tempIndexesLeft.end(); it++) { + rank temp(it->first, it->second); + mergedResults.push_back(temp); + } + + sort(mergedResults.begin(), mergedResults.end(), compareMembers); + + vector refResults; + for (int i = 0; i < numWanted; i++) { + Sequence* temp = new Sequence(db[mergedResults[i].num]->getName(), db[mergedResults[i].num]->getAligned()); + refResults.push_back(temp); +cout << db[mergedResults[i].num]->getName() << endl; + } + + delete queryRight; + delete queryLeft; + delete database; + + return refResults; + } + catch(exception& e) { + errorOut(e, "Maligner", "getBlastSeqs"); + exit(1); + } +} + +//*************************************************************************************************************** diff --git a/maligner.h b/maligner.h index 6277e73..abe18a3 100644 --- a/maligner.h +++ b/maligner.h @@ -10,53 +10,32 @@ */ #include "decalc.h" +#include "chimera.h" /***********************************************************************/ //This class was modeled after the chimeraMaligner written by the Broad Institute -/***********************************************************************/ -struct score_struct { - int prev; - int score; - int row; - int col; -}; -/***********************************************************************/ -struct trace_struct { - int col; - int oldCol; - int row; -}; -/***********************************************************************/ -struct results { - int regionStart; - int regionEnd; - string parent; - float queryToParent; - float queryToParentLocal; - float divR; -}; - /**********************************************************************/ class Maligner { public: - Maligner(vector, int, int, int, float, int); + Maligner(vector, int, int, int, float, int, int, string); ~Maligner() {}; - string getResults(Sequence*); + string getResults(Sequence*, DeCalculator*); float getPercentID() { return percentIdenticalQueryChimera; } vector getOutput() { return outputResults; } private: - DeCalculator* decalc; Sequence* query; vector refSeqs; vector db; - int numWanted, matchScore, misMatchPenalty, minCoverage; + int numWanted, matchScore, misMatchPenalty, minCoverage, minSimilarity; + string searchMethod; float minDivR, percentIdenticalQueryChimera; vector outputResults; + map spotMap; vector minCoverageFilter(vector); //removes top matches that do not have minimum coverage with query. int computeChimeraPenalty(); @@ -68,6 +47,8 @@ class Maligner { vector mapTraceRegionsToAlignment(vector, vector); string constructChimericSeq(vector, vector); float computePercentID(string, string); + string chimeraMaligner(int, DeCalculator*); + vector getBlastSeqs(Sequence*, int); }; diff --git a/mergefilecommand.cpp b/mergefilecommand.cpp index d597339..0a8c0d8 100644 --- a/mergefilecommand.cpp +++ b/mergefilecommand.cpp @@ -96,7 +96,11 @@ int MergeFileCommand::execute(){ openInputFile(fileNames[i], inputFile); - while(!inputFile.eof()){ c = inputFile.get(); outputFile << c; } + while(!inputFile.eof()){ + c = inputFile.get(); + //-1 is eof char + if (int(c) != -1) { outputFile << c; } + } inputFile.close(); } diff --git a/mothur.cpp b/mothur.cpp index b71057a..8203c57 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -41,7 +41,11 @@ int main(int argc, char *argv[]){ mothurOutJustToLog("Windows version"); mothurOutEndLine(); mothurOutEndLine(); #endif - + + #ifdef USE_READLINE + mothurOutJustToLog("Using ReadLine"); + mothurOutEndLine(); mothurOutEndLine(); + #endif //header mothurOut("mothur v.1.8"); diff --git a/mothur.h b/mothur.h index 8190152..5f8a2fe 100644 --- a/mothur.h +++ b/mothur.h @@ -831,6 +831,21 @@ inline bool inUsersGroups(string groupname, vector Groups) { exit(1); } } +/**************************************************************************************************/ +//returns true if any of the strings in first vector are in second vector +inline bool inUsersGroups(vector groupnames, vector Groups) { + try { + + for (int i = 0; i < groupnames.size(); i++) { + if (inUsersGroups(groupnames[i], Groups)) { return true; } + } + return false; + } + catch(exception& e) { + errorOut(e, "mothur", "inUsersGroups"); + exit(1); + } +} /**************************************************************************************************/ diff --git a/nast.cpp b/nast.cpp index 6a4ae3d..c91c9fa 100644 --- a/nast.cpp +++ b/nast.cpp @@ -26,7 +26,6 @@ Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method maxInsertLength = 0; pairwiseAlignSeqs(); // This is part A in Fig. 2 of DeSantis et al. regapSequences(); // This is parts B-F in Fig. 2 of DeSantis et al. - } catch(exception& e) { errorOut(e, "Nast", "Nast"); @@ -41,7 +40,7 @@ void Nast::pairwiseAlignSeqs(){ // Here we call one of the pairwise alignment me try { alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned()); - + string candAln = alignment->getSeqAAln(); string tempAln = alignment->getSeqBAln(); diff --git a/otuhierarchycommand.cpp b/otuhierarchycommand.cpp index bc87211..c39a638 100644 --- a/otuhierarchycommand.cpp +++ b/otuhierarchycommand.cpp @@ -18,7 +18,7 @@ OtuHierarchyCommand::OtuHierarchyCommand(string option){ else { //valid paramters for this command - string Array[] = {"list","label","outputdir","inputdir"}; + string Array[] = {"list","label","output","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -63,7 +63,11 @@ OtuHierarchyCommand::OtuHierarchyCommand(string option){ else { splitAtDash(label, labels); if (labels.size() != 2) { mothurOut("You must provide 2 labels."); mothurOutEndLine(); abort = true; } - } + } + + output = validParameter.validFile(parameters, "output", false); if (output == "not found") { output = "name"; } + + if ((output != "name") && (output != "number")) { mothurOut("output options are name and number. I will use name."); mothurOutEndLine(); output = "name"; } } } @@ -77,7 +81,8 @@ OtuHierarchyCommand::OtuHierarchyCommand(string option){ void OtuHierarchyCommand::help(){ try { mothurOut("The otu.hierarchy command is used to see how otus relate at two distances. \n"); - mothurOut("The otu.hierarchy command parameters are list and label. Both parameters are required. \n"); + mothurOut("The otu.hierarchy command parameters are list, label and output. list and label parameters are required. \n"); + mothurOut("The output parameter allows you to output the names of the sequence in the OTUs or the OTU numbers. Options are name and number, default is name. \n"); mothurOut("The otu.hierarchy command should be in the following format: \n"); mothurOut("otu.hierarchy(list=yourListFile, label=yourLabels).\n"); mothurOut("Example otu.hierarchy(list=amazon.fn.list, label=0.01-0.03).\n"); @@ -140,7 +145,8 @@ int OtuHierarchyCommand::execute(){ string names = lists[1].get(i); //output column 1 - out << names << '\t'; + if (output == "name") { out << names << '\t'; } + else { out << i << '\t'; } map bins; //bin numbers in little that are in this bin in big map::iterator it; @@ -157,7 +163,8 @@ int OtuHierarchyCommand::execute(){ string col2 = ""; for (it = bins.begin(); it != bins.end(); it++) { - col2 += lists[0].get(it->first) + "\t"; + if (output == "name") { col2 += lists[0].get(it->first) + "\t"; } + else { col2 += toString(it->first) + "\t"; } } //output column 2 diff --git a/otuhierarchycommand.h b/otuhierarchycommand.h index c6749ef..1cd427f 100644 --- a/otuhierarchycommand.h +++ b/otuhierarchycommand.h @@ -25,7 +25,7 @@ public: private: bool abort; set labels; //holds labels to be used - string label, listFile, outputDir; + string label, listFile, outputDir, output; vector getListVectors(); diff --git a/pintail.cpp b/pintail.cpp index 6adfdf9..74e1ce4 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -18,106 +18,39 @@ inline bool compareQuanMembers(quanMember left, quanMember right){ } //*************************************************************************************************************** -Pintail::Pintail(string filename, string temp, string o) { fastafile = filename; templateFile = temp; outputDir = o; } +Pintail::Pintail(string filename, string o) { + fastafile = filename; outputDir = o; + distcalculator = new eachGapDist(); + decalc = new DeCalculator(); +} //*************************************************************************************************************** Pintail::~Pintail() { try { - for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } - for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } - for (int i = 0; i < bestfit.size(); i++) { delete bestfit[i]; } - } - catch(exception& e) { - errorOut(e, "Pintail", "~Pintail"); - exit(1); - } -} -//*************************************************************************************************************** -void Pintail::print(ostream& out) { - try { - - mothurOutEndLine(); - for (int i = 0; i < querySeqs.size(); i++) { - - int index = ceil(deviation[i]); - - //is your DE value higher than the 95% - string chimera; - if (quantiles[index][4] == 0.0) { - chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index); - }else { - if (DE[i] > quantiles[index][4]) { chimera = "Yes"; } - else { chimera = "No"; } - } - - out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl; - if (chimera == "Yes") { - mothurOut(querySeqs[i]->getName() + "\tdiv: " + toString(deviation[i]) + "\tstDev: " + toString(DE[i]) + "\tchimera flag: " + chimera); mothurOutEndLine(); - } - out << "Observed\t"; - - for (int j = 0; j < obsDistance[i].size(); j++) { out << obsDistance[i][j] << '\t'; } - out << endl; - - out << "Expected\t"; - - for (int m = 0; m < expectedDistance[i].size(); m++) { out << expectedDistance[i][m] << '\t'; } - out << endl; - - } + delete distcalculator; + delete decalc; } catch(exception& e) { - errorOut(e, "Pintail", "print"); + errorOut(e, "Pintail", "~Pintail"); exit(1); } } - //*************************************************************************************************************** -int Pintail::getChimeras() { +void Pintail::doPrep() { try { - - //read in query sequences and subject sequences - mothurOut("Reading sequences and template file... "); cout.flush(); - querySeqs = readSeqs(fastafile); - templateSeqs = readSeqs(templateFile); - mothurOut("Done."); mothurOutEndLine(); - - int numSeqs = querySeqs.size(); - - if (unaligned) { mothurOut("Your sequences need to be aligned when you use the pintail method."); mothurOutEndLine(); return 1; } - - obsDistance.resize(numSeqs); - expectedDistance.resize(numSeqs); - seqCoef.resize(numSeqs); - DE.resize(numSeqs); - Qav.resize(numSeqs); - bestfit.resize(numSeqs); - deviation.resize(numSeqs); - trimmed.resize(numSeqs); - windowSizes.resize(numSeqs, window); + windowSizesTemplate.resize(templateSeqs.size(), window); - windowsForeachQuery.resize(numSeqs); - h.resize(numSeqs); quantiles.resize(100); //one for every percent mismatch quantilesMembers.resize(100); //one for every percent mismatch - //break up file if needed - int linesPerProcess = numSeqs / processors ; + //if the user does not enter a mask then you want to keep all the spots in the alignment + if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); } + else { decalc->setAlignmentLength(seqMask.length()); } + + decalc->setMask(seqMask); #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - //find breakup of sequences for all times we will Parallelize - if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } - else { - //fill line pairs - for (int i = 0; i < (processors-1); i++) { - lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); - } - //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end - int i = processors - 1; - lines.push_back(new linePair((i*linesPerProcess), numSeqs)); - } - //find breakup of templatefile for quantiles if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); } else { @@ -128,149 +61,38 @@ int Pintail::getChimeras() { } } #else - lines.push_back(new linePair(0, numSeqs)); templateLines.push_back(new linePair(0, templateSeqs.size())); #endif - - distcalculator = new eachGapDist(); - decalc = new DeCalculator(); - - //if the user does enter a mask then you want to keep all the spots in the alignment - if (seqMask.length() == 0) { decalc->setAlignmentLength(querySeqs[0]->getAligned().length()); } - else { decalc->setAlignmentLength(seqMask.length()); } - - decalc->setMask(seqMask); - - //find pairs - if (processors == 1) { - mothurOut("Finding closest sequence in template to each sequence... "); cout.flush(); - bestfit = findPairs(lines[0]->start, lines[0]->end); - mothurOut("Done."); mothurOutEndLine(); - }else { createProcessesPairs(); } - -//string o = "closestmatch.eachgap.fasta"; -//ofstream out7; -//openOutputFile(o, out7); -//for (int i = 0; i < bestfit.size(); i++) { - //out7 << ">" << querySeqs[i]->getName() << "-"<< bestfit[i]->getName() << endl; - //out7 << bestfit[i]->getAligned() << endl; -//} -//out7.close(); - //find P + mothurOut("Getting conservation... "); cout.flush(); if (consfile == "") { mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush(); - probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFile)); + probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName)); mothurOut("Done."); mothurOutEndLine(); - }else { probabilityProfile = readFreq(); } - - //make P into Q - for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl; - mothurOut("Done."); mothurOutEndLine(); - - //mask sequences if the user wants to - if (seqMask != "") { - //mask querys - for (int i = 0; i < querySeqs.size(); i++) { - decalc->runMask(querySeqs[i]); - } - - //mask templates - for (int i = 0; i < templateSeqs.size(); i++) { - decalc->runMask(templateSeqs[i]); - } - - for (int i = 0; i < bestfit.size(); i++) { - decalc->runMask(bestfit[i]); - } - - } - - if (filter) { - vector temp = templateSeqs; - for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); } - - createFilter(temp); - - runFilter(querySeqs); - runFilter(templateSeqs); - runFilter(bestfit); - } - - - if (processors == 1) { - - for (int j = 0; j < bestfit.size(); j++) { - decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]); - } - - mothurOut("Finding window breaks... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - it = trimmed[i].begin(); - vector win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment); - windowsForeachQuery[i] = win; - } - mothurOut("Done."); mothurOutEndLine(); - - }else { createProcessesSpots(); } - - if (processors == 1) { - - mothurOut("Calculating observed distance... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]); + }else { probabilityProfile = readFreq(); mothurOut("Done."); } + mothurOutEndLine(); - obsDistance[i] = obsi; - } - mothurOut("Done."); mothurOutEndLine(); - - - mothurOut("Finding variability... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile); - - Qav[i] = q; - } - mothurOut("Done."); mothurOutEndLine(); - - - mothurOut("Calculating alpha... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - float alpha = decalc->getCoef(obsDistance[i], Qav[i]); - seqCoef[i] = alpha; - } - mothurOut("Done."); mothurOutEndLine(); - - - mothurOut("Calculating expected distance... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector exp = decalc->calcExpected(Qav[i], seqCoef[i]); - expectedDistance[i] = exp; - } - mothurOut("Done."); mothurOutEndLine(); - - - mothurOut("Finding deviation... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - float de = decalc->calcDE(obsDistance[i], expectedDistance[i]); - DE[i] = de; - - it = trimmed[i].begin(); - float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); - deviation[i] = dist; - } - mothurOut("Done."); mothurOutEndLine(); - - } - else { createProcesses(); } - //quantiles are used to determine whether the de values found indicate a chimera //if you have to calculate them, its time intensive because you are finding the de and deviation values for each //combination of sequences in the template if (quanfile != "") { quantiles = readQuantiles(); } else { + //if user has not provided the quantiles and wants the mask we need to mask, but then delete and reread or we will mess up the best match process in getChimeras + if (seqMask != "") { + //mask templates + for (int i = 0; i < templateSeqs.size(); i++) { + decalc->runMask(templateSeqs[i]); + } + } + + if (filter) { + createFilter(templateSeqs); + for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); } + } + + mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); if (processors == 1) { quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); @@ -281,58 +103,16 @@ int Pintail::getChimeras() { string noOutliers, outliers; if ((!filter) && (seqMask == "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFile)) + "pintail.quan"; + noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan"; }else if ((filter) && (seqMask == "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFile)) + "pintail.filtered.quan"; + noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.quan"; }else if ((!filter) && (seqMask != "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFile)) + "pintail.masked.quan"; + noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan"; }else if ((filter) && (seqMask != "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFile)) + "pintail.filtered.masked.quan"; + noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.masked.quan"; } - //outliers = getRootName(templateFile) + "pintail.quanYESOUTLIERS"; - - /*openOutputFile(outliers, out4); - - //adjust quantiles - for (int i = 0; i < quantilesMembers.size(); i++) { - vector temp; - - if (quantilesMembers[i].size() == 0) { - //in case this is not a distance found in your template files - for (int g = 0; g < 6; g++) { - temp.push_back(0.0); - } - }else{ - - sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers); - - //save 10% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score); - //save 25% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score); - //save 50% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score); - //save 75% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score); - //save 95% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score); - //save 99% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score); - - } - - //output quan value - out4 << i+1 << '\t'; - for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; } - out4 << endl; - - quantiles[i] = temp; - - } - - out4.close();*/ - + decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); openOutputFile(noOutliers, out5); @@ -376,15 +156,112 @@ int Pintail::getChimeras() { mothurOut("Done."); mothurOutEndLine(); - } + //if mask reread template + if ((seqMask != "") || (filter)) { + for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } + templateSeqs = readSeqs(templateFileName); + } + } + //free memory - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } - for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } - - delete distcalculator; - delete decalc; + for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } + + } + catch(exception& e) { + errorOut(e, "Pintail", "doPrep"); + exit(1); + } +} +//*************************************************************************************************************** +void Pintail::print(ostream& out) { + try { + int index = ceil(deviation); + + //is your DE value higher than the 95% + string chimera; + if (index != 0) { //if index is 0 then its an exact match to a template seq + if (quantiles[index][4] == 0.0) { + chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index); + }else { + if (DE > quantiles[index][4]) { chimera = "Yes"; } + else { chimera = "No"; } + } + }else{ chimera = "No"; } + + out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl; + if (chimera == "Yes") { + mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine(); + } + out << "Observed\t"; + + for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; } + out << endl; + + out << "Expected\t"; + + for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; } + out << endl; + + + } + catch(exception& e) { + errorOut(e, "Pintail", "print"); + exit(1); + } +} + +//*************************************************************************************************************** +int Pintail::getChimeras(Sequence* query) { + try { + querySeq = query; + trimmed.clear(); + windowSizes = window; + + //find pairs + bestfit = findPairs(query); + + //make P into Q + for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl; + + //mask sequences if the user wants to + if (seqMask != "") { + decalc->runMask(query); + decalc->runMask(bestfit); + } + + if (filter) { + runFilter(query); + runFilter(bestfit); + } + + //trim seq + decalc->trimSeqs(query, bestfit, trimmed); + + //find windows + it = trimmed.begin(); + windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment); + + //find observed distance + obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes); + + Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile); + + //find alpha + seqCoef = decalc->getCoef(obsDistance, Qav); + + //calculating expected distance + expectedDistance = decalc->calcExpected(Qav, seqCoef); + + //finding de + DE = decalc->calcDE(obsDistance, expectedDistance); + + //find distance between query and closest match + it = trimmed.begin(); + deviation = decalc->calcDist(query, bestfit, it->first, it->second); + delete bestfit; + return 0; } catch(exception& e) { @@ -437,16 +314,13 @@ vector Pintail::readFreq() { //*************************************************************************************************************** //calculate the distances from each query sequence to all sequences in the template to find the closest sequence -vector Pintail::findPairs(int start, int end) { +Sequence* Pintail::findPairs(Sequence* q) { try { - vector seqsMatches; + Sequence* seqsMatches; - for(int i = start; i < end; i++){ - - vector copy = decalc->findClosest(querySeqs[i], templateSeqs, 1); - seqsMatches.push_back(copy[0]); - } + vector copy = decalc->findClosest(q, templateSeqs, 1); + seqsMatches = copy[0]; return seqsMatches; @@ -456,458 +330,7 @@ vector Pintail::findPairs(int start, int end) { exit(1); } } - -/**************************************************************************************************/ - -void Pintail::createProcessesSpots() { - try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - for (int j = lines[process]->start; j < lines[process]->end; j++) { - - //chops off beginning and end of sequences so they both start and end with a base - map trim; - - decalc->trimSeqs(querySeqs[j], bestfit[j], trim); - trimmed[j] = trim; - - } - - mothurOut("Finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - for (int i = lines[process]->start; i < lines[process]->end; i++) { - it = trimmed[i].begin(); - windowsForeachQuery[i] = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment); - } - mothurOut("Done finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - //output windowsForeachQuery - for (int i = lines[process]->start; i < lines[process]->end; i++) { - out << windowsForeachQuery[i].size() << '\t'; - for (int j = 0; j < windowsForeachQuery[i].size(); j++) { - out << windowsForeachQuery[i][j] << '\t'; - } - out << endl; - } - - //output windowSizes - for (int i = lines[process]->start; i < lines[process]->end; i++) { - out << windowSizes[i] << '\t'; - } - out << endl; - - //output trimmed values - for (int i = lines[process]->start; i < lines[process]->end; i++) { - it = trimmed[i].begin(); - out << it->first << '\t' << it->second << endl; - } - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;iend - lines[i]->start; - - int count = lines[i]->start; - for (int m = 0; m < size; m++) { - int num; - in >> num; - - vector win; int w; - for (int j = 0; j < num; j++) { - in >> w; - win.push_back(w); - } - - windowsForeachQuery[count] = win; - count++; - gobble(in); - } - - gobble(in); - count = lines[i]->start; - for (int m = 0; m < size; m++) { - int num; - in >> num; - - windowSizes[count] = num; - count++; - } - - gobble(in); - - count = lines[i]->start; - for (int m = 0; m < size; m++) { - int front, back; - in >> front >> back; - - map t; - - t[front] = back; - - trimmed[count] = t; - count++; - - gobble(in); - } - - - in.close(); - remove(s.c_str()); - } - - -#else - for (int j = 0; j < bestfit.size(); j++) { - //chops off beginning and end of sequences so they both start and end with a base - decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]); - } - - for (int i = lines[0]->start; i < lines[0]->end; i++) { - it = trimmed[i].begin(); - vector win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment); - windowsForeachQuery[i] = win; - } - -#endif - } - catch(exception& e) { - errorOut(e, "Pintail", "createProcessesSpots"); - exit(1); - } -} /**************************************************************************************************/ - -void Pintail::createProcessesPairs() { - try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - mothurOut("Finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - bestfit = findPairs(lines[process]->start, lines[process]->end); - mothurOut("Done finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - //output range and size - out << bestfit.size() << endl; - - //output pairs - for (int i = 0; i < bestfit.size(); i++) { - out << ">" << bestfit[i]->getName() << endl << bestfit[i]->getAligned() << endl; - } - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;i> size; gobble(in); - - //get pairs - int count = lines[i]->start; - for (int m = 0; m < size; m++) { - Sequence* temp = new Sequence(in); - bestfit[count] = temp; - - count++; - gobble(in); - } - - in.close(); - remove(s.c_str()); - } - - -#else - bestfit = findPairs(lines[0]->start, lines[0]->end); -#endif - } - catch(exception& e) { - errorOut(e, "Pintail", "createProcessesPairs"); - exit(1); - } -} -/**************************************************************************************************/ - -void Pintail::createProcesses() { - try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - mothurOut("Calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - for (int i = lines[process]->start; i < lines[process]->end; i++) { - - vector obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]); - obsDistance[i] = obsi; - - //calc Qav - vector q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile); - - //get alpha - float alpha = decalc->getCoef(obsDistance[i], q); - - //find expected - vector exp = decalc->calcExpected(q, alpha); - expectedDistance[i] = exp; - - //get de and deviation - float dei = decalc->calcDE(obsi, exp); - DE[i] = dei; - - it = trimmed[i].begin(); - float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); - deviation[i] = dist; - } - mothurOut("Done calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - int size = lines[process]->end - lines[process]->start; - out << size << endl; - - //output observed distances - for (int i = lines[process]->start; i < lines[process]->end; i++) { - out << obsDistance[i].size() << '\t'; - for (int j = 0; j < obsDistance[i].size(); j++) { - out << obsDistance[i][j] << '\t'; - } - out << endl; - } - - - //output expected distances - for (int i = lines[process]->start; i < lines[process]->end; i++) { - out << expectedDistance[i].size() << '\t'; - for (int j = 0; j < expectedDistance[i].size(); j++) { - out << expectedDistance[i][j] << '\t'; - } - out << endl; - } - - - //output de values - for (int i = lines[process]->start; i < lines[process]->end; i++) { - out << DE[i] << '\t'; - } - out << endl; - - //output de values - for (int i = lines[process]->start; i < lines[process]->end; i++) { - out << deviation[i] << '\t'; - } - out << endl; - - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;i> size; gobble(in); - - //get observed distances - int count = lines[i]->start; - for (int m = 0; m < size; m++) { - int num; - in >> num; - - vector obs; float w; - for (int j = 0; j < num; j++) { - in >> w; - obs.push_back(w); - } - - obsDistance[count] = obs; - count++; - gobble(in); - } - - gobble(in); - - //get expected distances - count = lines[i]->start; - for (int m = 0; m < size; m++) { - int num; - in >> num; - - vector exp; float w; - for (int j = 0; j < num; j++) { - in >> w; - exp.push_back(w); - } - - expectedDistance[count] = exp; - count++; - gobble(in); - } - - gobble(in); - - count = lines[i]->start; - for (int m = 0; m < size; m++) { - float num; - in >> num; - - DE[count] = num; - count++; - } - - gobble(in); - - count = lines[i]->start; - for (int m = 0; m < size; m++) { - float num; - in >> num; - - deviation[count] = num; - count++; - } - - in.close(); - remove(s.c_str()); - } - - -#else - mothurOut("Calculating observed distance... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]); - obsDistance[i] = obsi; - } - mothurOut("Done."); mothurOutEndLine(); - - - - mothurOut("Finding variability... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile); - Qav[i] = q; - } - mothurOut("Done."); mothurOutEndLine(); - - - - mothurOut("Calculating alpha... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - float alpha = decalc->getCoef(obsDistance[i], Qav[i]); - seqCoef.push_back(alpha); - } - mothurOut("Done."); mothurOutEndLine(); - - - - mothurOut("Calculating expected distance... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - vector exp = decalc->calcExpected(Qav[i], seqCoef[i]); - expectedDistance[i] = exp; - } - mothurOut("Done."); mothurOutEndLine(); - - - - mothurOut("Finding deviation... "); cout.flush(); - for (int i = lines[0]->start; i < lines[0]->end; i++) { - float de = decalc->calcDE(obsDistance[i], expectedDistance[i]); - DE[i] = de; - - it = trimmed[i].begin(); - float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); - deviation[i] = dist; - } - mothurOut("Done."); mothurOutEndLine(); - -#endif - } - catch(exception& e) { - errorOut(e, "Pintail", "createProcesses"); - exit(1); - } -} - - -/**************************************************************************************************/ - void Pintail::createProcessesQuan() { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) diff --git a/pintail.h b/pintail.h index 26154b2..ed52c3a 100644 --- a/pintail.h +++ b/pintail.h @@ -24,10 +24,10 @@ class Pintail : public Chimera { public: - Pintail(string, string, string); + Pintail(string, string); ~Pintail(); - int getChimeras(); + int getChimeras(Sequence*); void print(ostream&); void setCons(string c) { consfile = c; } @@ -39,44 +39,39 @@ class Pintail : public Chimera { Dist* distcalculator; DeCalculator* decalc; int iters; - string fastafile, templateFile, consfile; + string fastafile, consfile; - - vector lines; vector templateLines; - vector querySeqs; - vector templateSeqs; - - vector bestfit; //bestfit[0] matches queryseqs[0]... + Sequence*querySeq; + + Sequence* bestfit; //closest match to query in template - vector< vector > obsDistance; //obsDistance[0] is the vector of observed distances for queryseqs[0]... - vector< vector > expectedDistance; //expectedDistance[0] is the vector of expected distances for queryseqs[0]... - vector deviation; //deviation[0] is the percentage of mismatched pairs over the whole seq between querySeqs[0] and its best match. - vector< vector > windowsForeachQuery; // windowsForeachQuery[0] is a vector containing the starting spot in queryseqs[0] aligned sequence for each window. + vector obsDistance; //obsDistance is the vector of observed distances for query + vector expectedDistance; //expectedDistance is the vector of expected distances for query + float deviation; //deviation is the percentage of mismatched pairs over the whole seq between query and its best match. + vector windowsForeachQuery; // windowsForeachQuery is a vector containing the starting spot in query aligned sequence for each window. //this is needed so you can move by bases and not just spots in the alignment - vector windowSizes; //windowSizes[0] = window size of querySeqs[0] + int windowSizes; //windowSizes = window size of query vector windowSizesTemplate; //windowSizesTemplate[0] = window size of templateSeqs[0] - vector< map > trimmed; //trimmed[0] = start and stop of trimmed sequences for querySeqs[0] + map trimmed; //trimmed = start and stop of trimmed sequences for query map::iterator it; - vector< vector > Qav; //Qav[0] is the vector of average variablility for queryseqs[0]... - vector seqCoef; //seqCoef[0] is the coeff for queryseqs[0]... - vector DE; //DE[0] is the deviaation for queryseqs[0]... + vector Qav; //Qav is the vector of average variablility for query + float seqCoef; //seqCoef is the coeff for query + float DE; //DE is the deviaation for query vector probabilityProfile; vector< vector > quantiles; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2... vector< vector > quantilesMembers; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2... - vector< set > h; + set h; vector readFreq(); - vector findPairs(int, int); + Sequence* findPairs(Sequence*); - void createProcessesSpots(); - void createProcessesPairs(); - void createProcesses(); void createProcessesQuan(); + void doPrep(); }; diff --git a/readtree.cpp b/readtree.cpp index 25fee86..cdaa387 100644 --- a/readtree.cpp +++ b/readtree.cpp @@ -354,7 +354,9 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T) { group = "xxx"; } - T->tree[n1].setGroup(group); + vector tempGroup; tempGroup.push_back(group); + + T->tree[n1].setGroup(tempGroup); T->tree[n1].setChildren(-1,-1); if(blen == 1){ diff --git a/readtreecommand.cpp b/readtreecommand.cpp index 09f899b..a11465d 100644 --- a/readtreecommand.cpp +++ b/readtreecommand.cpp @@ -87,7 +87,7 @@ ReadTreeCommand::ReadTreeCommand(string option){ namefile = validParameter.validFile(parameters, "name", true); if (namefile == "not open") { abort = true; } - else if (namefile == "not found") { treefile = ""; } + else if (namefile == "not found") { namefile = ""; } else { readNamesFile(); } if (abort == false) { @@ -162,6 +162,7 @@ int ReadTreeCommand::execute(){ if (it == nameMap.end()) { mothurOut(treeMap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); mothurOutEndLine(); treeMap->removeSeq(treeMap->namesOfSeqs[i]); + i--; //need this because removeSeq removes name from namesOfSeqs } } } diff --git a/slayer.cpp b/slayer.cpp index 8e9eb56..66db22b 100644 --- a/slayer.cpp +++ b/slayer.cpp @@ -10,8 +10,8 @@ #include "slayer.h" /***********************************************************************/ -Slayer::Slayer(int win, int increment, int parentThreshold, float div, int i) : - windowSize(win), windowStep(increment), parentFragmentThreshold(parentThreshold), divRThreshold(div), iters(i){} +Slayer::Slayer(int win, int increment, int parentThreshold, float div, int i, int snp) : + windowSize(win), windowStep(increment), parentFragmentThreshold(parentThreshold), divRThreshold(div), iters(i), percentSNPSample(snp){} /***********************************************************************/ string Slayer::getResults(Sequence* query, vector refSeqs) { try { @@ -20,13 +20,14 @@ string Slayer::getResults(Sequence* query, vector refSeqs) { for (int i = 0; i < refSeqs.size(); i++) { for (int j = i+1; j < refSeqs.size(); j++) { - + //make copies of query and each parent because runBellerophon removes gaps and messes them up Sequence* q = new Sequence(query->getName(), query->getAligned()); Sequence* leftParent = new Sequence(refSeqs[i]->getName(), refSeqs[i]->getAligned()); Sequence* rightParent = new Sequence(refSeqs[j]->getName(), refSeqs[j]->getAligned()); - vector divs = runBellerophon(q, leftParent, rightParent); + map spots; //map from spot in original sequence to spot in filtered sequence for query and both parents + vector divs = runBellerophon(q, leftParent, rightParent, spots); vector selectedDivs; for (int k = 0; k < divs.size(); k++) { @@ -39,16 +40,17 @@ string Slayer::getResults(Sequence* query, vector refSeqs) { //require at least 3 SNPs on each side of the break if ((numSNPSLeft >= 3) && (numSNPSRight >= 3)) { + + //removed in 12/09 version of chimeraSlayer + //int winSizeLeft = divs[k].winLEnd - divs[k].winLStart + 1; + //int winSizeRight = divs[k].winREnd - divs[k].winRStart + 1; - int winSizeLeft = divs[k].winLEnd - divs[k].winLStart + 1; - int winSizeRight = divs[k].winREnd - divs[k].winRStart + 1; - - float snpRateLeft = numSNPSLeft / (float) winSizeLeft; - float snpRateRight = numSNPSRight / (float) winSizeRight; - float logR = log(snpRateLeft / snpRateRight) / log(2.0); + //float snpRateLeft = numSNPSLeft / (float) winSizeLeft; + //float snpRateRight = numSNPSRight / (float) winSizeRight; + //float logR = log(snpRateLeft / snpRateRight) / log(2.0); // do not accept excess snp ratio on either side of the break - if (abs(logR) < 1 ) { + //if (abs(logR) < 1 ) { float BS_A, BS_B; bootstrapSNPS(snpsLeft, snpsRight, BS_A, BS_B); @@ -59,9 +61,15 @@ string Slayer::getResults(Sequence* query, vector refSeqs) { divs[k].bsMax = max(BS_A, BS_B); divs[k].chimeraMax = max(divs[k].qla_qrb, divs[k].qlb_qra); + + //so results reflect orignal alignment + divs[k].winLStart = spots[divs[k].winLStart]; + divs[k].winLEnd = spots[divs[k].winLEnd]; + divs[k].winRStart = spots[divs[k].winRStart]; + divs[k].winREnd = spots[divs[k].winREnd]; selectedDivs.push_back(divs[k]); - } + //} } } @@ -74,7 +82,7 @@ string Slayer::getResults(Sequence* query, vector refSeqs) { } } - + // compute bootstrap support if (all.size() > 0) { //sort them @@ -95,22 +103,31 @@ string Slayer::getResults(Sequence* query, vector refSeqs) { } } /***********************************************************************/ -vector Slayer::runBellerophon(Sequence* q, Sequence* pA, Sequence* pB) { +vector Slayer::runBellerophon(Sequence* q, Sequence* pA, Sequence* pB, map& spots) { try{ vector data; - + //vertical filter vector temp; temp.push_back(q); temp.push_back(pA); temp.push_back(pB); - map spots = verticalFilter(temp); + + //maps spot in new alignment to spot in alignment before filter + spots = verticalFilter(temp); //fills baseSpots //get these to avoid numerous function calls string query = q->getAligned(); string parentA = pA->getAligned(); string parentB = pB->getAligned(); int length = query.length(); - +//cout << q->getName() << endl << q->getAligned() << endl << endl; +//cout << pA->getName() << endl << pA->getAligned() << endl << endl; +//cout << pB->getName() << endl << pB->getAligned() << endl << endl; +//cout << " length = " << length << endl; +//cout << q->getName() << endl; +//cout << pA->getName() << '\t'; +//cout << pB->getName() << endl; + //check window size if (length < (2*windowSize+windowStep)) { mothurOut("Your window size is too large for " + q->getName() + ". I will make the window size " + toString(length/4) + " which is 1/4 the filtered length."); mothurOutEndLine(); @@ -143,9 +160,11 @@ vector Slayer::runBellerophon(Sequence* q, Sequence* pA, Sequence* //float avgQA_QB = ((QA*leftLength) + (QB*rightLength)) / (float) length; float divR_QLA_QRB = min((QLA_QRB/QA), (QLA_QRB/QB)); - float divR_QLB_QRA = min((QLB_QRA/QA), (QLB_QRA/QB)); +//cout << leftLength << '\t' << rightLength << '\t' << QLA << '\t' << QRB << '\t' << QLB << '\t' << QRA << '\t' << LAB << '\t' << RAB << '\t' << AB << '\t' << QA << '\t' << QB << '\t' << QLA_QRB << '\t' << QLB_QRA << endl; +//cout << divRThreshold << endl; +//cout << breakpoint << '\t' << divR_QLA_QRB << '\t' << divR_QLB_QRA << endl; //is one of them above the if (divR_QLA_QRB >= divRThreshold || divR_QLB_QRA >= divRThreshold) { @@ -167,10 +186,10 @@ vector Slayer::runBellerophon(Sequence* q, Sequence* pA, Sequence* member.rab = RAB; member.qra = QRA; member.qlb = QLB; - member.winLStart = spots[0]; - member.winLEnd = spots[breakpoint]; //so breakpoint reflects spot in alignment before filter - member.winRStart = spots[breakpoint+1]; - member.winREnd = spots[length-1]; + member.winLStart = 0; + member.winLEnd = breakpoint; + member.winRStart = breakpoint+1; + member.winREnd = length-1; member.querySeq = *(q); member.parentA = *(pA); member.parentB = *(pB); @@ -198,7 +217,7 @@ vector Slayer::getSNPS(string parentA, string query, string parentB, int l try { vector data; - +//cout << left << '\t' << right << endl; for (int i = left; i <= right; i++) { char A = parentA[i]; @@ -206,12 +225,31 @@ vector Slayer::getSNPS(string parentA, string query, string parentB, int l char B = parentB[i]; if ((A != Q) || (B != Q)) { - snps member; - member.queryChar = Q; - member.parentAChar = A; - member.parentBChar = B; +//cout << "not equal " << Q << '\t' << A << '\t' << B << endl; + + //ensure not neighboring a gap. change to 12/09 release of chimeraSlayer - not sure what this adds, but it eliminates alot of SNPS + if ( + //did query loose a base here during filter?? + ( i == 0 || abs (baseSpots[0][i] - baseSpots[0][i-1]) == 1) && + ( i == query.length() || abs (baseSpots[0][i] - baseSpots[0][i+1]) == 1) + && + //did parentA loose a base here during filter?? + ( i == 0 || abs (baseSpots[1][i] - baseSpots[1][i-1]) == 1) && + ( i == parentA.length() || abs (baseSpots[1][i] - baseSpots[1][i+1]) == 1) + && + //did parentB loose a base here during filter?? + ( i == 0 || abs (baseSpots[2][i] - baseSpots[2][i-1]) == 1) && + ( i == parentB.length() || abs (baseSpots[2][i] - baseSpots[2][i+1]) == 1) + ) + { - data.push_back(member); + snps member; + member.queryChar = Q; + member.parentAChar = A; + member.parentBChar = B; +//cout << "not neighboring a gap " << Q << '\t' << A << '\t' << B << '\t' << baseSpots[0][i] << '\t' << baseSpots[0][i+1] << '\t' << baseSpots[0][i-1] << '\t' << baseSpots[1][i] << '\t' << baseSpots[1][i+1] << '\t' << baseSpots[1][i-1] << '\t' << baseSpots[2][i] << '\t' << baseSpots[2][i+1] << '\t' << baseSpots[2][i-1] << endl; + data.push_back(member); + } } } @@ -232,9 +270,9 @@ void Slayer::bootstrapSNPS(vector left, vector right, float& BSA, fl int count_A = 0; // sceneario QLA,QRB supported int count_B = 0; // sceneario QLB,QRA supported - int numLeft = max(1, int(left.size()/10 +0.5)); - int numRight = max(1, int(right.size()/10 + 0.5)); - + int numLeft = max(1, int(left.size() * (percentSNPSample/(float)100) + 0.5)); + int numRight = max(1, int(right.size() * (percentSNPSample/(float)100) + 0.5)); + for (int i = 0; i < iters; i++) { //random sampling with replacement. @@ -278,11 +316,29 @@ void Slayer::bootstrapSNPS(vector left, vector right, float& BSA, fl if ((QLB > QLA) && (QRA > QRB)) { count_B++; } - + +//cout << "selected left snp: \n"; +//for (int j = 0; j < selectedLeft.size(); j++) { cout << selectedLeft[j].parentAChar; } +//cout << endl; +//for (int j = 0; j < selectedLeft.size(); j++) { cout << selectedLeft[j].queryChar; } +//cout << endl; +//for (int j = 0; j < selectedLeft.size(); j++) { cout << selectedLeft[j].parentBChar; } +//cout << endl; +//cout << "selected right snp: \n"; +//for (int j = 0; j < selectedRight.size(); j++) { cout << selectedRight[j].parentAChar; } +//cout << endl; +//for (int i = 0; i < selectedRight.size(); i++) { cout << selectedRight[i].queryChar; } +//cout << endl; +//for (int i = 0; i < selectedRight.size(); i++) { cout << selectedRight[i].parentBChar; } +//cout << endl; } + + + BSA = ((float) count_A / (float) iters) * 100; BSB = ((float) count_B / (float) iters) * 100; +//cout << "bsa = " << BSA << " bsb = " << BSB << endl; } catch(exception& e) { @@ -359,7 +415,7 @@ float Slayer::computePercentID(string queryFrag, string parent, int left, int ri try { int total = 0; int matches = 0; - + for (int i = left; i <= right; i++) { total++; if (queryFrag[i] == parent[i]) { @@ -380,6 +436,10 @@ float Slayer::computePercentID(string queryFrag, string parent, int left, int ri //remove columns that contain any gaps map Slayer::verticalFilter(vector seqs) { try { + //find baseSpots + baseSpots.clear(); + baseSpots.resize(3); //query, parentA, parentB + vector gaps; gaps.resize(seqs[0]->getAligned().length(), 0); string filterString = (string(seqs[0]->getAligned().length(), '1')); @@ -391,11 +451,11 @@ map Slayer::verticalFilter(vector seqs) { for (int j = 0; j < seqAligned.length(); j++) { //if this spot is a gap - if ((seqAligned[j] == '-') || (seqAligned[j] == '.') || (toupper(seqAligned[j]) == 'N')) { gaps[j]++; } + if ((seqAligned[j] == '-') || (seqAligned[j] == '.') || (toupper(seqAligned[j]) == 'N')) { gaps[j]++; } } } - //zero out spot where all sequences have blanks + //zero out spot where any sequences have blanks int numColRemoved = 0; int count = 0; map maskMap; maskMap.clear(); @@ -407,16 +467,25 @@ map Slayer::verticalFilter(vector seqs) { count++; } } - + //for each sequence for (int i = 0; i < seqs.size(); i++) { string seqAligned = seqs[i]->getAligned(); string newAligned = ""; + int baseCount = 0; + int count = 0; for (int j = 0; j < seqAligned.length(); j++) { + //are you a base + if ((seqAligned[j] != '-') && (seqAligned[j] != '.') && (toupper(seqAligned[j]) != 'N')) { baseCount++; } + //if this spot is not a gap - if (filterString[j] == '1') { newAligned += seqAligned[j]; } + if (filterString[j] == '1') { + newAligned += seqAligned[j]; + baseSpots[i][count] = baseCount; + count++; + } } seqs[i]->setAligned(newAligned); diff --git a/slayer.h b/slayer.h index dc13e6d..099cb20 100644 --- a/slayer.h +++ b/slayer.h @@ -64,7 +64,7 @@ class Slayer { public: - Slayer(int, int, int, float, int); + Slayer(int, int, int, float, int, int); ~Slayer() {}; string getResults(Sequence*, vector); @@ -73,14 +73,15 @@ class Slayer { private: - int windowSize, windowStep, parentFragmentThreshold, iters; + int windowSize, windowStep, parentFragmentThreshold, iters, percentSNPSample; float divRThreshold; vector outputResults; + vector< map > baseSpots; map verticalFilter(vector); float computePercentID(string, string, int, int); - vector runBellerophon(Sequence*, Sequence*, Sequence*); + vector runBellerophon(Sequence*, Sequence*, Sequence*, map&); vector getSNPS(string, string, string, int, int); void bootstrapSNPS(vector, vector, float&, float&); float snpQA(vector); diff --git a/tree.cpp b/tree.cpp index 52b7322..731eb64 100644 --- a/tree.cpp +++ b/tree.cpp @@ -27,7 +27,8 @@ Tree::Tree() { //initialize leaf nodes if (i <= (numLeaves-1)) { tree[i].setName(globaldata->Treenames[i]); - tree[i].setGroup(globaldata->gTreemap->getGroup(globaldata->Treenames[i])); + vector tempGroups; tempGroups.push_back(globaldata->gTreemap->getGroup(globaldata->Treenames[i])); + tree[i].setGroup(tempGroups); //set pcount and pGroup for groupname to 1. tree[i].pcount[globaldata->gTreemap->getGroup(globaldata->Treenames[i])] = 1; tree[i].pGroups[globaldata->gTreemap->getGroup(globaldata->Treenames[i])] = 1; @@ -37,7 +38,8 @@ Tree::Tree() { //intialize non leaf nodes }else if (i > (numLeaves-1)) { tree[i].setName(""); - tree[i].setGroup(""); + vector tempGroups; + tree[i].setGroup(tempGroups); } } } @@ -119,6 +121,14 @@ void Tree::addNamesToCounts() { } }//end if + //update groups to reflect all the groups this node represents + vector nodeGroups; + map::iterator itGroups; + for (itGroups = tree[i].pcount.begin(); itGroups != tree[i].pcount.end(); itGroups++) { + nodeGroups.push_back(itGroups->first); + } + tree[i].setGroup(nodeGroups); + }//end else }//end for @@ -375,7 +385,7 @@ void Tree::randomLabels(vector g) { tree[z].pGroups = (tree[i].pGroups); tree[i].pGroups = (lib_hold); - string zgroup = tree[z].getGroup(); + vector zgroup = tree[z].getGroup(); tree[z].setGroup(tree[i].getGroup()); tree[i].setGroup(zgroup); @@ -394,7 +404,7 @@ void Tree::randomLabels(vector g) { exit(1); } } -/**************************************************************************************************/ +/************************************************************************************************** void Tree::randomLabels(string groupA, string groupB) { try { @@ -447,7 +457,9 @@ void Tree::assembleRandomUnifracTree(vector g) { } /*************************************************************************************************/ void Tree::assembleRandomUnifracTree(string groupA, string groupB) { - randomLabels(groupA, groupB); + + vector temp; temp.push_back(groupA); temp.push_back(groupB); + randomLabels(temp); assembleTree(); } @@ -584,7 +596,9 @@ void Tree::printBranch(int node, ostream& out, string mode) { } } }else { //you are a leaf - out << tree[node].getGroup(); + string leafGroup = globaldata->gTreemap->getGroup(tree[node].getName()); + + out << leafGroup; if (mode == "branch") { //if there is a branch length then print it if (tree[node].getBranchLength() != -1) { diff --git a/tree.h b/tree.h index a076cef..37028be 100644 --- a/tree.h +++ b/tree.h @@ -55,7 +55,7 @@ private: void randomTopology(); void randomBlengths(); void randomLabels(vector); - void randomLabels(string, string); + //void randomLabels(string, string); void printBranch(int, ostream&, string); //recursively print out tree void parseTreeFile(); //parses through tree file to find names of nodes and number of them //this is required in case user has sequences in the names file that are diff --git a/treemap.cpp b/treemap.cpp index e472570..cd046f9 100644 --- a/treemap.cpp +++ b/treemap.cpp @@ -49,7 +49,7 @@ void TreeMap::removeSeq(string seqName) { //erase name from namesOfSeqs for (int i = 0; i < namesOfSeqs.size(); i++) { if (namesOfSeqs[i] == seqName) { - namesOfSeqs.erase (namesOfSeqs.begin()+i); + namesOfSeqs.erase(namesOfSeqs.begin()+i); break; } } @@ -61,8 +61,6 @@ void TreeMap::removeSeq(string seqName) { //remove seq from treemap it = treemap.find(seqName); treemap.erase(it); - - } /************************************************************/ diff --git a/treenode.cpp b/treenode.cpp index 54cdae4..648707a 100644 --- a/treenode.cpp +++ b/treenode.cpp @@ -25,7 +25,7 @@ Node::Node() { /****************************************************************/ void Node::setName(string Name) { name = Name; } /****************************************************************/ -void Node::setGroup(string groups) { group =groups; } +void Node::setGroup(vector groups) { group =groups; } /****************************************************************/ void Node::setBranchLength(float l) { branchLength = l; } /****************************************************************/ @@ -41,7 +41,7 @@ void Node::setChildren(int lc, int rc) { lchild = lc; rchild = rc; } //leftchild /****************************************************************/ string Node::getName() { return name; } /****************************************************************/ -string Node::getGroup() { return group; } +vector Node::getGroup() { return group; } /****************************************************************/ float Node::getBranchLength() { return branchLength; } /****************************************************************/ @@ -60,12 +60,16 @@ int Node::getIndex() { return vectorIndex; } //to be used by printTree in the Tree class to print the leaf info void Node::printNode() { try{ - mothurOut(toString(parent) + " " + toString(lchild) + " " + toString(rchild) + " " + group); + mothurOut(toString(parent) + " " + toString(lchild) + " " + toString(rchild) + " "); + + for (int i = 0; i < group.size(); i++) { mothurOut( group[i] + " "); } + //there is a branch length if (branchLength != -1) { mothurOut(" " + toString(branchLength)); } mothurOut(" |"); + map::iterator it; for(it=pGroups.begin();it!=pGroups.end();it++){ mothurOut(" " + it->first + ":" + toString(it->second)); diff --git a/treenode.h b/treenode.h index d6163a2..c25806b 100644 --- a/treenode.h +++ b/treenode.h @@ -21,7 +21,7 @@ class Node { ~Node() { pGroups.clear(); pcount.clear(); }; void setName(string); - void setGroup(string); + void setGroup(vector); void setBranchLength(float); void setLabel(float); void setParent(int); @@ -30,7 +30,7 @@ class Node { void setLengthToLeaves(float); string getName(); - string getGroup(); + vector getGroup(); float getBranchLength(); float getLengthToLeaves(); float getLabel(); @@ -52,7 +52,7 @@ class Node { private: string name; - string group; + vector group; float branchLength, length2leaf, label; int parent; int lchild; diff --git a/unweighted.cpp b/unweighted.cpp index c39e662..e6ed0eb 100644 --- a/unweighted.cpp +++ b/unweighted.cpp @@ -48,14 +48,8 @@ EstOutput Unweighted::getValues(Tree* t) { //groups in this combo groups.push_back(globaldata->Groups[a]); groups.push_back(globaldata->Groups[l]); - for(int i=t->getNumLeaves();igetNumNodes();i++){ - - int lc = t->tree[i].getLChild(); //lc = vector index of left child - int rc = t->tree[i].getRChild(); //rc = vector index of right child - - /**********************************************************************/ - //This section adds in all lengths that are non leaf - + for(int i=0;igetNumNodes();i++){ + copyIpcount = t->tree[i].pcount; for (it = copyIpcount.begin(); it != copyIpcount.end();) { if (inUsersGroups(it->first, groups) != true) { @@ -73,30 +67,10 @@ EstOutput Unweighted::getValues(Tree* t) { totalBL += abs(t->tree[i].getBranchLength()); } - /**********************************************************************/ - //This section adds in all lengths that are leaf - - //if i's chidren are leaves - if (t->tree[rc].getRChild() == -1) { - //if rc is a valid group and rc has a BL - if ((inUsersGroups(t->tree[rc].getGroup(), groups) == true) && (t->tree[rc].getBranchLength() != -1)) { - UniqueBL += abs(t->tree[rc].getBranchLength()); - totalBL += abs(t->tree[rc].getBranchLength()); - } - } - - if (t->tree[lc].getLChild() == -1) { - //if lc is a valid group and lc has a BL - if ((inUsersGroups(t->tree[lc].getGroup(), groups) == true) && (t->tree[lc].getBranchLength() != -1)) { - UniqueBL += abs(t->tree[lc].getBranchLength()); - totalBL += abs(t->tree[lc].getBranchLength()); - } - } - - /**********************************************************************/ } UW = (UniqueBL / totalBL); + //cout << globaldata->Groups[a] << globaldata->Groups[l] << '\t' << UniqueBL << '\t' << totalBL << endl; if (isnan(UW) || isinf(UW)) { UW = 0; } @@ -126,14 +100,8 @@ EstOutput Unweighted::getValues(Tree* t) { UW = 0.00; //Unweighted Value = UniqueBL / totalBL; copyIpcount.clear(); - for(int i=t->getNumLeaves();igetNumNodes();i++){ - - int lc = t->tree[i].getLChild(); //lc = vector index of left child - int rc = t->tree[i].getRChild(); //rc = vector index of right child - - /**********************************************************************/ - //This section adds in all lengths that are non leaf - + for(int i=0;igetNumNodes();i++){ + copyIpcount = t->tree[i].pcount; for (it = copyIpcount.begin(); it != copyIpcount.end();) { if (inUsersGroups(it->first, groups) != true) { @@ -151,27 +119,6 @@ EstOutput Unweighted::getValues(Tree* t) { totalBL += abs(t->tree[i].getBranchLength()); } - /**********************************************************************/ - //This section adds in all lengths that are leaf - - //if i's chidren are leaves - if (t->tree[rc].getRChild() == -1) { - //if rc is a valid group and rc has a BL - if ((inUsersGroups(t->tree[rc].getGroup(), groups) == true) && (t->tree[rc].getBranchLength() != -1)) { - UniqueBL += abs(t->tree[rc].getBranchLength()); - totalBL += abs(t->tree[rc].getBranchLength()); - } - } - - if (t->tree[lc].getLChild() == -1) { - //if lc is a valid group and lc has a BL - if ((inUsersGroups(t->tree[lc].getGroup(), groups) == true) && (t->tree[lc].getBranchLength() != -1)) { - UniqueBL += abs(t->tree[lc].getBranchLength()); - totalBL += abs(t->tree[lc].getBranchLength()); - } - } - - /**********************************************************************/ } UW = (UniqueBL / totalBL); @@ -236,12 +183,9 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB) { copyTree->assembleRandomUnifracTree(groups[0], groups[1]); //copyTree->createNewickFile("random"+groupA+toString(count)); - - for(int i=copyTree->getNumLeaves();igetNumNodes();i++){ - - int lc = copyTree->tree[i].getLChild(); //lc = vector index of left child - int rc = copyTree->tree[i].getRChild(); //rc = vector index of right child - + + for(int i=0;igetNumNodes();i++){ + /**********************************************************************/ //This section adds in all lengths that are non leaf copyIpcount = copyTree->tree[i].pcount; @@ -261,27 +205,6 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB) { totalBL += abs(copyTree->tree[i].getBranchLength()); } - /**********************************************************************/ - //This section adds in all lengths that are leaf - - //if i's chidren are leaves - if (copyTree->tree[rc].getRChild() == -1) { - //if rc is a valid group and rc has a BL - if ((inUsersGroups(copyTree->tree[rc].getGroup(), groups) == true) && (copyTree->tree[rc].getBranchLength() != -1)) { - UniqueBL += abs(copyTree->tree[rc].getBranchLength()); - totalBL += abs(copyTree->tree[rc].getBranchLength()); - } - } - - if (copyTree->tree[lc].getLChild() == -1) { - //if lc is a valid group and lc has a BL - if ((inUsersGroups(copyTree->tree[lc].getGroup(), groups) == true) && (copyTree->tree[lc].getBranchLength() != -1)) { - UniqueBL += abs(copyTree->tree[lc].getBranchLength()); - totalBL += abs(copyTree->tree[lc].getBranchLength()); - } - } - - /**********************************************************************/ } UW = (UniqueBL / totalBL); @@ -320,13 +243,7 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB) { //swap labels in all the groups you want to compare copyTree->assembleRandomUnifracTree(groups); - for(int i=copyTree->getNumLeaves();igetNumNodes();i++){ - - int lc = copyTree->tree[i].getLChild(); //lc = vector index of left child - int rc = copyTree->tree[i].getRChild(); //rc = vector index of right child - - /**********************************************************************/ - //This section adds in all lengths that are non leaf + for(int i=0;igetNumNodes();i++){ copyIpcount = copyTree->tree[i].pcount; for (it = copyIpcount.begin(); it != copyIpcount.end();) { @@ -345,27 +262,6 @@ EstOutput Unweighted::getValues(Tree* t, string groupA, string groupB) { totalBL += abs(copyTree->tree[i].getBranchLength()); } - /**********************************************************************/ - //This section adds in all lengths that are leaf - - //if i's chidren are leaves - if (copyTree->tree[rc].getRChild() == -1) { - //if rc is a valid group and rc has a BL - if ((inUsersGroups(copyTree->tree[rc].getGroup(), groups) == true) && (copyTree->tree[rc].getBranchLength() != -1)) { - UniqueBL += abs(copyTree->tree[rc].getBranchLength()); - totalBL += abs(copyTree->tree[rc].getBranchLength()); - } - } - - if (copyTree->tree[lc].getLChild() == -1) { - //if lc is a valid group and lc has a BL - if ((inUsersGroups(copyTree->tree[lc].getGroup(), groups) == true) && (copyTree->tree[lc].getBranchLength() != -1)) { - UniqueBL += abs(copyTree->tree[lc].getBranchLength()); - totalBL += abs(copyTree->tree[lc].getBranchLength()); - } - } - - /**********************************************************************/ } UW = (UniqueBL / totalBL); diff --git a/weighted.cpp b/weighted.cpp index d1bc40e..8a31229 100644 --- a/weighted.cpp +++ b/weighted.cpp @@ -26,6 +26,8 @@ EstOutput Weighted::getValues(Tree* t) { //initialize weighted scores WScore[globaldata->Groups[i]+globaldata->Groups[l]] = 0.0; + vector groups; groups.push_back(globaldata->Groups[i]); groups.push_back(globaldata->Groups[l]); + D.push_back(0.0000); //initialize a spot in D for each combination /********************************************************/ @@ -52,9 +54,25 @@ EstOutput Weighted::getValues(Tree* t) { //is this sum from a sequence which is in one of the users groups if (inUsersGroups(t->tree[v].getGroup(), globaldata->Groups) == true) { //is this sum from a sequence which is in this groupCombo - if ((t->tree[v].getGroup() == globaldata->Groups[i]) || (t->tree[v].getGroup() == globaldata->Groups[l])) { - sum /= (double)tmap->seqsPerGroup[t->tree[v].getGroup()]; - D[count] += sum; + if (inUsersGroups(t->tree[v].getGroup(), groups)) { + int numSeqsInGroupI, numSeqsInGroupL; + + map::iterator it; + it = t->tree[v].pcount.find(groups[0]); + if (it != t->tree[v].pcount.end()) { //this leaf node contains seqs from group i + numSeqsInGroupI = it->second; + }else{ numSeqsInGroupI = 0; } + + it = t->tree[v].pcount.find(groups[1]); + if (it != t->tree[v].pcount.end()) { //this leaf node contains seqs from group l + numSeqsInGroupL = it->second; + }else{ numSeqsInGroupL = 0; } + + double weightedSum = ((numSeqsInGroupI * sum) / (double)tmap->seqsPerGroup[groups[0]]) + ((numSeqsInGroupL * sum) / (double)tmap->seqsPerGroup[groups[1]]); + + //sum /= (double)tmap->seqsPerGroup[t->tree[v].getGroup()]; + + D[count] += weightedSum; } } } @@ -125,6 +143,7 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { WScore[(groupA+groupB)] = 0.0; float D = 0.0; + vector groups; groups.push_back(groupA); groups.push_back(groupB); /********************************************************/ //calculate a D value for the group combo @@ -147,9 +166,25 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { sum += abs(t->tree[index].getBranchLength()); } - if ((t->tree[v].getGroup() == groupA) || (t->tree[v].getGroup() == groupB)) { - sum /= (double)tmap->seqsPerGroup[t->tree[v].getGroup()]; - D += sum; + if (inUsersGroups(t->tree[v].getGroup(), groups)) { + int numSeqsInGroupI, numSeqsInGroupL; + + map::iterator it; + it = t->tree[v].pcount.find(groups[0]); + if (it != t->tree[v].pcount.end()) { //this leaf node contains seqs from group i + numSeqsInGroupI = it->second; + }else{ numSeqsInGroupI = 0; } + + it = t->tree[v].pcount.find(groups[1]); + if (it != t->tree[v].pcount.end()) { //this leaf node contains seqs from group l + numSeqsInGroupL = it->second; + }else{ numSeqsInGroupL = 0; } + + double weightedSum = ((numSeqsInGroupI * sum) / (double)tmap->seqsPerGroup[groups[0]]) + ((numSeqsInGroupL * sum) / (double)tmap->seqsPerGroup[groups[1]]); + + //sum /= (double)tmap->seqsPerGroup[t->tree[v].getGroup()]; + + D += weightedSum; } } /********************************************************/ -- 2.39.2