From c4fb347858dd8cfea4d2933f429fff4130dca65b Mon Sep 17 00:00:00 2001 From: westcott Date: Mon, 12 Oct 2009 21:26:13 +0000 Subject: [PATCH] finished chimeraslayer method and added get.listcount command --- Mothur.xcodeproj/project.pbxproj | 6 + aligncommand.cpp | 13 +- ccode.h | 1 + chimera.h | 6 +- chimeracheckrdp.h | 1 + chimeraseqscommand.cpp | 13 +- chimeraslayer.cpp | 220 ++++++++++++++----------------- chimeraslayer.h | 4 +- commandfactory.cpp | 3 + getlistcountcommand.cpp | 197 +++++++++++++++++++++++++++ getlistcountcommand.h | 44 +++++++ helpcommand.cpp | 2 +- maligner.cpp | 4 +- mothur.h | 2 +- nast.cpp | 10 +- pintail.h | 1 + slayer.cpp | 50 +++---- slayer.h | 6 +- 18 files changed, 413 insertions(+), 170 deletions(-) create mode 100644 getlistcountcommand.cpp create mode 100644 getlistcountcommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 6dbf319..fa7c672 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -164,6 +164,7 @@ A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; }; A70DECD91063D8B40057C03C /* secondarystructurecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70DECD81063D8B40057C03C /* secondarystructurecommand.cpp */; }; A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */; }; + A73329CF1083B3B3003B10C5 /* getlistcountcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73329CE1083B3B3003B10C5 /* getlistcountcommand.cpp */; }; A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; }; A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; }; A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; }; @@ -524,6 +525,8 @@ A70DECD81063D8B40057C03C /* secondarystructurecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = secondarystructurecommand.cpp; sourceTree = SOURCE_ROOT; }; A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckrdp.h; sourceTree = SOURCE_ROOT; }; A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckrdp.cpp; sourceTree = SOURCE_ROOT; }; + A73329CD1083B3B3003B10C5 /* getlistcountcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlistcountcommand.h; sourceTree = ""; }; + A73329CE1083B3B3003B10C5 /* getlistcountcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlistcountcommand.cpp; sourceTree = ""; }; A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = SOURCE_ROOT; }; A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = SOURCE_ROOT; }; A7B04491106CC3E60046FC83 /* chimeraslayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayer.h; sourceTree = SOURCE_ROOT; }; @@ -845,6 +848,8 @@ A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */, A70B53A90F4CD7AD0064797E /* getlinecommand.h */, A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */, + A73329CD1083B3B3003B10C5 /* getlistcountcommand.h */, + A73329CE1083B3B3003B10C5 /* getlistcountcommand.cpp */, 370B88050F8A4EE4005AB382 /* getoturepcommand.h */, 370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */, 3749273D0FD5956B0031C06B /* getrabundcommand.h */, @@ -1209,6 +1214,7 @@ A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */, A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */, A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */, + A73329CF1083B3B3003B10C5 /* getlistcountcommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp index da807bd..48c67ec 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -276,26 +276,27 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName){ if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { alignment->resize(candidateSeq->getUnaligned().length()+1); } - + report.setCandidate(candidateSeq); Sequence temp = templateDB->findClosestSequence(candidateSeq); Sequence* templateSeq = &temp; - + report.setTemplate(templateSeq); report.setSearchParameters(search, templateDB->getSearchScore()); - + Nast nast(alignment, candidateSeq, templateSeq); report.setAlignmentParameters(align, alignment); - + report.setNastParameters(nast); alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; - + report.print(); - delete candidateSeq; + delete candidateSeq; + } alignmentFile.close(); diff --git a/ccode.h b/ccode.h index bc1aad8..de7c7c4 100644 --- a/ccode.h +++ b/ccode.h @@ -14,6 +14,7 @@ #include "dist.h" #include "decalc.h" +/***********************************************************/ //This class was created using the algorythms described in the // "Evaluating putative chimeric sequences from PCR-amplified products" paper //by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez. diff --git a/chimera.h b/chimera.h index 2d6ac0b..33e514b 100644 --- a/chimera.h +++ b/chimera.h @@ -76,7 +76,7 @@ class Chimera { virtual void setDivR(float d) { divR = d; } virtual void setParents(int p) { parents = p; } virtual void setMinSim(int s) { minSim = s; } - virtual void setPrint(int p) { printAll = p; } + virtual void setIters(int i) { iters = i; } virtual void setCons(string){}; @@ -94,8 +94,8 @@ class Chimera { protected: - bool filter, correction, svg, printAll; - int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, parents; + bool filter, correction, svg; + int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, parents, iters; float divR; string seqMask, quanfile, filterString, name; diff --git a/chimeracheckrdp.h b/chimeracheckrdp.h index 7563c02..f436718 100644 --- a/chimeracheckrdp.h +++ b/chimeracheckrdp.h @@ -16,6 +16,7 @@ #include "kmerdb.hpp" #include "database.hpp" +/***********************************************************/ //This class was created using the algorythms described in //CHIMERA_CHECK version 2.7 written by Niels Larsen. diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index f368869..14973cc 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -26,7 +26,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim", "parents", "printall" }; + string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim", "parents", "iters" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -78,9 +78,6 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; } correction = isTrue(temp); - temp = validParameter.validFile(parameters, "printall", false); if (temp == "not found") { temp = "F"; } - printAll = isTrue(temp); - temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } convert(temp, processors); @@ -109,6 +106,9 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "5"; } convert(temp, parents); + + temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } + convert(temp, iters); temp = validParameter.validFile(parameters, "increment", false); if ((temp == "not found") && ((method == "chimeracheck") || (method == "chimeraslayer"))) { temp = "10"; } @@ -140,7 +140,7 @@ void ChimeraSeqsCommand::help(){ //"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize", "svg", "name" //mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED and if using a template that the template file sequences are the same length as the fasta file sequences.\n\n"); mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n"); - mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation, quantile, numwanted, ksize, svg, name.\n"); + mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation, quantile, numwanted, ksize, svg, name, iters.\n"); mothurOut("The fasta parameter is always required and template is required if using pintail, ccode or chimeracheck.\n"); mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. \n"); mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n"); @@ -156,6 +156,7 @@ void ChimeraSeqsCommand::help(){ mothurOut("The ksize parameter allows you to input kmersize. \n"); mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence.\n"); mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n"); + mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method.\n"); mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n"); mothurOut("Details for each method: \n"); mothurOut("\tpintail: \n"); @@ -220,7 +221,7 @@ int ChimeraSeqsCommand::execute(){ chimera->setDivR(divR); chimera->setParents(parents); chimera->setMinSim(minSimilarity); - chimera->setPrint(printAll); + chimera->setIters(iters); //find chimeras diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index bd1908d..86ed532 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -27,118 +27,28 @@ ChimeraSlayer::~ChimeraSlayer() { void ChimeraSlayer::print(ostream& out) { try { mothurOutEndLine(); + mothurOut("Only reporting sequence supported by 90% of bootstrapped results."); + mothurOutEndLine(); for (int i = 0; i < querySeqs.size(); i++) { if (chimeraFlags[i] == "yes") { - mothurOut(querySeqs[i]->getName() + "\tyes"); mothurOutEndLine(); - + if ((chimeraResults[i][0].bsa >= 90.0) || (chimeraResults[i][0].bsb >= 90.0)) { + mothurOut(querySeqs[i]->getName() + "\tyes"); mothurOutEndLine(); + out << querySeqs[i]->getName() << "\tyes" << endl; + }else { + out << querySeqs[i]->getName() << "\tno" << endl; + mothurOut(querySeqs[i]->getName() + "\tno"); mothurOutEndLine(); + } + + printBlock(chimeraResults[i][0], out, i); + + out << endl; }else{ out << querySeqs[i]->getName() << "\tno" << endl; - mothurOut("no"); + mothurOut(querySeqs[i]->getName() + "\tno"); mothurOutEndLine(); } } -/* - - my $div_ratio_QLA_QRB = $data_struct->{div_ratio_QLA_QRB}; - my $div_ratio_QRA_QLB = $data_struct->{div_ratio_QLB_QRA}; - - my $per_id_QLA = $data_struct->{per_id_QLA}; - my $per_id_QRB = $data_struct->{per_id_QRB}; - my $per_id_AB = $data_struct->{per_id_AB}; - my $per_id_QA = $data_struct->{per_id_QA}; - my $per_id_QB = $data_struct->{per_id_QB}; - my $per_id_LAB = $data_struct->{per_id_LAB}; - my $per_id_RAB = $data_struct->{per_id_RAB}; - my $per_id_QRA = $data_struct->{per_id_QRA}; - my $per_id_QLB = $data_struct->{per_id_QLB}; - my $per_id_QLB_QRA = $data_struct->{per_id_QLB_QRA}; - my $per_id_QLA_QRB = $data_struct->{per_id_QLA_QRB}; - - my $win_left_end5 = $data_struct->{win_left_end5}; - my $win_left_end3 = $data_struct->{win_left_end3}; - my $win_right_end5 = $data_struct->{win_right_end5}; - my $win_right_end3 = $data_struct->{win_right_end3}; - my $Q = $data_struct->{query_alignment}; - my $A = $data_struct->{parent_A_alignment}; - my $B = $data_struct->{parent_B_alignment}; - my $BS_A = $data_struct->{BS_A}; - my $BS_B = $data_struct->{BS_B}; - - my @Q_chars = @{$Q->{align}}; - my @A_chars = @{$A->{align}}; - my @B_chars = @{$B->{align}}; - - my $query_acc = $Q->{acc}; - my $A_acc = $A->{acc}; - my $B_acc = $B->{acc}; - - my $break_left = $Q->{seqPos}->[$win_left_end3]; - my $break_right = $Q->{seqPos}->[$win_right_end5]; - - - cout << "//\n## CHIMERA\t" << querySeqs[i]->getName() << "\t" << $break_left-$break_right" << endl - << "\tDIV_QLARB: ". sprintf("%.3f", $div_ratio_QLA_QRB) - << "\tBS_QLARB: " . sprintf("%.2f", $BS_A) - << "\tDIV_QRALB: " . sprintf("%.3f", $div_ratio_QRA_QLB) - << "\tBS_QRALB: " . sprintf("%.2f", $BS_B) - << "\t$A_acc\t$B_acc" - << "\tbreakpoint: $break_left-$break_right\n\n"; - - ## draw illustration: - - print " Per_id parents: " . sprintf("%.2f", $per_id_AB) . "\n\n"; - print " Per_id(Q,A): " . sprintf("%.2f", $per_id_QA) . "\n"; - print "--------------------------------------------------- A: $A_acc\n" - . " " . sprintf("%.2f", $per_id_QLA) . " " . sprintf("%.2f", $per_id_QRA) . "\n" - . "~~~~~~~~~~~~~~~~~~~~~~~~\\ /~~~~~~~~~~~~~~~~~~~~~~~~ Q: $query_acc\n" - . "DivR: " . sprintf("%.3f", $div_ratio_QLA_QRB) . " BS: " . sprintf("%.2f", $BS_A) . " |\n" - . "Per_id(QLA,QRB): " . sprintf("%.2f", $per_id_QLA_QRB) . " |\n" - . " |\n" - . " (L-AB: " . sprintf("%.2f", $per_id_LAB) . ") | (R-AB: " . sprintf("%.2f", $per_id_RAB) . ")\n" - . " WinL:$win_left_end5-$win_left_end3 | WinR:$win_right_end5-$win_right_end3\n" - . " |\n" - . "Per_id(QLB,QRA): " . sprintf("%.2f", $per_id_QLB_QRA) . " |\n" - . "DivR: " . sprintf("%.3f", $div_ratio_QRA_QLB) . " BS: " . sprintf("%.2f", $BS_B) . " |\n" - . "~~~~~~~~~~~~~~~~~~~~~~~~/ \\~~~~~~~~~~~~~~~~~~~~~~~~~ Q: $query_acc\n" - . " " . sprintf("%.2f", $per_id_QLB) . " " . sprintf("%.2f", $per_id_QRB) . "\n" - . "---------------------------------------------------- B: $B_acc\n"; - print " Per_id(Q,B): ". sprintf("%.2f", $per_id_QB) . "\n\n"; - - my $deltaL = $per_id_QLA - $per_id_QLB; - my $deltaR = $per_id_QRA - $per_id_QRB; - - print "DeltaL: " . sprintf("%.2f", $deltaL) . " DeltaR: " . sprintf("%.2f", $deltaR) . "\n\n"; - - unless ($printAlignmentsFlag) { return; } - - - ## build the left windows: - my @Q_left_win = @Q_chars[$win_left_end5..$win_left_end3]; - my @A_left_win = @A_chars[$win_left_end5..$win_left_end3]; - my @B_left_win = @B_chars[$win_left_end5..$win_left_end3]; - - &print_alignment($A_acc, \@A_left_win, - $query_acc, \@Q_left_win, - $B_acc, \@B_left_win); - - print "\t\t** Breakpoint **\n\n"; - - my @Q_right_win = @Q_chars[$win_right_end5..$win_right_end3]; - my @A_right_win = @A_chars[$win_right_end5..$win_right_end3]; - my @B_right_win = @B_chars[$win_right_end5..$win_right_end3]; - - &print_alignment($A_acc, \@A_right_win, - $query_acc, \@Q_right_win, - $B_acc, \@B_right_win); - - return; -} - - -#### - - */ } catch(exception& e) { @@ -161,6 +71,7 @@ void ChimeraSlayer::getChimeras() { chimeraResults.resize(numSeqs); chimeraFlags.resize(numSeqs, "no"); + spotMap.resize(numSeqs); //break up file if needed int linesPerProcess = numSeqs / processors ; @@ -183,39 +94,56 @@ void ChimeraSlayer::getChimeras() { if (seqMask != "") { decalc = new DeCalculator(); } //to use below + //initialize spotMap + for (int j = 0; j < numSeqs; j++) { + for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) { + spotMap[j][i] = i; + } + } + //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity maligner = new Maligner(templateSeqs, numWanted, match, misMatch, 1.01, minSim); - slayer = new Slayer(window, increment, minSim, divR); + slayer = new Slayer(window, increment, minSim, divR, iters); for (int i = 0; i < querySeqs.size(); i++) { string chimeraFlag = maligner->getResults(querySeqs[i]); - float percentIdentical = maligner->getPercentID(); + //float percentIdentical = maligner->getPercentID(); vector Results = maligner->getOutput(); - //cout << querySeqs[i]->getName() << '\t' << chimeraFlag << '\t' << percentIdentical << endl; + cout << "Processing sequence: " << i+1 << endl; - for (int j = 0; j < Results.size(); j++) { + //for (int j = 0; j < Results.size(); j++) { //cout << "regionStart = " << Results[j].regionStart << "\tRegionEnd = " << Results[j].regionEnd << "\tName = " << Results[j].parent << "\tPerQP = " << Results[j].queryToParent << "\tLocalPerQP = " << Results[j].queryToParentLocal << "\tdivR = " << Results[j].divR << endl; - } + //} - if (chimeraFlag == "yes") { + //if (chimeraFlag == "yes") { //get sequence that were given from maligner results vector seqs; + map removeDups; + map::iterator itDup; for (int j = 0; j < Results.size(); j++) { - Sequence* seq = getSequence(Results[j].parent); //makes copy so you can filter and mask and not effect template - - //seq = NULL if error occurred in getSequence - if (seq == NULL) { break; } - else { - SeqDist member; - member.seq = seq; - member.dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal; - seqs.push_back(member); + float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal; + //only add if you are not a duplicate + itDup = removeDups.find(Results[j].parent); + if (itDup == removeDups.end()) { //this is not duplicate + removeDups[Results[j].parent] = dist; + }else if (dist > itDup->second) { //is this a stronger number for this parent + removeDups[Results[j].parent] = dist; } } - + + for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) { + Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template + + SeqDist member; + member.seq = seq; + member.dist = itDup->second; + + seqs.push_back(member); + } + //limit number of parents to explore - default 5 if (Results.size() > parents) { //sort by distance @@ -232,6 +160,24 @@ void ChimeraSlayer::getChimeras() { //put seqs into vector to send to slayer vector seqsForSlayer; for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); } +//ofstream out; +//string name = querySeqs[i]->getName(); +//cout << name << endl; +//name = name.substr(name.find_first_of("|")+1); +//cout << name << endl; +//name = name.substr(name.find_first_of("|")+1); +//cout << name << endl; +//name = name.substr(0, name.find_last_of("|")); +//cout << name << endl; +//string filename = name + ".seqsforslayer"; +//openOutputFile(filename, out); +//for (int u = 0; u < seqsForSlayer.size(); u++) { seqsForSlayer[u]->printSequence(out); } +//out.close(); +//filename = name + ".fasta"; +//openOutputFile(filename, out); +//q->printSequence(out); +//out.close(); + //mask then send to slayer... if (seqMask != "") { @@ -245,6 +191,10 @@ void ChimeraSlayer::getChimeras() { decalc->runMask(seqsForSlayer[k]); } + for (int i = 0; i < numSeqs; i++) { + spotMap[i] = decalc->getMaskMap(); + } + } //send to slayer @@ -253,7 +203,7 @@ void ChimeraSlayer::getChimeras() { //free memory for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } - } + //} } //free memory @@ -296,6 +246,34 @@ Sequence* ChimeraSlayer::getSequence(string name) { } } //*************************************************************************************************************** +void ChimeraSlayer::printBlock(data_struct data, ostream& out, int i){ + try { + + out << "parentA: " << data.parentA.getName() << " parentB: " << data.parentB.getName() << endl; + out << "Left Window: " << spotMap[i][data.winLStart] << " " << spotMap[i][data.winLEnd] << endl; + out << "Right Window: " << spotMap[i][data.winRStart] << " " << spotMap[i][data.winREnd] << endl; + + out << "Divergence of Query to Leftside ParentA and Rightside ParentB: " << data.divr_qla_qrb << '\t' << "Bootstrap: " << data.bsa << endl; + out << "Divergence of Query to Rightside ParentA and Leftside ParentB: " << data.divr_qlb_qra << '\t' << "Bootstrap: " << data.bsb << endl; + + out << "Similarity of parents: " << data.ab << endl; + out << "Similarity of query to parentA: " << data.qa << endl; + out << "Similarity of query to parentB: " << data.qb << endl; + + //out << "Per_id(QL,A): " << data.qla << endl; + //out << "Per_id(QL,B): " << data.qlb << endl; + //out << "Per_id(QR,A): " << data.qra << endl; + //out << "Per_id(QR,B): " << data.qrb << endl; + + out << "DeltaL: " << (data.qla - data.qlb) << endl; + out << "DeltaR: " << (data.qra - data.qrb) << endl; + } + catch(exception& e) { + errorOut(e, "ChimeraSlayer", "printBlock"); + exit(1); + } +} +//*************************************************************************************************************** diff --git a/chimeraslayer.h b/chimeraslayer.h index a305b5b..379209e 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -40,6 +40,7 @@ class ChimeraSlayer : public Chimera { vector lines; vector querySeqs; vector templateSeqs; + vector< map > spotMap; vector< vector > chimeraResults; vector chimeraFlags; @@ -47,8 +48,7 @@ class ChimeraSlayer : public Chimera { string fastafile, templateFile; Sequence* getSequence(string); //find sequence from name - - + void printBlock(data_struct, ostream&, int i); }; /************************************************************************/ diff --git a/commandfactory.cpp b/commandfactory.cpp index 4337024..9adf679 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -56,6 +56,7 @@ #include "systemcommand.h" #include "secondarystructurecommand.h" #include "getsharedotucommand.h" +#include "getlistcountcommand.h" /***********************************************************/ @@ -110,6 +111,7 @@ CommandFactory::CommandFactory(){ commands["system"] = "system"; commands["align.check"] = "align.check"; commands["get.sharedotu"] = "get.sharedotu"; + commands["get.listcount"] = "get.listcount"; commands["quit"] = "quit"; } @@ -174,6 +176,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "system") { command = new SystemCommand(optionString); } else if(commandName == "align.check") { command = new AlignCheckCommand(optionString); } else if(commandName == "get.sharedotu") { command = new GetSharedOTUCommand(optionString); } + else if(commandName == "get.listcount") { command = new GetListCountCommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/getlistcountcommand.cpp b/getlistcountcommand.cpp new file mode 100644 index 0000000..f14e17a --- /dev/null +++ b/getlistcountcommand.cpp @@ -0,0 +1,197 @@ +/* + * getlistcountcommand.cpp + * Mothur + * + * Created by westcott on 10/12/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "getlistcountcommand.h" + +//********************************************************************************************************************** +GetListCountCommand::GetListCountCommand(string option){ + try { + globaldata = GlobalData::getInstance(); + abort = false; + allLines = 1; + labels.clear(); + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string AlignArray[] = {"list","label"}; + vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + string ranRead = globaldata->getListFile(); + + //check for required parameters + listfile = validParameter.validFile(parameters, "list", true); + if ((listfile == "not found") && (globaldata->getListFile() == "")) { mothurOut("You must read a listfile before running the get.listcount command."); mothurOutEndLine(); abort = true; } + else if ((listfile == "not found") && (globaldata->getListFile() != "")) { listfile = globaldata->getListFile(); } + else if (listfile == "not open") { abort = true; } + else { globaldata->setListFile(listfile); } + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + + label = validParameter.validFile(parameters, "label", false); + if (label == "not found") { label = ""; } + else { + if(label != "all") { splitAtDash(label, labels); allLines = 0; } + else { allLines = 1; } + } + + //if the user has not specified any labels use the ones from read.otu + if ((label == "") && (ranRead != "")) { + allLines = globaldata->allLines; + labels = globaldata->labels; + } + } + } + catch(exception& e) { + errorOut(e, "GetListCountCommand", "GetListCountCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void GetListCountCommand::help(){ + try { + mothurOut("The get.listcount command can only be executed after a successful read.otu command of a listfile or providing a list file using the list parameter.\n"); + mothurOut("The get.listcount command parameters are list and label. No parameters are required.\n"); + mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and are separated by dashes.\n"); + mothurOut("The get.listcount command should be in the following format: get.listcount(list=yourlistFile, label=yourLabels).\n"); + mothurOut("Example get.listcount(list=amazon.fn.list, label=0.10).\n"); + mothurOut("The default value for label is all lines in your inputfile.\n"); + mothurOut("The get.listcount command outputs a .otu file for each distance you specify listing the bin number and the names of the sequences in that bin.\n"); + mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListFile).\n\n"); + } + catch(exception& e) { + errorOut(e, "GetListCountCommand", "help"); + exit(1); + } +} + +//********************************************************************************************************************** + +GetListCountCommand::~GetListCountCommand(){} + +//********************************************************************************************************************** + +int GetListCountCommand::execute(){ + try { + if (abort == true) { return 0; } + + globaldata->setFormat("list"); + + //read list file + read = new ReadOTUFile(listfile); + read->read(&*globaldata); + + input = globaldata->ginput; + list = globaldata->gListVector; + string lastLabel = list->getLabel(); + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set processedLabels; + set userLabels = labels; + + while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if(allLines == 1 || labels.count(list->getLabel()) == 1){ + + process(list); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + } + + if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + delete list; + list = input->getListVector(lastLabel); + + process(list); + + processedLabels.insert(list->getLabel()); + userLabels.erase(list->getLabel()); + } + + lastLabel = list->getLabel(); + + delete list; + list = input->getListVector(); + } + + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine(); + needToRun = true; + }else { + mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + if (list != NULL) { delete list; } + list = input->getListVector(lastLabel); + + process(list); + delete list; + } + + delete read; + globaldata->gListVector = NULL; + + return 0; + } + catch(exception& e) { + errorOut(e, "GetListCountCommand", "execute"); + exit(1); + } +} + +//********************************************************************************************************************** +//return 1 if error, 0 otherwise +void GetListCountCommand::process(ListVector* list) { + try { + string binnames, name, sequence; + string outputFileName = getRootName(listfile) + list->getLabel() + ".otu"; + openOutputFile(outputFileName, out); + + mothurOut(list->getLabel()); mothurOutEndLine(); + + //for each bin in the list vector + for (int i = 0; i < list->getNumBins(); i++) { + binnames = list->get(i); + out << i+1 << '\t' << binnames << endl; + } + + out.close(); + } + catch(exception& e) { + errorOut(e, "GetListCountCommand", "process"); + exit(1); + } +} +//********************************************************************************************************************** + + diff --git a/getlistcountcommand.h b/getlistcountcommand.h new file mode 100644 index 0000000..468ea80 --- /dev/null +++ b/getlistcountcommand.h @@ -0,0 +1,44 @@ +#ifndef GETLISTCOUNTCOMMAND_H +#define GETLISTCOUNTCOMMAND_H +/* + * getlistcountcommand.h + * Mothur + * + * Created by westcott on 10/12/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" +#include "inputdata.h" +#include "listvector.hpp" +#include "readotu.h" + +class GlobalData; + +/**********************************************************/ + +class GetListCountCommand : public Command { + +public: + GetListCountCommand(string); + ~GetListCountCommand(); + int execute(); + void help(); + +private: + GlobalData* globaldata; + ListVector* list; + ReadOTUFile* read; + InputData* input; + + bool abort, allLines; + set labels; //holds labels to be used + string label, listfile; + ofstream out; + + void process(ListVector*); +}; +/**********************************************************/ + +#endif diff --git a/helpcommand.cpp b/helpcommand.cpp index ba4f53d..fc2f291 100644 --- a/helpcommand.cpp +++ b/helpcommand.cpp @@ -32,7 +32,7 @@ int HelpCommand::execute(){ delete validCommands; - mothurOutEndLine(); mothurOut("For further assistance please refer to the Mothur manual on our wiki at http://www.mothur.org/wiki, or contact Pat Schloss at pschloss@umich.edu.\n"); + mothurOutEndLine(); mothurOut("For further assistance please refer to the Mothur manual on our wiki at http://www.mothur.org/wiki, or contact Pat Schloss at mothur.bugs@gmail.com.\n"); return 0; } diff --git a/maligner.cpp b/maligner.cpp index c7f9be0..26ac563 100644 --- a/maligner.cpp +++ b/maligner.cpp @@ -16,6 +16,8 @@ Maligner::Maligner(vector temp, int num, int match, int misMatch, flo string Maligner::getResults(Sequence* q) { try { + outputResults.clear(); + //make copy so trimming doesn't destroy query from calling class - remember to deallocate query = new Sequence(q->getName(), q->getAligned()); @@ -25,7 +27,7 @@ string Maligner::getResults(Sequence* q) { //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate refSeqs = decalc->findClosest(query, db, numWanted); - + refSeqs = minCoverageFilter(refSeqs); if (refSeqs.size() < 2) { diff --git a/mothur.h b/mothur.h index 836e066..84dbdb3 100644 --- a/mothur.h +++ b/mothur.h @@ -267,7 +267,7 @@ inline void errorOut(exception& e, string object, string function) { mothurOut("Error: "); mothurOut(toString(e.what())); - mothurOut(" has occurred in the " + object + " class function " + function + ". Please contact Pat Schloss at pschloss@umich.edu, and be sure to include the mothur.logFile with your inquiry."); + mothurOut(" has occurred in the " + object + " class function " + function + ". Please contact Pat Schloss at mothur.bugs@gmail.com, and be sure to include the mothur.logFile with your inquiry."); mothurOutEndLine(); } diff --git a/nast.cpp b/nast.cpp index 294940c..f247037 100644 --- a/nast.cpp +++ b/nast.cpp @@ -226,7 +226,7 @@ void Nast::regapSequences(){ //This is essentially part B in Fig 2. of DeSantis candidateSeq->setAligned(candAln); return; } - + int fullAlignIndex = 0; int pairwiseAlignIndex = 0; string newTemplateAlign = ""; // this is going to be messy so we want a temporary template @@ -323,7 +323,9 @@ void Nast::regapSequences(){ //This is essentially part B in Fig 2. of DeSantis newTemplateAlign += tempAln[i];// } - int start, end; + int start = 0; + int end = candAln.length()-1; + for(int i=0;isetAligned(candAln); } catch(exception& e) { diff --git a/pintail.h b/pintail.h index 37efd00..aada728 100644 --- a/pintail.h +++ b/pintail.h @@ -14,6 +14,7 @@ #include "dist.h" #include "decalc.h" +/***********************************************************/ //This class was created using the algorythms described in the // "At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies" paper //by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1. diff --git a/slayer.cpp b/slayer.cpp index 6e7f0e3..95734c7 100644 --- a/slayer.cpp +++ b/slayer.cpp @@ -10,12 +10,11 @@ #include "slayer.h" /***********************************************************************/ -Slayer::Slayer(int win, int increment, int parentThreshold, float div) : - windowSize(win), windowStep(increment), parentFragmentThreshold(parentThreshold), divRThreshold(div) {} +Slayer::Slayer(int win, int increment, int parentThreshold, float div, int i) : + windowSize(win), windowStep(increment), parentFragmentThreshold(parentThreshold), divRThreshold(div), iters(i){} /***********************************************************************/ string Slayer::getResults(Sequence* query, vector refSeqs) { try { -cout << "refSeqs = " << refSeqs.size() << endl; vector all; all.clear(); for (int i = 0; i < refSeqs.size(); i++) { @@ -46,7 +45,6 @@ cout << "refSeqs = " << refSeqs.size() << endl; float snpRateLeft = numSNPSLeft / (float) winSizeLeft; float snpRateRight = numSNPSRight / (float) winSizeRight; - float logR = log(snpRateLeft / snpRateRight) / log(2); // do not accept excess snp ratio on either side of the break @@ -54,7 +52,7 @@ cout << "refSeqs = " << refSeqs.size() << endl; float BS_A, BS_B; bootstrapSNPS(snpsLeft, snpsRight, BS_A, BS_B); - + divs[k].bsa = BS_A; divs[k].bsb = BS_B; @@ -101,18 +99,17 @@ vector Slayer::runBellerophon(Sequence* q, Sequence* pA, Sequence* try{ vector data; -cout << q->getName() << '\t' << q->getAligned().length() << endl; + //vertical filter vector temp; temp.push_back(q); temp.push_back(pA); temp.push_back(pB); - verticalFilter(temp); + map spots = verticalFilter(temp); //get these to avoid numerous function calls string query = q->getAligned(); string parentA = pA->getAligned(); string parentB = pB->getAligned(); int length = query.length(); -cout << q->getName() << '\t' << length << endl; //check window size if (length < (2*windowSize+windowStep)) { @@ -170,10 +167,10 @@ cout << q->getName() << '\t' << length << endl; member.rab = RAB; member.qra = QRA; member.qlb = QLB; - member.winLStart = 0; - member.winLEnd = breakpoint; - member.winRStart = breakpoint+1; - member.winREnd = length-1; + member.winLStart = spots[0]; + member.winLEnd = spots[breakpoint]; //so breakpoint reflects spot in alignment before filter + member.winRStart = spots[breakpoint+1]; + member.winREnd = spots[length-1]; member.querySeq = *(q); member.parentA = *(pA); member.parentB = *(pB); @@ -238,15 +235,16 @@ void Slayer::bootstrapSNPS(vector left, vector right, float& BSA, fl int numLeft = max(1, int(left.size()/10 +0.5)); int numRight = max(1, int(right.size()/10 + 0.5)); - for (int i = 0; i < 100; i++) { + for (int i = 0; i < iters; i++) { //random sampling with replacement. vector selectedLeft; + for (int j = 0; j < numLeft; j++) { int index = int(rand() % left.size()); selectedLeft.push_back(left[index]); } - + vector selectedRight; for (int j = 0; j < numRight; j++) { int index = int(rand() % right.size()); @@ -268,7 +266,7 @@ void Slayer::bootstrapSNPS(vector left, vector right, float& BSA, fl float QLB = snpQB(selectedLeft); float QRB = snpQB(selectedRight); - + //in original - not used - not sure why? //float ALB = snpAB(selectedLeft); //float ARB = snpAB(selectedRight); @@ -283,8 +281,8 @@ void Slayer::bootstrapSNPS(vector left, vector right, float& BSA, fl } - BSA = (float) count_A; - BSB = (float) count_B; + BSA = ((float) count_A / (float) iters) * 100; + BSB = ((float) count_B / (float) iters) * 100; } catch(exception& e) { @@ -304,7 +302,7 @@ float Slayer::snpQA(vector data) { } } - float percentID = (numIdentical / data.size()) * 100; + float percentID = (numIdentical / (float) data.size()) * 100; return percentID; } @@ -325,7 +323,7 @@ float Slayer::snpQB(vector data) { } } - float percentID = (numIdentical / data.size()) * 100; + float percentID = (numIdentical / (float) data.size()) * 100; return percentID; @@ -346,7 +344,7 @@ float Slayer::snpAB(vector data) { } } - float percentID = (numIdentical / data.size()) * 100; + float percentID = (numIdentical / (float) data.size()) * 100; return percentID; @@ -380,7 +378,7 @@ float Slayer::computePercentID(string queryFrag, string parent, int left, int ri } /***********************************************************************/ //remove columns that contain any gaps -void Slayer::verticalFilter(vector seqs) { +map Slayer::verticalFilter(vector seqs) { try { vector gaps; gaps.resize(seqs[0]->getAligned().length(), 0); @@ -399,11 +397,17 @@ void Slayer::verticalFilter(vector seqs) { //zero out spot where all sequences have blanks int numColRemoved = 0; + int count = 0; + map maskMap; maskMap.clear(); for(int i = 0; i < seqs[0]->getAligned().length(); i++){ if(gaps[i] != 0) { filterString[i] = '0'; numColRemoved++; } + else { + maskMap[count] = i; + count++; + } } - + //for each sequence for (int i = 0; i < seqs.size(); i++) { @@ -417,6 +421,8 @@ void Slayer::verticalFilter(vector seqs) { seqs[i]->setAligned(newAligned); } + + return maskMap; } catch(exception& e) { errorOut(e, "Slayer", "verticalFilter"); diff --git a/slayer.h b/slayer.h index ea79727..dc13e6d 100644 --- a/slayer.h +++ b/slayer.h @@ -64,7 +64,7 @@ class Slayer { public: - Slayer(int, int, int, float); + Slayer(int, int, int, float, int); ~Slayer() {}; string getResults(Sequence*, vector); @@ -73,11 +73,11 @@ class Slayer { private: - int windowSize, windowStep, parentFragmentThreshold; + int windowSize, windowStep, parentFragmentThreshold, iters; float divRThreshold; vector outputResults; - void verticalFilter(vector); + map verticalFilter(vector); float computePercentID(string, string, int, int); vector runBellerophon(Sequence*, Sequence*, Sequence*); -- 2.39.2