From 861f46b74c17adec8c6ad6d89f232ae7485797bf Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 8 Jul 2009 13:31:28 +0000 Subject: [PATCH] added list.seqs command --- Mothur.xcodeproj/project.pbxproj | 6 + aligncommand.cpp | 8 +- chimeraseqscommand.cpp | 391 ++++++++++++++++++++++--------- chimeraseqscommand.h | 28 ++- commandfactory.cpp | 8 +- eachgapdist.h | 2 + globaldata.hpp | 9 +- listseqscommand.cpp | 240 +++++++++++++++++++ listseqscommand.h | 37 +++ mothur.h | 2 +- nastreport.cpp | 2 +- rarefact.cpp | 11 +- rarefact.h | 4 +- rarefactsharedcommand.cpp | 15 +- rarefactsharedcommand.h | 2 +- sequence.cpp | 2 +- 16 files changed, 618 insertions(+), 149 deletions(-) create mode 100644 listseqscommand.cpp create mode 100644 listseqscommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index b8a1057..f5bc8ad 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -76,6 +76,7 @@ 37AD4DCA0F28F3DD00AA2D49 /* readtree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4DC90F28F3DD00AA2D49 /* readtree.cpp */; }; 37AFC71F0F445386005F492D /* sharedsobscollectsummary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AFC71E0F445386005F492D /* sharedsobscollectsummary.cpp */; }; 37B28F680F27590100808A62 /* deconvolutecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37B28F670F27590100808A62 /* deconvolutecommand.cpp */; }; + 37B73C761004BEFD008C4B41 /* listseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37B73C751004BEFD008C4B41 /* listseqscommand.cpp */; }; 37C1D9730F86506E0059E3F0 /* binsequencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */; }; 37C753CE0FB3415200DBD02E /* distancecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37C753CD0FB3415200DBD02E /* distancecommand.cpp */; }; 37D928550F21331F001D4494 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37D927B80F21331F001D4494 /* ace.cpp */; }; @@ -318,6 +319,8 @@ 37AFC71E0F445386005F492D /* sharedsobscollectsummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedsobscollectsummary.cpp; sourceTree = SOURCE_ROOT; }; 37B28F660F27590100808A62 /* deconvolutecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deconvolutecommand.h; sourceTree = SOURCE_ROOT; }; 37B28F670F27590100808A62 /* deconvolutecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deconvolutecommand.cpp; sourceTree = SOURCE_ROOT; }; + 37B73C741004BEFD008C4B41 /* listseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = listseqscommand.h; sourceTree = ""; }; + 37B73C751004BEFD008C4B41 /* listseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = listseqscommand.cpp; sourceTree = ""; }; 37C1D9710F86506E0059E3F0 /* binsequencecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = binsequencecommand.h; sourceTree = SOURCE_ROOT; }; 37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = binsequencecommand.cpp; sourceTree = SOURCE_ROOT; }; 37C753CC0FB3415200DBD02E /* distancecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = distancecommand.h; sourceTree = SOURCE_ROOT; }; @@ -788,6 +791,8 @@ 37D927E30F21331F001D4494 /* helpcommand.cpp */, 375873F40F7D648F0040F377 /* libshuffcommand.h */, 375873F30F7D648F0040F377 /* libshuffcommand.cpp */, + 37B73C741004BEFD008C4B41 /* listseqscommand.h */, + 37B73C751004BEFD008C4B41 /* listseqscommand.cpp */, 21E859D60FC4632E005E1A48 /* matrixoutputcommand.h */, 21E859D70FC4632E005E1A48 /* matrixoutputcommand.cpp */, 7E5A17AD0FE57292003C6A03 /* mergefilecommand.h */, @@ -1111,6 +1116,7 @@ 378599100FDD7E8E00EF9D03 /* optionparser.cpp in Sources */, 7E5A17AF0FE57292003C6A03 /* mergefilecommand.cpp in Sources */, 379D3D510FF90E090068C1C0 /* chimeraseqscommand.cpp in Sources */, + 37B73C761004BEFD008C4B41 /* listseqscommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp index 0f5c86f..5af8e95 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -195,10 +195,12 @@ int AlignCommand::execute(){ ifstream inFASTA; openInputFile(candidateFileName, inFASTA); + string input; while(!inFASTA.eof()){ - char c = inFASTA.get(); - if(c == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos-1); } - while (!inFASTA.eof()) { c = inFASTA.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + getline(inFASTA, input); + if (input.length() != 0) { + if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } } inFASTA.close(); diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index 182c25e..f19fe11 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -8,6 +8,7 @@ */ #include "chimeraseqscommand.h" +#include "eachgapdist.h" //*************************************************************************************************************** @@ -20,7 +21,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method" }; + string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -39,16 +40,24 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true; } string temp; - temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; } + temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; } filter = isTrue(temp); temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; } correction = isTrue(temp); - temp = validParameter.validFile(parameters, "processors", true); if (temp == "not found") { temp = "1"; } + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } convert(temp, processors); + temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; } + convert(temp, window); + + temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "10"; } + convert(temp, increment); + method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "bellerophon"; } + + if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine(); abort = true; } } } @@ -77,6 +86,11 @@ void ChimeraSeqsCommand::help(){ exit(1); } } +//******************************************************************************************************************** +//sorts highest score to lowest +inline bool comparePref(Preference left, Preference right){ + return (left.score[0] > right.score[0]); +} //*************************************************************************************************************** @@ -89,9 +103,10 @@ int ChimeraSeqsCommand::execute(){ if (abort == true) { return 0; } + //do soft filter if (filter) { - string optionString = "fasta=" + fastafile + ", soft=50.0, vertical=F"; + string optionString = "fasta=" + fastafile + ", soft=50, vertical=F"; filterSeqs = new FilterSeqsCommand(optionString); filterSeqs->execute(); delete filterSeqs; @@ -100,76 +115,169 @@ int ChimeraSeqsCommand::execute(){ fastafile = getRootName(fastafile) + "filter.fasta"; } + distCalculator = new eachGapDist(); + //read in sequences readSeqs(); - //int numSeqs = seqs.size(); + int numSeqs = seqs.size(); - //find average midpoint of seqs - midpoint = findAverageMidPoint(); + if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; } - //create 2 vectors of sequences, 1 for left side and one for right side - vector left; vector right; + //set default window to 25% of sequence length + string seq0 = seqs[0].getAligned(); + if (window == 0) { window = seq0.length() / 4; } + else if (window > (seq0.length() / 2)) { + mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine(); + window = (seq0.length() / 2); + } - for (int i = 0; i < seqs.size(); i++) { - //save left side - string seqLeft = seqs[i].getAligned(); - seqLeft = seqLeft.substr(0, midpoint); - Sequence tempLeft(seqs[i].getName(), seqLeft); - left.push_back(tempLeft); + if (increment > (seqs[0].getAlignLength() - (2*window))) { + if (increment != 10) { - //save right side - string seqRight = seqs[i].getAligned(); - seqRight = seqRight.substr(midpoint+1, (seqRight.length()-midpoint-1)); - Sequence tempRight(seqs[i].getName(), seqRight); - right.push_back(tempRight); + mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine(); + increment = 10; + if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; } + + }else{ increment = 0; } + } +cout << "increment = " << increment << endl; + if (increment == 0) { iters = 1; } + else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); } + + //initialize pref + pref.resize(numSeqs); + + for (int i = 0; i < numSeqs; i++ ) { + pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3); + pref[i].name = seqs[i].getName(); + pref[i].score[0] = 0.0; pref[i].score[1] = 0.0; + pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0; + pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0; } + + int midpoint = window; + int count = 0; + while (count < iters) { + + //create 2 vectors of sequences, 1 for left side and one for right side + vector left; vector right; + + for (int i = 0; i < seqs.size(); i++) { +//cout << "whole = " << seqs[i].getAligned() << endl; + //save left side + string seqLeft = seqs[i].getAligned().substr(midpoint-window, window); + Sequence tempLeft; + tempLeft.setName(seqs[i].getName()); + tempLeft.setAligned(seqLeft); + left.push_back(tempLeft); +//cout << "left = " << tempLeft.getAligned() << endl; + //save right side + string seqRight = seqs[i].getAligned().substr(midpoint, window); + Sequence tempRight; + tempRight.setName(seqs[i].getName()); + tempRight.setAligned(seqRight); + right.push_back(tempRight); +//cout << "right = " << seqRight << endl; + } + + //adjust midpoint by increment + midpoint += increment; + + + //this should be parallelized + //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | ) + //create a matrix containing the distance from left to left and right to right + //calculate distances + SparseMatrix* SparseLeft = new SparseMatrix(); + SparseMatrix* SparseRight = new SparseMatrix(); + + createSparseMatrix(0, left.size(), SparseLeft, left); + createSparseMatrix(0, right.size(), SparseRight, right); + + vector distMapRight; + vector distMapLeft; + + // Create a data structure to quickly access the distance information. + // It consists of a vector of distance maps, where each map contains + // all distances of a certain sequence. Vector and maps are accessed + // via the index of a sequence in the distance matrix + distMapRight = vector(numSeqs); + distMapLeft = vector(numSeqs); + //cout << "left" << endl << endl; + for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) { + distMapLeft[currentCell->row][currentCell->column] = currentCell->dist; + //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; + } + //cout << "right" << endl << endl; + for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) { + distMapRight[currentCell->row][currentCell->column] = currentCell->dist; + //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; + } + + delete SparseLeft; + delete SparseRight; - //this should be parallelized - //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | ) - //create a matrix containing the distance from left to left and right to right - //calculate distances - SparseMatrix* SparseLeft = new SparseMatrix(); - SparseMatrix* SparseRight = new SparseMatrix(); + + //fill preference structure + generatePreferences(distMapLeft, distMapRight, midpoint); + + count++; + + } - createSparseMatrix(0, left.size(), SparseLeft, left); - createSparseMatrix(0, right.size(), SparseRight, right); + delete distCalculator; + //find average pref score across windows + //if (increment != 0) { + + //for (int i = 0; i < pref.size(); i++) { + //pref[i].score[0] = pref[i].score[0] / iters; + //} + //} - //vector distMapRight; - //vector distMapLeft; + //sort Preferences highest to lowest + sort(pref.begin(), pref.end(), comparePref); - // Create a data structure to quickly access the distance information. - // It consists of a vector of distance maps, where each map contains - // all distances of a certain sequence. Vector and maps are accessed - // via the index of a sequence in the distance matrix - //distMapRight = vector(globaldata->gListVector->size()); - //distMapLeft = vector(globaldata->gListVector->size()); - for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) { - //distMapLeft[currentCell->row][currentCell->column] = currentCell->dist; - } - for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) { - //distMapRight[currentCell->row][currentCell->column] = currentCell->dist; - } - + string outputFileName = getRootName(fastafile) + "chimeras"; + ofstream out; + openOutputFile(outputFileName, out); - //fill preference structure - //generatePreferences(distMapLeft, distMapRight); + int above1 = 0; + out << "Name\tScore\tLeft\tRight\t" << endl; + //output prefenence structure to .chimeras file + for (int i = 0; i < pref.size(); i++) { + out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl; + + //calc # of seqs with preference above 1.0 + if (pref[i].score[0] > 1.0) { + above1++; + mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine(); + mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine(); + } + + + } - - //output results to screen + //output results to screen mothurOutEndLine(); - mothurOut("\t\t"); mothurOutEndLine(); - //mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); mothurOutEndLine(); - //mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25])); mothurOutEndLine(); - //mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25])); mothurOutEndLine(); - //mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50])); mothurOutEndLine(); - //mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75])); mothurOutEndLine(); - //mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5])); mothurOutEndLine(); - //mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100])); mothurOutEndLine(); - //mothurOut("# of Seqs:\t" + toString(numSeqs)); mothurOutEndLine(); + mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine(); + int spot; + spot = pref.size()-1; + mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.975; + mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.75; + mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.50; + mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.25; + mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.025; + mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = 0; + mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - //outSummary.close(); return 0; } catch(exception& e) { @@ -188,6 +296,8 @@ void ChimeraSeqsCommand::readSeqs(){ while(!inFASTA.eof()){ Sequence current(inFASTA); + if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); } + seqs.push_back(current); gobble(inFASTA); @@ -201,53 +311,6 @@ void ChimeraSeqsCommand::readSeqs(){ } } - -//*************************************************************************************************************** -int ChimeraSeqsCommand::findAverageMidPoint(){ - try { - int totalMids = 0; - int averageMid = 0; - - //loop through the seqs and find midpoint - for (int i = 0; i < seqs.size(); i++) { - - //get unaligned sequence - seqs[i].setUnaligned(seqs[i].getUnaligned()); //if you read an aligned file the unaligned is really aligned, so we need to make sure its unaligned - - string unaligned = seqs[i].getUnaligned(); - string aligned = seqs[i].getAligned(); - - //find midpoint of this seq - int count = 0; - int thismid = 0; - for (int j = 0; j < aligned.length(); j++) { - - thismid++; - - //if you are part of the unaligned sequence increment - if (isalpha(aligned[j])) { count++; } - - //if you have reached the halfway point stop - if (count >= (unaligned.length() / 2)) { break; } - } - - //add this mid to total - totalMids += thismid; - - } - - averageMid = (totalMids / seqs.size()); - - return averageMid; - - - } - catch(exception& e) { - errorOut(e, "ChimeraSeqsCommand", "findAverageMidPoint"); - exit(1); - } -} - /***************************************************************************************************************/ int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector s){ try { @@ -256,9 +319,9 @@ int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatri for(int j=0;jcalcDist(s.get(i), s.get(j)); + distCalculator->calcDist(s[i], s[j]); float dist = distCalculator->getDist(); - + PCell temp(i, j, dist); sparse->addCell(temp); @@ -273,23 +336,123 @@ int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatri exit(1); } } -/*************************************************************************************************************** -void ChimeraSeqsCommand::generatePreferences(vector left, vector right){ +/***************************************************************************************************************/ +void ChimeraSeqsCommand::generatePreferences(vector left, vector right, int mid){ try { + + float dme = 0.0; + SeqMap::iterator itR; + SeqMap::iterator itL; + + //initialize pref[i] + for (int i = 0; i < pref.size(); i++) { + pref[i].score[1] = 0.0; + pref[i].closestLeft[1] = 100000.0; + pref[i].closestRight[1] = 100000.0; + pref[i].leftParent[1] = ""; + pref[i].rightParent[1] = ""; + } +//cout << "in generate left.size() = " << left.size() << endl; for (int i = 0; i < left.size(); i++) { - int iscore = 0; - float closestLeft = 100000.0; - float closestRight = 100000.0; + SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0, + // 1 to the distance of sequence 3 to sequence 1, + // 2 to the distance of sequence 3 to sequence 2. + SeqMap currentRight = right[i]; // same as left but with distances on the right side. - for (int j = 0; j < left.size(); j++) { + for (int j = 0; j < i; j++) { - //iscore += abs(left - + itL = currentLeft.find(j); + itR = currentRight.find(j); +cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl; +cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl; + + //if you can find this entry update the preferences + if ((itL != currentLeft.end()) && (itR != currentRight.end())) { + + if (!correction) { + pref[i].score[1] += abs((itL->second - itR->second)); + pref[j].score[1] += abs((itL->second - itR->second)); +cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; +cout << "abs = " << abs((itL->second - itR->second)) << endl; +cout << i << " score = " << pref[i].score[1] << endl; +cout << j << " score = " << pref[j].score[1] << endl; + }else { + pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); + pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); +cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; +cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl; +cout << i << " score = " << pref[i].score[1] << endl; +cout << j << " score = " << pref[j].score[1] << endl; + } +cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl; + //are you the closest left sequence + if (itL->second < pref[i].closestLeft[1]) { + + pref[i].closestLeft[1] = itL->second; + pref[i].leftParent[1] = seqs[j].getName(); +cout << "updating closest left to " << pref[i].leftParent[1] << endl; + } +cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl; + if (itL->second < pref[j].closestLeft[1]) { + pref[j].closestLeft[1] = itL->second; + pref[j].leftParent[1] = seqs[i].getName(); +cout << "updating closest left to " << pref[j].leftParent[1] << endl; + } + + //are you the closest right sequence + if (itR->second < pref[i].closestRight[1]) { + pref[i].closestRight[1] = itR->second; + pref[i].rightParent[1] = seqs[j].getName(); + } + if (itR->second < pref[j].closestRight[1]) { + pref[j].closestRight[1] = itR->second; + pref[j].rightParent[1] = seqs[i].getName(); + } + + } } } + + + + //calculate the dme + int count0 = 0; + for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } } + + float expectedPercent = 1 / (float) (pref.size() - count0); +cout << endl << "dme = " << dme << endl; + //recalculate prefernences based on dme + for (int i = 0; i < pref.size(); i++) { +cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl; + // gives the actual percentage of the dme this seq adds + pref[i].score[1] = pref[i].score[1] / dme; + + //how much higher or lower is this than expected + pref[i].score[1] = pref[i].score[1] / expectedPercent; + + //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher. +cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl; + } + + //is this score bigger then the last score + for (int i = 0; i < pref.size(); i++) { + + //update biggest score + if (pref[i].score[1] > pref[i].score[0]) { + pref[i].score[0] = pref[i].score[1]; + pref[i].leftParent[0] = pref[i].leftParent[1]; + pref[i].rightParent[0] = pref[i].rightParent[1]; + pref[i].closestLeft[0] = pref[i].closestLeft[1]; + pref[i].closestRight[0] = pref[i].closestRight[1]; + pref[i].midpoint = mid; + } + + //total of preference scores across windows + //pref[i].score[0] += pref[i].score[1]; + } } catch(exception& e) { diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h index 5b55120..2398ce6 100644 --- a/chimeraseqscommand.h +++ b/chimeraseqscommand.h @@ -20,6 +20,19 @@ typedef list::iterator MatData; typedef map SeqMap; //maps sequence to all distance for that seqeunce +struct Preference { + string name; + vector leftParent; //keep the name of closest left associated with the two scores + vector rightParent; //keep the name of closest right associated with the two scores + vector score; //so you can keep last score and calc this score and keep whichever is bigger. + vector closestLeft; //keep the closest left associated with the two scores + vector closestRight; //keep the closest right associated with the two scores + int midpoint; + +}; + + + /***********************************************************/ class ChimeraSeqsCommand : public Command { @@ -29,28 +42,21 @@ public: int execute(); void help(); + private: - //Dist* distCalculator; - struct Preference { - string leftParent; - string rightParent; - float score; - - }; - Dist* distCalculator; bool abort; string method, fastafile; bool filter, correction; - int processors, midpoint; + int processors, midpoint, averageLeft, averageRight, window, iters, increment; FilterSeqsCommand* filterSeqs; + ListVector* list; vector seqs; vector pref; - int findAverageMidPoint(); void readSeqs(); - void generatePreferences(SparseMatrix*, SparseMatrix*); + void generatePreferences(vector, vector, int); int createSparseMatrix(int, int, SparseMatrix*, vector); diff --git a/commandfactory.cpp b/commandfactory.cpp index 43f6769..3d90269 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -49,6 +49,8 @@ #include "reversecommand.h" #include "trimseqscommand.h" #include "mergefilecommand.h" +#include "chimeraseqscommand.h" +#include "listseqscommand.h" /***********************************************************/ @@ -96,6 +98,8 @@ CommandFactory::CommandFactory(){ commands["screen.seqs"] = "screen.seqs"; commands["reverse.seqs"] = "reverse.seqs"; commands["trim.seqs"] = "trim.seqs"; + commands["chimera.seqs"] = "chimera.seqs"; + commands["list.seqs"] = "list.seqs"; commands["quit"] = "quit"; } @@ -145,13 +149,15 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "tree.shared") { command = new TreeGroupCommand(optionString); } else if(commandName == "dist.shared") { command = new MatrixOutputCommand(optionString); } else if(commandName == "bootstrap.shared") { command = new BootSharedCommand(optionString); } - //else if(commandName == "consensus") { command = new ConcensusCommand(optionString); } + //else if(commandName == "consensus") { command = new ConcensusCommand(optionString); } else if(commandName == "dist.seqs") { command = new DistanceCommand(optionString); } else if(commandName == "align.seqs") { command = new AlignCommand(optionString); } else if(commandName == "summary.seqs") { command = new SeqSummaryCommand(optionString); } else if(commandName == "screen.seqs") { command = new ScreenSeqsCommand(optionString); } else if(commandName == "reverse.seqs") { command = new ReverseSeqsCommand(optionString); } else if(commandName == "trim.seqs") { command = new TrimSeqsCommand(optionString); } + else if(commandName == "chimera.seqs") { command = new ChimeraSeqsCommand(optionString); } + else if(commandName == "list.seqs") { command = new ListSeqsCommand(optionString); } else if(commandName == "merge.files") { command = new MergeFileCommand(optionString); } else { command = new NoCommand(optionString); } diff --git a/eachgapdist.h b/eachgapdist.h index d21e55c..ac9b453 100644 --- a/eachgapdist.h +++ b/eachgapdist.h @@ -23,6 +23,7 @@ public: string seqA = A.getAligned(); string seqB = B.getAligned(); + int alignLength = seqA.length(); for(int i=0; i Estimators, Groups; //holds estimators to be used set lines; //hold lines to be used set labels; //holds labels to be used @@ -57,18 +57,13 @@ public: string getNameFile(); //do we need this? string getGroupFile(); //do we need this? string getOrderFile(); -// string getFastaFile(); string getTreeFile(); string getSharedFile(); string getFormat(); //do we need this? -// string getCandidateFile(); -// string getTemplateFile(); + void setListFile(string); -// void setFastaFile(string); void setTreeFile(string); -// void setCandidateFile(string); -// void setTemplateFile(string); void setGroupFile(string); //do we need this? void setPhylipFile(string); void setColumnFile(string); diff --git a/listseqscommand.cpp b/listseqscommand.cpp new file mode 100644 index 0000000..a7d79f3 --- /dev/null +++ b/listseqscommand.cpp @@ -0,0 +1,240 @@ +/* + * listseqscommand.cpp + * Mothur + * + * Created by Sarah Westcott on 7/8/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "listseqscommand.h" +#include "sequence.hpp" + +//********************************************************************************************************************** + +ListSeqsCommand::ListSeqsCommand(string option){ + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta","name", "group", "align" }; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not open") { abort = true; } + else if (fastafile == "not found") { fastafile = ""; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } + + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { abort = true; } + else if (groupfile == "not found") { groupfile = ""; } + + alignfile = validParameter.validFile(parameters, "align", true); + if (alignfile == "not open") { abort = true; } + else if (alignfile == "not found") { alignfile = ""; } + + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "")) { mothurOut("You must provide a file."); mothurOutEndLine(); abort = true; } + + if (parameters.size() > 1) { mothurOut("You may only enter one file."); mothurOutEndLine(); abort = true; } + } + + } + catch(exception& e) { + errorOut(e, "ListSeqsCommand", "ListSeqsCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void ListSeqsCommand::help(){ + try { + mothurOut("The list.seqs command reads a fasta, name, group or alignreport file and outputs a .accnos file containing sequence names.\n"); + mothurOut("The list.seqs command parameters are fasta, name, group and align. You must provide one of these parameters.\n"); + mothurOut("The list.seqs command should be in the following format: list.seqs(fasta=yourFasta).\n"); + mothurOut("Example list.seqs(fasta=amazon.fasta).\n"); + mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n"); + } + catch(exception& e) { + errorOut(e, "ListSeqsCommand", "help"); + exit(1); + } +} + +//********************************************************************************************************************** + +int ListSeqsCommand::execute(){ + try { + + if (abort == true) { return 0; } + + //read functions fill names vector + if (fastafile != "") { inputFileName = fastafile; readFasta(); } + else if (namefile != "") { inputFileName = namefile; readName(); } + else if (groupfile != "") { inputFileName = groupfile; readGroup(); } + else if (alignfile != "") { inputFileName = alignfile; readAlign(); } + + //sort in alphabetical order + sort(names.begin(), names.end()); + + string outputFileName = getRootName(inputFileName) + "accnos"; + ofstream out; + openOutputFile(outputFileName, out); + + //output to .accnos file + for (int i = 0; i < names.size(); i++) { + out << names[i] << endl; + } + out.close(); + + return 0; + } + + catch(exception& e) { + errorOut(e, "ListSeqsCommand", "execute"); + exit(1); + } +} + +//********************************************************************************************************************** +void ListSeqsCommand::readFasta(){ + try { + + ifstream in; + openInputFile(fastafile, in); + string name; + + while(!in.eof()){ + Sequence currSeq(in); + name = currSeq.getName(); + + names.push_back(name); + + gobble(in); + } + in.close(); + + } + catch(exception& e) { + errorOut(e, "ListSeqsCommand", "readFasta"); + exit(1); + } +} + +//********************************************************************************************************************** +void ListSeqsCommand::readName(){ + try { + + ifstream in; + openInputFile(namefile, in); + string name, firstCol, secondCol; + + while(!in.eof()){ + + in >> firstCol; + in >> secondCol; + + //parse second column saving each name + while (secondCol.find_first_of(',') != -1) { + name = secondCol.substr(0,secondCol.find_first_of(',')); + secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length()); + names.push_back(name); + } + + //get name after last , + names.push_back(secondCol); + + gobble(in); + } + in.close(); + + } + catch(exception& e) { + errorOut(e, "ListSeqsCommand", "readName"); + exit(1); + } +} + +//********************************************************************************************************************** +void ListSeqsCommand::readGroup(){ + try { + + ifstream in; + openInputFile(groupfile, in); + string name, group; + + while(!in.eof()){ + + in >> name; //read from first column + in >> group; //read from second column + + names.push_back(name); + + gobble(in); + } + in.close(); + + } + catch(exception& e) { + errorOut(e, "ListSeqsCommand", "readGroup"); + exit(1); + } +} + +//********************************************************************************************************************** +//alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name +void ListSeqsCommand::readAlign(){ + try { + + ifstream in; + openInputFile(alignfile, in); + string name, junk; + + //read column headers + for (int i = 0; i < 16; i++) { + if (!in.eof()) { in >> junk; } + else { break; } + } + + + while(!in.eof()){ + + in >> name; //read from first column + + //read rest + for (int i = 0; i < 15; i++) { + if (!in.eof()) { in >> junk; } + else { break; } + } + + names.push_back(name); + + gobble(in); + } + in.close(); + + + } + catch(exception& e) { + errorOut(e, "ListSeqsCommand", "readAlign"); + exit(1); + } +} +//********************************************************************************************************************** diff --git a/listseqscommand.h b/listseqscommand.h new file mode 100644 index 0000000..4e0febd --- /dev/null +++ b/listseqscommand.h @@ -0,0 +1,37 @@ +#ifndef LISTSEQSCOMMAND_H +#define LISTSEQSCOMMAND_H + +/* + * listseqscommand.h + * Mothur + * + * Created by Sarah Westcott on 7/8/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "command.hpp" + +class ListSeqsCommand : public Command { + + public: + + ListSeqsCommand(string); + ~ListSeqsCommand(){}; + int execute(); + void help(); + + private: + vector names; + string fastafile, namefile, groupfile, alignfile, inputFileName; + bool abort; + + void readFasta(); + void readName(); + void readGroup(); + void readAlign(); + +}; + +#endif + diff --git a/mothur.h b/mothur.h index 4d22e98..5d9dc96 100644 --- a/mothur.h +++ b/mothur.h @@ -231,7 +231,7 @@ inline void errorOut(exception& e, string object, string function) { mothurOut("Error: "); mothurOut(toString(e.what())); - mothurOut(" has occurred in the " + object + " class function " + function + "Please contact Pat Schloss at pschloss@microbio.umass.edu, and be sure to include the mothur.logFile with your inquiry."); + mothurOut(" has occurred in the " + object + " class function " + function + ". Please contact Pat Schloss at pschloss@microbio.umass.edu, and be sure to include the mothur.logFile with your inquiry."); mothurOutEndLine(); } diff --git a/nastreport.cpp b/nastreport.cpp index d7be4dd..aa7d393 100644 --- a/nastreport.cpp +++ b/nastreport.cpp @@ -41,7 +41,7 @@ void NastReport::print(){ candidateReportFile << alignmentMethod << '\t' << candidateStartPosition << "\t" << candidateEndPosition << '\t'; candidateReportFile << templateStartPosition << "\t" << templateEndPosition << '\t'; candidateReportFile << pairwiseAlignmentLength << '\t' << totalGapsInQuery << '\t' << totalGapsInTemplate << '\t'; - candidateReportFile << longestInsert << '\t';; + candidateReportFile << longestInsert << '\t'; candidateReportFile << setprecision(2) << similarityToTemplate; candidateReportFile << endl; diff --git a/rarefact.cpp b/rarefact.cpp index f1b43fc..5cd9b99 100644 --- a/rarefact.cpp +++ b/rarefact.cpp @@ -81,14 +81,19 @@ try { rcd->registerDisplay(displays[i]); } + //if jumble is false all iters will be the same + if (globaldata->jumble == false) { nIters = 1; } + for(int iter=0;iterinit(label); } - - //randomize the groups - random_shuffle(lookup.begin(), lookup.end()); + + if (globaldata->jumble == true) { + //randomize the groups + random_shuffle(lookup.begin(), lookup.end()); + } //make merge the size of lookup[0] SharedRAbundVector* merge = new SharedRAbundVector(lookup[0]->size()); diff --git a/rarefact.h b/rarefact.h index aff5cb6..9507d5a 100644 --- a/rarefact.h +++ b/rarefact.h @@ -5,6 +5,7 @@ #include "raredisplay.h" #include "ordervector.hpp" #include "mothur.h" +#include "globaldata.hpp" class Rarefact { @@ -13,13 +14,14 @@ public: Rarefact(OrderVector* o, vector disp) : numSeqs(o->getNumSeqs()), order(o), displays(disp), label(o->getLabel()) {}; Rarefact(vector shared, vector disp) : - lookup(shared), displays(disp) {}; + lookup(shared), displays(disp) { globaldata = GlobalData::getInstance(); }; ~Rarefact(){}; void getCurve(int, int); void getSharedCurve(int, int); private: + GlobalData* globaldata; OrderVector* order; vector displays; int numSeqs, numGroupComb; diff --git a/rarefactsharedcommand.cpp b/rarefactsharedcommand.cpp index 71bfaf1..5914f15 100644 --- a/rarefactsharedcommand.cpp +++ b/rarefactsharedcommand.cpp @@ -29,7 +29,7 @@ RareFactSharedCommand::RareFactSharedCommand(string option){ else { //valid paramters for this command - string Array[] = {"iters","line","label","calc","groups"}; + string Array[] = {"iters","line","label","calc","groups", "jumble"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -92,6 +92,11 @@ RareFactSharedCommand::RareFactSharedCommand(string option){ temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } convert(temp, nIters); + temp = validParameter.validFile(parameters, "jumble", false); if (temp == "not found") { temp = "T"; } + if (isTrue(temp)) { jumble = true; } + else { jumble = false; } + globaldata->jumble = jumble; + if (abort == false) { string fileNameRoot = getRootName(globaldata->inputFileName); @@ -125,12 +130,12 @@ RareFactSharedCommand::RareFactSharedCommand(string option){ void RareFactSharedCommand::help(){ try { mothurOut("The rarefaction.shared command can only be executed after a successful read.otu command.\n"); - mothurOut("The rarefaction.shared command parameters are label, line, iters, groups and calc. No parameters are required, but you may not use \n"); + mothurOut("The rarefaction.shared command parameters are label, line, iters, groups, jumble and calc. No parameters are required, but you may not use \n"); mothurOut("both the line and label parameters at the same time. The rarefaction command should be in the following format: \n"); - mothurOut("rarefaction.shared(label=yourLabel, line=yourLines, iters=yourIters, calc=yourEstimators, groups=yourGroups).\n"); - mothurOut("Example rarefaction.shared(label=unique-.01-.03, line=0-5-10, iters=10000, groups=B-C, calc=sharedobserved).\n"); + mothurOut("rarefaction.shared(label=yourLabel, line=yourLines, iters=yourIters, calc=yourEstimators, jumble=yourJumble, groups=yourGroups).\n"); + mothurOut("Example rarefaction.shared(label=unique-.01-.03, line=0-5-10, iters=10000, groups=B-C, jumble=T, calc=sharedobserved).\n"); mothurOut("The default values for iters is 1000, freq is 100, and calc is sharedobserved which calculates the shared rarefaction curve for the observed richness.\n"); - mothurOut("The default value for groups is all the groups in your groupfile.\n"); + mothurOut("The default value for groups is all the groups in your groupfile, and jumble is true.\n"); validCalculator->printCalc("sharedrarefaction", cout); mothurOut("The label and line parameters are used to analyze specific lines in your input.\n"); mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n"); diff --git a/rarefactsharedcommand.h b/rarefactsharedcommand.h index c7290f4..b010957 100644 --- a/rarefactsharedcommand.h +++ b/rarefactsharedcommand.h @@ -49,7 +49,7 @@ private: int freq, nIters; string format; - bool abort, allLines; + bool abort, allLines, jumble; set lines; //hold lines to be used set labels; //holds labels to be used string line, label, calc, groups; diff --git a/sequence.cpp b/sequence.cpp index aa1f1c7..685f072 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -53,7 +53,7 @@ Sequence::Sequence(ifstream& fastaFile){ } } - if(sequence.find_first_of('-') != string::npos){ // if there are any gaps in the sequence, assume that it is + if((sequence.find_first_of('-') != string::npos) || (sequence.find_first_of('.') != string::npos)) { // if there are any gaps in the sequence, assume that it is setAligned(sequence); // an alignment file } setUnaligned(sequence); // also set the unaligned sequence file -- 2.39.2