From: pschloss Date: Wed, 3 Jun 2009 01:39:31 +0000 (+0000) Subject: addition of summary.seq command [pds] X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=5b7ac70116137b52dd7884b76c5bca660a5fea38 addition of summary.seq command [pds] --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 6c86778..245ed55 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -145,6 +145,7 @@ 7E412F490F8D21B600381DD0 /* slibshuff.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E412F480F8D21B600381DD0 /* slibshuff.cpp */; }; 7E412FEA0F8D3E2C00381DD0 /* libshuff.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E412FE90F8D3E2C00381DD0 /* libshuff.cpp */; }; 7E4130F80F8E58FA00381DD0 /* dlibshuff.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E4130F60F8E58FA00381DD0 /* dlibshuff.cpp */; }; + 7E9214820FD2081200699BF3 /* seqsummarycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7E9214810FD2081200699BF3 /* seqsummarycommand.cpp */; }; 7EC3D4550FA0FFF900338DA5 /* coverage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4500FA0FFF900338DA5 /* coverage.cpp */; }; 7EC3D4560FA0FFF900338DA5 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */; }; 8DD76F6A0486A84900D96B5E /* Mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6859E8B029090EE04C91782 /* Mothur.1 */; }; @@ -468,6 +469,8 @@ 7E412FE90F8D3E2C00381DD0 /* libshuff.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = libshuff.cpp; sourceTree = SOURCE_ROOT; }; 7E4130F60F8E58FA00381DD0 /* dlibshuff.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = dlibshuff.cpp; sourceTree = SOURCE_ROOT; }; 7E4130F70F8E58FA00381DD0 /* dlibshuff.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dlibshuff.h; sourceTree = SOURCE_ROOT; }; + 7E9214800FD2081200699BF3 /* seqsummarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqsummarycommand.h; sourceTree = ""; }; + 7E9214810FD2081200699BF3 /* seqsummarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqsummarycommand.cpp; sourceTree = ""; }; 7EC3D4500FA0FFF900338DA5 /* coverage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = coverage.cpp; sourceTree = SOURCE_ROOT; }; 7EC3D4510FA0FFF900338DA5 /* coverage.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = coverage.h; sourceTree = SOURCE_ROOT; }; 7EC3D4520FA0FFF900338DA5 /* sharedanderberg.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedanderberg.h; sourceTree = SOURCE_ROOT; }; @@ -744,6 +747,8 @@ 37D928A90F2133E5001D4494 /* commands */ = { isa = PBXGroup; children = ( + 7E9214800FD2081200699BF3 /* seqsummarycommand.h */, + 7E9214810FD2081200699BF3 /* seqsummarycommand.cpp */, 37D927CD0F21331F001D4494 /* command.hpp */, 378DC5CD0FBDE1C8003B8607 /* aligncommand.h */, 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */, @@ -1093,6 +1098,7 @@ 373C699C0FC1E63600137ACD /* solow.cpp in Sources */, EB72FE260FC1F5CA0051AC11 /* shen.cpp in Sources */, 21E859D80FC4632E005E1A48 /* matrixoutputcommand.cpp in Sources */, + 7E9214820FD2081200699BF3 /* seqsummarycommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/commandfactory.cpp b/commandfactory.cpp index 06e7e84..0edb024 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -43,7 +43,7 @@ #include "distancecommand.h" #include "aligncommand.h" #include "matrixoutputcommand.h" - +#include "seqsummarycommand.h" /***********************************************************/ @@ -97,6 +97,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "concensus") { command = new ConcensusCommand(); } else if(commandName == "dist.seqs") { command = new DistanceCommand(); } else if(commandName == "align.seqs") { command = new AlignCommand(); } + else if(commandName == "summary.seqs") { command = new SeqSummaryCommand(); } else { command = new NoCommand(); } return command; diff --git a/database.cpp b/database.cpp index 8c080db..5b93321 100644 --- a/database.cpp +++ b/database.cpp @@ -30,10 +30,10 @@ Database::Database(string fastaFileName){ // This assumes that the template dat string seqName, sequence; for(int i=0;i> seqName; - templateSequences[i]->setName(seqName); +// templateSequences[i]->setName(seqName); char letter; string aligned; @@ -45,8 +45,9 @@ Database::Database(string fastaFileName){ // This assumes that the template dat aligned += letter; } } - templateSequences[i]->setAligned(aligned); // Obviously, we need the fully aligned template sequence - templateSequences[i]->setUnaligned(aligned);// We will also need the unaligned sequence for pairwise alignments + templateSequences[i] = new Sequence(seqName, aligned); +// templateSequences[i]->setAligned(aligned); // Obviously, we need the fully aligned template sequence +// templateSequences[i]->setUnaligned(aligned);// We will also need the unaligned sequence for pairwise alignments fastaFile.putback(letter); // and database searches } diff --git a/distancecommand.cpp b/distancecommand.cpp index 6fee651..1bc16f3 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -33,7 +33,7 @@ DistanceCommand::DistanceCommand(){ }else if (globaldata->Estimators[i] == "eachgap") { distCalculator = new eachGapDist(); }else if (globaldata->Estimators[i] == "onegap") { - distCalculator = new oneGapDist(); } + distCalculator = new oneGapDist(); } } } }else { @@ -49,7 +49,7 @@ DistanceCommand::DistanceCommand(){ } } } - + //reset calc for next command globaldata->setCalc(""); } @@ -71,113 +71,190 @@ int DistanceCommand::execute(){ string filename = globaldata->inputFileName; if(globaldata->getFastaFile() != "") { - readSeqs = new ReadFasta(filename); } + readSeqs = new ReadFasta(filename); } else if(globaldata->getNexusFile() != "") { - readSeqs = new ReadNexus(filename); } + readSeqs = new ReadNexus(filename); } else if(globaldata->getClustalFile() != "") { - readSeqs = new ReadClustal(filename); } + readSeqs = new ReadClustal(filename); } else if(globaldata->getPhylipFile() != "") { - readSeqs = new ReadPhylip(filename); } - + readSeqs = new ReadPhylip(filename); } + readSeqs->read(); seqDB = readSeqs->getDB(); - + int numSeqs = seqDB->getNumSeqs(); cutoff += 0.005; - + string distFile = getRootName(globaldata->getFastaFile()) + "dist"; remove(distFile.c_str()); //# if defined (_WIN32) - //figure out how to implement the fork and wait commands in windows + //figure out how to implement the fork and wait commands in windows // driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); //# endif - # if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - if(processors == 1){ - driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); - } - else if(processors == 2){ - - int pid = fork(); - if(pid > 0){ - driver(distCalculator, seqDB, 0, (numSeqs/sqrt(2)), distFile + "tempa", cutoff); - appendFiles((distFile+"tempa"), distFile); +#if defined (__APPLE__) || (__MACH__) + if(processors == 1){ + driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); + } + else if(processors == 2){ + + int pid = fork(); + if(pid > 0){ + driver(distCalculator, seqDB, 0, (numSeqs/sqrt(2)), distFile + "tempa", cutoff); + appendFiles((distFile+"tempa"), distFile); + remove((distFile + "tempa").c_str()); + } + else{ + driver(distCalculator, seqDB, (numSeqs/sqrt(2)), numSeqs, distFile + "tempb", cutoff); + appendFiles((distFile+"tempb"), distFile); + remove((distFile + "tempb").c_str()); + } + wait(NULL); + + } + else if(processors == 3){ + int pid1 = fork(); + if(pid1 > 0){ + int pid2 = fork(); + if(pid2 > 0){ + driver(distCalculator, seqDB, 0, sqrt(3) * numSeqs / 3, distFile + "tempa", cutoff); + appendFiles(distFile+"tempa", distFile); remove((distFile + "tempa").c_str()); } else{ - driver(distCalculator, seqDB, (numSeqs/sqrt(2)), numSeqs, distFile + "tempb", cutoff); - appendFiles((distFile+"tempb"), distFile); - remove((distFile + "tempb").c_str()); + driver(distCalculator, seqDB, sqrt(3) * numSeqs / 3, sqrt(6) * numSeqs / 3, distFile + "tempb", cutoff); + appendFiles(distFile+"tempb", distFile); + remove((distFile + "tempb").c_str()); } wait(NULL); - } - else if(processors == 3){ - int pid1 = fork(); - if(pid1 > 0){ - int pid2 = fork(); - if(pid2 > 0){ - driver(distCalculator, seqDB, 0, sqrt(3) * numSeqs / 3, distFile + "tempa", cutoff); - appendFiles(distFile+"tempa", distFile); - remove((distFile + "tempa").c_str()); - } - else{ - driver(distCalculator, seqDB, sqrt(3) * numSeqs / 3, sqrt(6) * numSeqs / 3, distFile + "tempb", cutoff); - appendFiles(distFile+"tempb", distFile); - remove((distFile + "tempb").c_str()); - } - wait(NULL); + else{ + driver(distCalculator, seqDB, sqrt(6) * numSeqs / 3, numSeqs, distFile + "tempc", cutoff); + appendFiles(distFile+"tempc", distFile); + remove((distFile + "tempc").c_str()); + } + wait(NULL); + } + else if(processors == 4){ + int pid1 = fork(); + if(pid1 > 0){ + int pid2 = fork(); + if(pid2 > 0){ + driver(distCalculator, seqDB, 0, numSeqs / 2, distFile + "tempa", cutoff); + appendFiles(distFile+"tempa", distFile); + remove((distFile + "tempa").c_str()); } else{ - driver(distCalculator, seqDB, sqrt(6) * numSeqs / 3, numSeqs, distFile + "tempc", cutoff); - appendFiles(distFile+"tempc", distFile); - remove((distFile + "tempc").c_str()); + driver(distCalculator, seqDB, numSeqs / 2, (numSeqs/sqrt(2)), distFile + "tempb", cutoff); + appendFiles(distFile+"tempb", distFile); + remove((distFile + "tempb").c_str()); } wait(NULL); } - else if(processors == 4){ - int pid1 = fork(); - if(pid1 > 0){ - int pid2 = fork(); - if(pid2 > 0){ - driver(distCalculator, seqDB, 0, numSeqs / 2, distFile + "tempa", cutoff); - appendFiles(distFile+"tempa", distFile); - remove((distFile + "tempa").c_str()); - } - else{ - driver(distCalculator, seqDB, numSeqs / 2, (numSeqs/sqrt(2)), distFile + "tempb", cutoff); - appendFiles(distFile+"tempb", distFile); - remove((distFile + "tempb").c_str()); - } - wait(NULL); + else{ + int pid3 = fork(); + if(pid3 > 0){ + driver(distCalculator, seqDB, (numSeqs/sqrt(2)), (sqrt(3) * numSeqs / 2), distFile + "tempc", cutoff); + appendFiles(distFile+"tempc", distFile); + remove((distFile + "tempc").c_str()); } else{ - int pid3 = fork(); - if(pid3 > 0){ - driver(distCalculator, seqDB, (numSeqs/sqrt(2)), (sqrt(3) * numSeqs / 2), distFile + "tempc", cutoff); - appendFiles(distFile+"tempc", distFile); - remove((distFile + "tempc").c_str()); - } - else{ - driver(distCalculator, seqDB, (sqrt(3) * numSeqs / 2), numSeqs, distFile + "tempd", cutoff); - appendFiles(distFile+"tempd", distFile); - remove((distFile + "tempd").c_str()); - } - wait(NULL); + driver(distCalculator, seqDB, (sqrt(3) * numSeqs / 2), numSeqs, distFile + "tempd", cutoff); + appendFiles(distFile+"tempd", distFile); + remove((distFile + "tempd").c_str()); } wait(NULL); } wait(NULL); - # else - driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); - # endif - + } + wait(NULL); +#elif (linux) || (__linux) + if(processors == 1){ + driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); + } + else if(processors == 2){ + + int pid = fork(); + if(pid > 0){ + driver(distCalculator, seqDB, 0, (numSeqs/sqrt(2)), distFile + "tempa", cutoff); + appendFiles((distFile+"tempa"), distFile); + remove((distFile + "tempa").c_str()); + } + else{ + driver(distCalculator, seqDB, (numSeqs/sqrt(2)), numSeqs, distFile + "tempb", cutoff); + appendFiles((distFile+"tempb"), distFile); + remove((distFile + "tempb").c_str()); + } + wait(); + + } + else if(processors == 3){ + int pid1 = fork(); + if(pid1 > 0){ + int pid2 = fork(); + if(pid2 > 0){ + driver(distCalculator, seqDB, 0, sqrt(3) * numSeqs / 3, distFile + "tempa", cutoff); + appendFiles(distFile+"tempa", distFile); + remove((distFile + "tempa").c_str()); + } + else{ + driver(distCalculator, seqDB, sqrt(3) * numSeqs / 3, sqrt(6) * numSeqs / 3, distFile + "tempb", cutoff); + appendFiles(distFile+"tempb", distFile); + remove((distFile + "tempb").c_str()); + } + wait(); + } + else{ + driver(distCalculator, seqDB, sqrt(6) * numSeqs / 3, numSeqs, distFile + "tempc", cutoff); + appendFiles(distFile+"tempc", distFile); + remove((distFile + "tempc").c_str()); + } + wait(); + } + else if(processors == 4){ + int pid1 = fork(); + if(pid1 > 0){ + int pid2 = fork(); + if(pid2 > 0){ + driver(distCalculator, seqDB, 0, numSeqs / 2, distFile + "tempa", cutoff); + appendFiles(distFile+"tempa", distFile); + remove((distFile + "tempa").c_str()); + } + else{ + driver(distCalculator, seqDB, numSeqs / 2, (numSeqs/sqrt(2)), distFile + "tempb", cutoff); + appendFiles(distFile+"tempb", distFile); + remove((distFile + "tempb").c_str()); + } + wait(); + } + else{ + int pid3 = fork(); + if(pid3 > 0){ + driver(distCalculator, seqDB, (numSeqs/sqrt(2)), (sqrt(3) * numSeqs / 2), distFile + "tempc", cutoff); + appendFiles(distFile+"tempc", distFile); + remove((distFile + "tempc").c_str()); + } + else{ + driver(distCalculator, seqDB, (sqrt(3) * numSeqs / 2), numSeqs, distFile + "tempd", cutoff); + appendFiles(distFile+"tempd", distFile); + remove((distFile + "tempd").c_str()); + } + wait(); + } + wait(); + } + wait(); + +#else + driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); +#endif + delete distCalculator; - + return 0; - + } catch(exception& e) { cout << "Standard Error: " << e.what() << " has occurred in the DistanceCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; @@ -194,29 +271,29 @@ int DistanceCommand::execute(){ int DistanceCommand::driver(Dist* distCalculator, SequenceDB* align, int startLine, int endLine, string dFileName, float cutoff){ try { int startTime = time(NULL); - + ofstream distFile(dFileName.c_str(), ios::trunc); distFile.setf(ios::fixed, ios::showpoint); distFile << setprecision(4); - - for(int i=startLine;icalcDist(align->get(i), align->get(j)); double dist = distCalculator->getDist(); - + if(dist <= cutoff){ distFile << align->get(i).getName() << ' ' << align->get(j).getName() << ' ' << dist << endl; } - + } if(i % 100 == 0){ cout << i << '\t' << time(NULL) - startTime << endl; } - + } cout << endLine-1 << '\t' << time(NULL) - startTime << endl; - + return 1; } catch(exception& e) { @@ -227,7 +304,7 @@ int DistanceCommand::driver(Dist* distCalculator, SequenceDB* align, int startLi cout << "An unknown error has occurred in the DistanceCommand class function driver. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; exit(1); } - + } /**************************************************************************************************/ @@ -250,7 +327,7 @@ void DistanceCommand::appendFiles(string temp, string filename) { output << line << endl; // Appending back newline char } } - + input.close(); output.close(); } diff --git a/engine.cpp b/engine.cpp index add84b9..ddd6776 100644 --- a/engine.cpp +++ b/engine.cpp @@ -46,8 +46,8 @@ bool InteractEngine::getInput(){ bool errorFree; ErrorCheck* errorCheckor = new ErrorCheck(); - cout << "mothur v1.2.0" << endl; - cout << "Last updated: 4/14/2009" << endl << endl; + cout << "mothur v.1.3.0" << endl; + cout << "Last updated: 4/25/2009" << endl << endl; cout << "by" << endl; cout << "Patrick D. Schloss" << endl << endl; cout << "Department of Microbiology" << endl; diff --git a/fastamap.cpp b/fastamap.cpp index a1beda3..ebfeced 100644 --- a/fastamap.cpp +++ b/fastamap.cpp @@ -19,7 +19,7 @@ void FastaMap::readFastaFile(ifstream& in) { name = line.substr(1, line.length()); //rips off '>' //read through file - while (in.eof() != true) { + while (!in.eof()) { in >> line; if (line != "") { if (isalnum(line.at(0))) { //if it's a sequence line @@ -41,19 +41,9 @@ void FastaMap::readFastaFile(ifstream& in) { sequence = ""; } } + gobble(in); } - //store last sequence and name info. - seqmap[name] = sequence; - it = data.find(sequence); - if (it == data.end()) { //it's unique. - data[sequence].groupname = name; //group name will be the name of the first duplicate sequence found. - data[sequence].groupnumber = 1; - data[sequence].names = name; - }else { // its a duplicate. - data[sequence].names += "," + name; - data[sequence].groupnumber++; - } } catch(exception& e) { diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index 6c02242..a6bd549 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -23,7 +23,7 @@ FilterSeqsCommand::FilterSeqsCommand(){ db = readSeqs->getDB(); numSeqs = db->size(); - alignmentLength = db->get(0).getLength(); + alignmentLength = db->get(0).getAlignLength(); filter = string(alignmentLength, '1'); } diff --git a/filterseqscommand.h b/filterseqscommand.h index 2279689..af4c03a 100644 --- a/filterseqscommand.h +++ b/filterseqscommand.h @@ -44,4 +44,4 @@ private: }; -#endif \ No newline at end of file +#endif diff --git a/nastreport.cpp b/nastreport.cpp index 67ec702..799793e 100644 --- a/nastreport.cpp +++ b/nastreport.cpp @@ -48,14 +48,14 @@ void NastReport::print(){ void NastReport::setCandidate(Sequence* candSeq){ queryName = candSeq->getName(); - queryLength = candSeq->getUnalignLength(); + queryLength = candSeq->getNumBases(); } /******************************************************************************************************************/ void NastReport::setTemplate(Sequence* tempSeq){ templateName = tempSeq->getName(); - templateLength = tempSeq->getUnalignLength(); + templateLength = tempSeq->getNumBases(); } /******************************************************************************************************************/ diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp new file mode 100644 index 0000000..e71c2af --- /dev/null +++ b/seqsummarycommand.cpp @@ -0,0 +1,132 @@ +/* + * seqcoordcommand.cpp + * Mothur + * + * Created by Pat Schloss on 5/30/09. + * Copyright 2009 Patrick D. Schloss. All rights reserved. + * + */ + +#include "seqsummarycommand.h" + +//*************************************************************************************************************** + +SeqSummaryCommand::SeqSummaryCommand(){ + try { + globaldata = GlobalData::getInstance(); + + if(globaldata->getFastaFile() != "") { readSeqs = new ReadFasta(globaldata->inputFileName); } + else if(globaldata->getNexusFile() != "") { readSeqs = new ReadNexus(globaldata->inputFileName); } + else if(globaldata->getClustalFile() != "") { readSeqs = new ReadClustal(globaldata->inputFileName); } + else if(globaldata->getPhylipFile() != "") { readSeqs = new ReadPhylip(globaldata->inputFileName); } + + readSeqs->read(); + db = readSeqs->getDB(); + numSeqs = db->size(); + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the SeqCoordCommand class function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} + +//*************************************************************************************************************** + +SeqSummaryCommand::~SeqSummaryCommand(){ +} + +//*************************************************************************************************************** + +int SeqSummaryCommand::execute(){ + try{ + + ofstream outfile; + string summaryFile = getRootName(globaldata->inputFileName) + "summary"; + openOutputFile(summaryFile, outfile); + + vector startPosition(numSeqs, 0); + vector endPosition(numSeqs, 0); + vector seqLength(numSeqs, 0); + vector ambigBases(numSeqs, 0); + vector longHomoPolymer(numSeqs, 0); + + if(db->get(0).getIsAligned() == 1){ + outfile << "seqname\tstart\tend\tlength\tambiguities\tlonghomopolymer" << endl; + for(int i = 0; i < numSeqs; i++) { + Sequence current = db->get(i); + startPosition[i] = current.getStartPos(); + endPosition[i] = current.getEndPos(); + seqLength[i] = current.getNumBases(); + ambigBases[i] = current.getAmbigBases(); + longHomoPolymer[i] = current.getLongHomoPolymer(); + outfile << current.getName() << '\t' << startPosition[i] << '\t' << endPosition[i] << '\t' << seqLength[i] << '\t' << ambigBases[i] << '\t' << longHomoPolymer[i] << endl; + } + } + else{ + outfile << "seqname\tlength\tambiguities\tlonghomopolymer" << endl; + for(int i=0;iget(i); + seqLength[i] = current.getNumBases(); + ambigBases[i] = current.getAmbigBases(); + longHomoPolymer[i] = current.getLongHomoPolymer(); + outfile << current.getName() << '\t' << seqLength[i] << '\t' << ambigBases[i] << '\t' << longHomoPolymer[i] << endl; + } + } + + sort(seqLength.begin(), seqLength.end()); + sort(ambigBases.begin(), ambigBases.end()); + sort(longHomoPolymer.begin(), longHomoPolymer.end()); + + int median = int(numSeqs * 0.500); + int lowestPtile = int(numSeqs * 0.025); + int lowPtile = int(numSeqs * 0.250); + int highPtile = int(numSeqs * 0.750); + int highestPtile = int(numSeqs * 0.975); + int max = numSeqs - 1; + + cout << endl; + if(db->get(0).getIsAligned() == 1){ + sort(startPosition.begin(), startPosition.end()); + sort(endPosition.begin(), endPosition.end()); + + cout << "\t\tStart\tEnd\tLength\tN's\tPolymer" << endl; + cout << "Minimum:\t" << startPosition[0] << '\t' << endPosition[0] << '\t' << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl; + cout << "2.5%-tile:\t" << startPosition[lowestPtile] << '\t' << endPosition[lowestPtile] << '\t' << seqLength[lowestPtile] << '\t' << ambigBases[lowestPtile] << '\t' << longHomoPolymer[lowestPtile] << endl; + cout << "25%-tile:\t" << startPosition[lowPtile] << '\t' << endPosition[lowPtile] << '\t' << seqLength[lowPtile] << '\t' << ambigBases[lowPtile] << '\t' << longHomoPolymer[lowPtile] << endl; + cout << "Median: \t" << startPosition[median] << '\t' << endPosition[median] << '\t' << seqLength[median] << '\t' << ambigBases[median] << '\t' << longHomoPolymer[median] << endl; + cout << "75%-tile:\t" << startPosition[highPtile] << '\t' << endPosition[highPtile] << '\t' << seqLength[highPtile] << '\t' << ambigBases[highPtile] << '\t' << longHomoPolymer[highPtile] << endl; + cout << "97.5%-tile:\t" << startPosition[highestPtile] << '\t' << endPosition[highestPtile] << '\t' << seqLength[highestPtile] << '\t' << ambigBases[highestPtile] << '\t' << longHomoPolymer[highestPtile] << endl; + cout << "Maximum:\t" << startPosition[max] << '\t' << endPosition[max] << '\t' << seqLength[max] << '\t' << ambigBases[max] << '\t' << longHomoPolymer[max] << endl; + } + else{ + cout << "\t\tLength\tN's\tPolymer" << endl; + cout << "Minimum:\t" << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl; + cout << "2.5%-tile:\t" << seqLength[lowestPtile] << '\t' << ambigBases[lowestPtile] << '\t' << longHomoPolymer[lowestPtile] << endl; + cout << "25%-tile:\t" << seqLength[lowPtile] << '\t' << ambigBases[lowPtile] << '\t' << longHomoPolymer[lowPtile] << endl; + cout << "Median: \t" << seqLength[median] << '\t' << ambigBases[median] << '\t' << longHomoPolymer[median] << endl; + cout << "75%-tile:\t"<< seqLength[highPtile] << '\t' << ambigBases[highPtile] << '\t' << longHomoPolymer[highPtile] << endl; + cout << "97.5%-tile:\t"<< seqLength[highestPtile] << '\t' << ambigBases[highestPtile] << '\t' << longHomoPolymer[highestPtile] << endl; + cout << "Maximum:\t" << seqLength[max] << '\t' << ambigBases[max] << '\t' << longHomoPolymer[max] << endl; + } + cout << "# of Seqs:\t" << numSeqs << endl; + + return 0; + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the FilterSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the FilterSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + +} + +//*************************************************************************************************************** + + diff --git a/seqsummarycommand.h b/seqsummarycommand.h new file mode 100644 index 0000000..02103e7 --- /dev/null +++ b/seqsummarycommand.h @@ -0,0 +1,36 @@ +#ifndef SEQSUMMARYCOMMAND_H +#define SEQSUMMARYCOMMAND_H + +/* + * seqcoordcommand.h + * Mothur + * + * Created by Pat Schloss on 5/30/09. + * Copyright 2009 Patrick D. Schloss. All rights reserved. + * + */ + +#include "mothur.h" +#include "command.hpp" +#include "globaldata.hpp" +#include "readfasta.h" +#include "readnexus.h" +#include "readclustal.h" +#include "readseqsphylip.h" + +using namespace std; + +class SeqSummaryCommand : public Command { +public: + SeqSummaryCommand(); + ~SeqSummaryCommand(); + int execute(); + +private: + int numSeqs; + GlobalData* globaldata; + ReadSeqs* readSeqs; + SequenceDB* db; +}; + +#endif diff --git a/sequence.cpp b/sequence.cpp index 2f167c5..423b905 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -13,16 +13,22 @@ using namespace std; /***********************************************************************/ -Sequence::Sequence() {} +Sequence::Sequence(){ + initialize(); +} /***********************************************************************/ Sequence::Sequence(string newName, string sequence) { + + initialize(); name = newName; if(sequence.find_first_of('-') != string::npos) { setAligned(sequence); + isAligned = 1; } setUnaligned(sequence); + } //******************************************************************************************************************** @@ -57,22 +63,22 @@ Sequence::Sequence(ifstream& fastaFile){ //******************************************************************************************************************** -string Sequence::convert2ints() { +void Sequence::initialize(){ - if(unaligned == "") { /* need to throw an error */ } + name = ""; + unaligned = ""; + aligned = ""; + pairwise = ""; - string processed; + numBases = 0; + alignmentLength = 0; + isAligned = 0; + startPos = -1; + endPos = -1; + longHomoPolymer = 0; + ambigBases = 0; - for(int i=0;i=0;i--){ - if(sequence[i] == '-'){ - sequence[i] = '.'; + for(int i=alignmentLength-1;i>=0;i--){ + if(aligned[i] == '-'){ + aligned[i] = '.'; } else{ break; } } } - aligned = sequence; + } //******************************************************************************************************************** @@ -133,6 +142,25 @@ void Sequence::setPairwise(string sequence){ //******************************************************************************************************************** +string Sequence::convert2ints() { + + if(unaligned == "") { /* need to throw an error */ } + + string processed; + + for(int i=0;i aligned.length()) - return unaligned.length(); - return aligned.length(); +int Sequence::getNumBases(){ + return numBases; } //******************************************************************************************************************** void Sequence::printSequence(ostream& out){ - string toPrint = unaligned; - if(aligned.length() > unaligned.length()) - toPrint = aligned; - out << ">" << name << "\n" << toPrint << "\n"; + + out << ">" << name << endl; + if(isAligned){ + out << aligned << endl; + } + else{ + out << unaligned << endl; + } } //******************************************************************************************************************** -int Sequence::getUnalignLength(){ - return unaligned.length(); +int Sequence::getAlignLength(){ + return alignmentLength; } //******************************************************************************************************************** -int Sequence::getAlignLength(){ - return aligned.length(); +int Sequence::getAmbigBases(){ + if(ambigBases == -1){ + + for(int j=0;j longHomoPolymer){ longHomoPolymer = homoPolymer; } + homoPolymer = 1; + } + } + if(homoPolymer > longHomoPolymer){ longHomoPolymer = homoPolymer; } + } + return longHomoPolymer; +} + +//******************************************************************************************************************** +int Sequence::getStartPos(){ + if(endPos == -1){ + for(int j = 0; j < alignmentLength; j++) { + if(aligned[j] != '.'){ + startPos = j; + break; + } + } + } + return startPos; +} + +//******************************************************************************************************************** + +int Sequence::getEndPos(){ + if(endPos == -1){ + for(int j=alignmentLength-1;j>=0;j--){ + if(aligned[j] != '.'){ + endPos = j; + break; + } + } + } + return endPos; +} + +//******************************************************************************************************************** + +bool Sequence::getIsAligned(){ + return isAligned; +} + +//******************************************************************************************************************** diff --git a/sequence.hpp b/sequence.hpp index 37ca87c..929ca78 100644 --- a/sequence.hpp +++ b/sequence.hpp @@ -37,18 +37,27 @@ public: string getAligned(); string getPairwise(); string getUnaligned(); - int getLength(); //the greater of the lengths of unaligned and aligned - int getUnalignLength(); + int getNumBases(); + int getStartPos(); + int getEndPos(); int getAlignLength(); + int getAmbigBases(); + int getLongHomoPolymer(); + bool getIsAligned(); void printSequence(ostream&); private: + void initialize(); string name; string unaligned; string aligned; string pairwise; - int length; - int lengthAligned; + int numBases; + int alignmentLength; + bool isAligned; + int longHomoPolymer; + int ambigBases; + int startPos, endPos; }; /**************************************************************************************************/ diff --git a/validcommands.cpp b/validcommands.cpp index 6f2790d..544933c 100644 --- a/validcommands.cpp +++ b/validcommands.cpp @@ -44,6 +44,7 @@ ValidCommands::ValidCommands() { commands["help"] = "help"; commands["filter.seqs"] = "filter.seqs"; commands["align.seqs"] = "align.seqs"; + commands["summary.seqs"] = "summary.seqs"; commands["quit"] = "quit"; diff --git a/validparameter.cpp b/validparameter.cpp index 4aad1c2..6069a4f 100644 --- a/validparameter.cpp +++ b/validparameter.cpp @@ -270,6 +270,9 @@ void ValidParameters::initCommandParameters() { string filterseqsArray[] = {"fasta","phylip","clustal","nexus", "trump", "soft", "hard", "vertical"}; commandParameters["filter.seqs"] = addParameters(filterseqsArray, sizeof(filterseqsArray)/sizeof(string)); + string summaryseqsArray[] = {"fasta","phylip","clustal","nexus"}; + commandParameters["summary.seqs"] = addParameters(summaryseqsArray, sizeof(summaryseqsArray)/sizeof(string)); + string vennArray[] = {"groups","line","label","calc"}; commandParameters["venn"] = addParameters(vennArray, sizeof(vennArray)/sizeof(string));