From 544469443afe44920bdf279aefd26d29534cabaf Mon Sep 17 00:00:00 2001 From: pschloss Date: Thu, 4 Jun 2009 13:59:25 +0000 Subject: [PATCH] pat's edits of screen.seqs and summary.seqs --- aligncommand.cpp | 40 ++++++----- aligncommand.h | 24 +++---- database.cpp | 12 ++-- filterseqscommand.cpp | 154 +++++++++++++++++++++++------------------- filterseqscommand.h | 11 +-- globaldata.cpp | 6 +- kmerdb.cpp | 25 ++++--- mothur.h | 10 +++ screenseqscommand.cpp | 77 +++++++++++++-------- screenseqscommand.h | 6 +- seqsummarycommand.cpp | 120 ++++++++++++++------------------ seqsummarycommand.h | 4 +- sequence.cpp | 30 ++++++-- sequence.hpp | 1 + validparameter.cpp | 2 +- 15 files changed, 284 insertions(+), 238 deletions(-) diff --git a/aligncommand.cpp b/aligncommand.cpp index 242e9a8..08c291d 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -41,9 +41,11 @@ AlignCommand::AlignCommand(){ try { globaldata = GlobalData::getInstance(); - candidateFileName = globaldata->getCandidateFile(); - templateFileName = globaldata->getFastaFile(); - openInputFile(candidateFileName, in); + if(globaldata->getFastaFile() == "" && globaldata->getPhylipFile() == "" && globaldata->getNexusFile() == "" && globaldata->getClustalFile() == ""){ + cout << "you forgot a template file" << endl; + } + openInputFile(globaldata->getCandidateFile(), in); + convert(globaldata->getKSize(), kmerSize); convert(globaldata->getMatch(), match); convert(globaldata->getMismatch(), misMatch); @@ -73,28 +75,33 @@ int AlignCommand::execute(){ srand( (unsigned)time( NULL ) ); //needed to assign names to temporary files Database* templateDB; - if(globaldata->getSearch() == "kmer") { templateDB = new KmerDB(templateFileName, kmerSize); } - else if(globaldata->getSearch() == "suffix") { templateDB = new SuffixDB(templateFileName); } - else if(globaldata->getSearch() == "blast") { templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch); } - else { cout << globaldata->getSearch() << " is not a valid search option. I will run the command using suffix." << endl; - templateDB = new SuffixDB(templateFileName); } + if(globaldata->getSearch() == "kmer") { templateDB = new KmerDB(globaldata->getFastaFile() , kmerSize); } + else if(globaldata->getSearch() == "suffix") { templateDB = new SuffixDB(globaldata->getFastaFile()); } + else if(globaldata->getSearch() == "blast") { templateDB = new BlastDB(globaldata->getFastaFile(), gapOpen, gapExtend, match, misMatch); } + else { + cout << globaldata->getSearch() << " is not a valid search option. I will run the command using kmer, ksize=8." << endl; + templateDB = new KmerDB(globaldata->getFastaFile(), kmerSize); + } Alignment* alignment; - if(globaldata->getAlign() == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000); } - else if(globaldata->getAlign() == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000); } - else if(globaldata->getAlign() == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } - else if(globaldata->getAlign() == "noalign") { alignment = new NoAlign(); } - else { cout << globaldata->getAlign() << " is not a valid alignment option. I will run the command using blast." << endl; - alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } + if(globaldata->getAlign() == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000); } + else if(globaldata->getAlign() == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000); } + else if(globaldata->getAlign() == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } + else if(globaldata->getAlign() == "noalign") { alignment = new NoAlign(); } + else { + cout << globaldata->getAlign() << " is not a valid alignment option. I will run the command using needleman." << endl; + alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000); + } int numFastaSeqs=count(istreambuf_iterator(in),istreambuf_iterator(), '>'); in.seekg(0); - string candidateAligngmentFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.align"; + candidateFileName = globaldata->getCandidateFile(); + string candidateAligngmentFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align"; ofstream candidateAlignmentFile; openOutputFile(candidateAligngmentFName, candidateAlignmentFile); - string candidateReportFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.report"; + string candidateReportFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align.report"; NastReport report(candidateReportFName); cout << "We are going to align the " << numFastaSeqs << " sequences in " << candidateFileName << "..." << endl; @@ -110,6 +117,7 @@ int AlignCommand::execute(){ Sequence* templateSeq = templateDB->findClosestSequence(candidateSeq); report.setTemplate(templateSeq); report.setSearchParameters(globaldata->getSearch(), templateDB->getSearchScore()); + Nast nast(alignment, candidateSeq, templateSeq); report.setAlignmentParameters(globaldata->getAlign(), alignment); diff --git a/aligncommand.h b/aligncommand.h index cf30a6b..b5ff782 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -15,18 +15,18 @@ class AlignCommand : public Command { - public: - AlignCommand(); - ~AlignCommand(); - int execute(); - - private: - GlobalData* globaldata; - string candidateFileName, templateFileName, distanceFileName; - int kmerSize; - float match, misMatch, gapOpen, gapExtend; - ofstream out; - ifstream in; +public: + AlignCommand(); + ~AlignCommand(); + int execute(); + +private: + GlobalData* globaldata; + string candidateFileName, templateFileName, distanceFileName; + int kmerSize; + float match, misMatch, gapOpen, gapExtend; + ofstream out; + ifstream in; }; diff --git a/database.cpp b/database.cpp index 5b93321..44e663b 100644 --- a/database.cpp +++ b/database.cpp @@ -23,6 +23,7 @@ Database::Database(string fastaFileName){ // This assumes that the template dat cout << endl << "Reading in the " << fastaFileName << " template sequences...\t"; cout.flush(); + //all of this is elsewhere already! numSeqs=count(istreambuf_iterator(fastaFile),istreambuf_iterator(), '>'); // count the number of fastaFile.seekg(0); // sequences @@ -30,11 +31,8 @@ Database::Database(string fastaFileName){ // This assumes that the template dat string seqName, sequence; for(int i=0;i> seqName; -// templateSequences[i]->setName(seqName); - + seqName = seqName.substr(1); char letter; string aligned; @@ -46,12 +44,12 @@ Database::Database(string fastaFileName){ // This assumes that the template dat } } templateSequences[i] = new Sequence(seqName, aligned); -// templateSequences[i]->setAligned(aligned); // Obviously, we need the fully aligned template sequence -// templateSequences[i]->setUnaligned(aligned);// We will also need the unaligned sequence for pairwise alignments - fastaFile.putback(letter); // and database searches + fastaFile.putback(letter); } fastaFile.close(); + //all of this is elsewhere already! + cout << "DONE." << endl; cout.flush(); } diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index a6bd549..8864bb3 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -14,46 +14,43 @@ FilterSeqsCommand::FilterSeqsCommand(){ globaldata = GlobalData::getInstance(); - if(globaldata->getFastaFile() != "") { readSeqs = new ReadFasta(globaldata->inputFileName); } - else if(globaldata->getNexusFile() != "") { readSeqs = new ReadNexus(globaldata->inputFileName); } - else if(globaldata->getClustalFile() != "") { readSeqs = new ReadClustal(globaldata->inputFileName); } - else if(globaldata->getPhylipFile() != "") { readSeqs = new ReadPhylip(globaldata->inputFileName); } - - readSeqs->read(); - db = readSeqs->getDB(); - numSeqs = db->size(); - - alignmentLength = db->get(0).getAlignLength(); - - filter = string(alignmentLength, '1'); + if(globaldata->getFastaFile() == "") { cout << "You must enter a fasta formatted file" << endl; } + trump = globaldata->getTrump()[0]; + vertical = +// readSeqs->read(); +// db = readSeqs->getDB(); +// numSeqs = db->size(); +// +// alignmentLength = db->get(0).getAlignLength(); +// +// filter = string(alignmentLength, '1'); } /**************************************************************************************/ void FilterSeqsCommand::doHard() { - string hardName = globaldata->getHard(); - string hardFilter = ""; - - ifstream fileHandle; - openInputFile(hardName, fileHandle); - - fileHandle >> hardFilter; - - if(hardFilter.length() != filter.length()){ - cout << "The hard filter is not the same length as the alignment: Hard filter will not be applied." << endl; - } - else{ - filter = hardFilter; - } - +// string hardName = globaldata->getHard(); +// string hardFilter = ""; +// +// ifstream fileHandle; +// openInputFile(hardName, fileHandle); +// +// fileHandle >> hardFilter; +// +// if(hardFilter.length() != filter.length()){ +// cout << "The hard filter is not the same length as the alignment: Hard filter will not be applied." << endl; +// } +// else{ +// filter = hardFilter; +// } + } /**************************************************************************************/ void FilterSeqsCommand::doTrump() { - char trump = globaldata->getTrump()[0]; for(int i = 0; i < numSeqs; i++) { string curAligned = db->get(i).getAligned();; @@ -71,62 +68,77 @@ void FilterSeqsCommand::doTrump() { void FilterSeqsCommand::doVertical() { - vector counts(alignmentLength, 0); - - for(int i = 0; i < numSeqs; i++) { - string curAligned = db->get(i).getAligned();; - - for(int j = 0; j < alignmentLength; j++) { - if(curAligned[j] == '-' || curAligned[j] == '.'){ - counts[j]++; - } - } - } - for(int i=0;i counts(alignmentLength, 0); +// +// for(int i = 0; i < numSeqs; i++) { +// string curAligned = db->get(i).getAligned();; +// +// for(int j = 0; j < alignmentLength; j++) { +// if(curAligned[j] == '-' || curAligned[j] == '.'){ +// counts[j]++; +// } +// } +// } +// for(int i=0;igetSoft().c_str()) / 100.0; - - vector a(alignmentLength, 0); - vector t(alignmentLength, 0); - vector g(alignmentLength, 0); - vector c(alignmentLength, 0); - vector x(alignmentLength, 0); - - for(int i=0;iget(i).getAligned();; - - for(int j=0;jgetSoft().c_str()) / 100.0; +// +// vector a(alignmentLength, 0); +// vector t(alignmentLength, 0); +// vector g(alignmentLength, 0); +// vector c(alignmentLength, 0); +// vector x(alignmentLength, 0); +// +// for(int i=0;iget(i).getAligned();; +// +// for(int j=0;jgetHard().compare("") != 0) { doHard(); } // has to be applied first! - if(globaldata->getTrump().compare("") != 0) { doTrump(); } - if(globaldata->getVertical() == "T") { doVertical(); } - if(globaldata->getSoft().compare("") != 0) { doSoft(); } + + ifstream inFASTA; + openInputFile(globaldata->getFastaFile(), inFASTA); + + Sequence currSequence(inFASTA); + alignmentLength = currSequence.getAlignLength(); + + //while + + + if(globaldata->getHard().compare("") != 0) { doHard(); } // has to be applied first! + if(globaldata->getTrump().compare("") != 0) { doTrump(); } + if(isTrue(globaldata->getVertical())) { doVertical(); } + if(globaldata->getSoft().compare("") != 0) { doSoft(); } + + + + + + ofstream outfile; string filterFile = getRootName(globaldata->inputFileName) + "filter"; openOutputFile(filterFile, outfile); diff --git a/filterseqscommand.h b/filterseqscommand.h index af4c03a..b59d0a4 100644 --- a/filterseqscommand.h +++ b/filterseqscommand.h @@ -32,15 +32,16 @@ private: void doSoft(); void doHard(); void doVertical(); - + string filter; int alignmentLength; - int numSeqs; + + char trump; + bool vertical; GlobalData* globaldata; - ReadSeqs* readSeqs; - SequenceDB* db; +// ReadSeqs* readSeqs; +// SequenceDB* db; - string filter; }; diff --git a/globaldata.cpp b/globaldata.cpp index ec937ea..62641be 100644 --- a/globaldata.cpp +++ b/globaldata.cpp @@ -141,7 +141,7 @@ void GlobalData::parseGlobalData(string commandString, string optionText){ if (key == "nexus" ) { nexusfile = value; inputFileName = value; fileroot = value; format = "nexus"; } if (key == "clustal" ) { clustalfile = value; inputFileName = value; fileroot = value; format = "clustal"; } if (key == "tree" ) { treefile = value; inputFileName = value; fileroot = value; format = "tree"; } - if (key == "shared" ) { sharedfile = value; inputFileName = value; fileroot = value; format = "sharedfile"; } + if (key == "shared" ) { sharedfile = value; inputFileName = value; fileroot = value; format = "sharedfile"; } if (key == "name" ) { namefile = value; } if (key == "order" ) { orderfile = value; } if (key == "group" ) { groupfile = value; } @@ -413,7 +413,7 @@ void GlobalData::clear() { processors = "1"; size = "0"; search = "kmer"; - ksize = "7"; + ksize = "8"; align = "needleman"; match = "1.0"; mismatch = "-1.0"; @@ -449,7 +449,7 @@ void GlobalData::reset() { processors = "1"; size = "0"; search = "kmer"; - ksize = "7"; + ksize = "8"; align = "needleman"; match = "1.0"; mismatch = "-1.0"; diff --git a/kmerdb.cpp b/kmerdb.cpp index 27e6997..9652ac4 100644 --- a/kmerdb.cpp +++ b/kmerdb.cpp @@ -57,20 +57,19 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS /**************************************************************************************************/ Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){ - - vector matches(numSeqs, 0); // a record of the sequences with shared kmers - vector timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found + + Kmer kmer(kmerSize); searchScore = 0; int maxSequence = 0; - - string query = candidateSeq->getUnaligned(); // we want to search using an unaligned dna sequence - int numKmers = query.length() - kmerSize + 1; - Kmer kmer(kmerSize); - + vector matches(numSeqs, 0); // a record of the sequences with shared kmers + vector timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found + + int numKmers = candidateSeq->getNumBases() - kmerSize + 1; + for(int i=0;igetUnaligned(), i); // go through the query sequence and get a kmer number if(timesKmerFound[kmerNumber] == 0){ // if we haven't seen it before... for(int j=0;j searchScore){ // the query sequence + if(matches[i] > searchScore){ // the query sequence searchScore = matches[i]; maxSequence = i; } } - searchScore = 100 * searchScore / (float)numKmers; - return templateSequences[maxSequence]; // return the Sequence object corresponding to the db - // sequence with the most shared kmers. + + searchScore = 100 * searchScore / (float) numKmers; // return the Sequence object corresponding to the db + return templateSequences[maxSequence]; // sequence with the most shared kmers. } /**************************************************************************************************/ diff --git a/mothur.h b/mothur.h index 670a32c..f07f343 100644 --- a/mothur.h +++ b/mothur.h @@ -316,6 +316,16 @@ inline int openOutputFileAppend(string fileName, ofstream& fileHandle){ } +/***********************************************************************/ + +inline int getNumSeqs(ifstream& file){ + + int numSeqs = count(istreambuf_iterator(file),istreambuf_iterator(), '>'); + file.seekg(0); + return numSeqs; + +} + /***********************************************************************/ //This function parses the estimator options and puts them in a vector diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 94094c7..586e436 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -14,15 +14,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(){ try { globaldata = GlobalData::getInstance(); - - if(globaldata->getFastaFile() != "") { readSeqs = new ReadFasta(globaldata->inputFileName); } - else if(globaldata->getNexusFile() != "") { readSeqs = new ReadNexus(globaldata->inputFileName); } - else if(globaldata->getClustalFile() != "") { readSeqs = new ReadClustal(globaldata->inputFileName); } - else if(globaldata->getPhylipFile() != "") { readSeqs = new ReadPhylip(globaldata->inputFileName); } - - readSeqs->read(); - db = readSeqs->getDB(); - numSeqs = db->size(); + if(globaldata->getFastaFile() == "") { cout << "you must provide a fasta formatted file" << endl; } } catch(exception& e) { cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; @@ -36,9 +28,7 @@ ScreenSeqsCommand::ScreenSeqsCommand(){ //*************************************************************************************************************** -ScreenSeqsCommand::~ScreenSeqsCommand(){ - delete readSeqs; -} +ScreenSeqsCommand::~ScreenSeqsCommand(){ /* do nothing */ } //*************************************************************************************************************** @@ -52,6 +42,9 @@ int ScreenSeqsCommand::execute(){ convert(globaldata->getMinLength(), minLength); convert(globaldata->getMaxLength(), maxLength); + ifstream inFASTA; + openInputFile(globaldata->getFastaFile(), inFASTA); + set badSeqNames; string goodSeqFile = getRootName(globaldata->inputFileName) + "good" + getExtension(globaldata->inputFileName); @@ -59,16 +52,16 @@ int ScreenSeqsCommand::execute(){ ofstream goodSeqOut; openOutputFile(goodSeqFile, goodSeqOut); ofstream badSeqOut; openOutputFile(badSeqFile, badSeqOut); - - for(int i=0;iget(i); + + while(!inFASTA.eof()){ + Sequence currSeq(inFASTA); bool goodSeq = 1; // innocent until proven guilty - if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; } - if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; } - if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; } - if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()){ goodSeq = 0; } - if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; } - if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; } + if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; } + if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; } + if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; } + if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; } + if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; } + if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; } if(goodSeq == 1){ currSeq.printSequence(goodSeqOut); @@ -77,12 +70,8 @@ int ScreenSeqsCommand::execute(){ currSeq.printSequence(badSeqOut); badSeqNames.insert(currSeq.getName()); } - } - - if(globaldata->getNameFile() != ""){ - screenNameGroupFile(badSeqNames); - - } + gobble(inFASTA); + } return 0; } @@ -173,4 +162,38 @@ void ScreenSeqsCommand::screenNameGroupFile(set badSeqNames){ //*************************************************************************************************************** +void ScreenSeqsCommand::screenGroupFile(set badSeqNames){ + + ifstream inputGroups; + openInputFile(globaldata->getGroupFile(), inputGroups); + string seqName, group; + set::iterator it; + + string goodGroupFile = getRootName(globaldata->getGroupFile()) + "good" + getExtension(globaldata->getGroupFile()); + string badGroupFile = getRootName(globaldata->getGroupFile()) + "bad" + getExtension(globaldata->getGroupFile()); + + ofstream goodGroupOut; openOutputFile(goodGroupFile, goodGroupOut); + ofstream badGroupOut; openOutputFile(badGroupFile, badGroupOut); + + while(!inputGroups.eof()){ + inputGroups >> seqName >> group; + it = badSeqNames.find(seqName); + + if(it != badSeqNames.end()){ + badSeqNames.erase(it); + badGroupOut << seqName << '\t' << group << endl; + } + else{ + goodGroupOut << seqName << '\t' << group << endl; + } + gobble(inputGroups); + } + inputGroups.close(); + goodGroupOut.close(); + badGroupOut.close(); + +} + +//*************************************************************************************************************** + diff --git a/screenseqscommand.h b/screenseqscommand.h index f88fe2c..56c9d82 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -16,7 +16,6 @@ #include "readnexus.h" #include "readclustal.h" #include "readseqsphylip.h" -#include using namespace std; @@ -28,10 +27,9 @@ public: int execute(); private: void screenNameGroupFile(set); - int numSeqs; + void screenGroupFile(set); + GlobalData* globaldata; - ReadSeqs* readSeqs; - SequenceDB* db; }; #endif diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index 2bffecb..18eddec 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -14,15 +14,7 @@ SeqSummaryCommand::SeqSummaryCommand(){ try { globaldata = GlobalData::getInstance(); - - if(globaldata->getFastaFile() != "") { readSeqs = new ReadFasta(globaldata->inputFileName); } - else if(globaldata->getNexusFile() != "") { readSeqs = new ReadNexus(globaldata->inputFileName); } - else if(globaldata->getClustalFile() != "") { readSeqs = new ReadClustal(globaldata->inputFileName); } - else if(globaldata->getPhylipFile() != "") { readSeqs = new ReadPhylip(globaldata->inputFileName); } - - readSeqs->read(); - db = readSeqs->getDB(); - numSeqs = db->size(); + if(globaldata->getFastaFile() == "") { cout << "you need to at least enter a fasta file name" << endl; } } catch(exception& e) { cout << "Standard Error: " << e.what() << " has occurred in the SeqCoordCommand class Function SeqCoordCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; @@ -36,83 +28,69 @@ SeqSummaryCommand::SeqSummaryCommand(){ //*************************************************************************************************************** -SeqSummaryCommand::~SeqSummaryCommand(){ - delete readSeqs; -} +SeqSummaryCommand::~SeqSummaryCommand(){ /* do nothing */ } //*************************************************************************************************************** int SeqSummaryCommand::execute(){ try{ - - ofstream outfile; - string summaryFile = getRootName(globaldata->inputFileName) + "summary"; - openOutputFile(summaryFile, outfile); - vector startPosition(numSeqs, 0); - vector endPosition(numSeqs, 0); - vector seqLength(numSeqs, 0); - vector ambigBases(numSeqs, 0); - vector longHomoPolymer(numSeqs, 0); + ifstream inFASTA; + openInputFile(globaldata->getFastaFile(), inFASTA); + int numSeqs = 0; + + ofstream outSummary; + string summaryFile = globaldata->getFastaFile() + ".summary"; + openOutputFile(summaryFile, outSummary); - if(db->get(0).getIsAligned() == 1){ - outfile << "seqname\tstart\tend\tlength\tambiguities\tlonghomopolymer" << endl; - for(int i = 0; i < numSeqs; i++) { - Sequence current = db->get(i); - startPosition[i] = current.getStartPos(); - endPosition[i] = current.getEndPos(); - seqLength[i] = current.getNumBases(); - ambigBases[i] = current.getAmbigBases(); - longHomoPolymer[i] = current.getLongHomoPolymer(); - outfile << current.getName() << '\t' << startPosition[i] << '\t' << endPosition[i] << '\t' << seqLength[i] << '\t' << ambigBases[i] << '\t' << longHomoPolymer[i] << endl; - } - } - else{ - outfile << "seqname\tlength\tambiguities\tlonghomopolymer" << endl; - for(int i=0;iget(i); - seqLength[i] = current.getNumBases(); - ambigBases[i] = current.getAmbigBases(); - longHomoPolymer[i] = current.getLongHomoPolymer(); - outfile << current.getName() << '\t' << seqLength[i] << '\t' << ambigBases[i] << '\t' << longHomoPolymer[i] << endl; - } + vector startPosition; + vector endPosition; + vector seqLength; + vector ambigBases; + vector longHomoPolymer; + + outSummary << "seqname\tstart\tend\tnbases\tambigs\tpolymer" << endl; + + while(!inFASTA.eof()){ + Sequence current(inFASTA); + startPosition.push_back(current.getStartPos()); + endPosition.push_back(current.getEndPos()); + seqLength.push_back(current.getNumBases()); + ambigBases.push_back(current.getAmbigBases()); + longHomoPolymer.push_back(current.getLongHomoPolymer()); + + outSummary << current.getName() << '\t'; + outSummary << current.getStartPos() << '\t' << current.getEndPos() << '\t'; + outSummary << current.getNumBases() << '\t' << current.getAmbigBases() << '\t'; + outSummary << current.getLongHomoPolymer() << endl; + + numSeqs++; + gobble(inFASTA); } + inFASTA.close(); + sort(startPosition.begin(), startPosition.end()); + sort(endPosition.begin(), endPosition.end()); sort(seqLength.begin(), seqLength.end()); sort(ambigBases.begin(), ambigBases.end()); sort(longHomoPolymer.begin(), longHomoPolymer.end()); - int median = int(numSeqs * 0.500); - int lowestPtile = int(numSeqs * 0.025); - int lowPtile = int(numSeqs * 0.250); - int highPtile = int(numSeqs * 0.750); - int highestPtile = int(numSeqs * 0.975); - int max = numSeqs - 1; + int ptile0_25 = int(numSeqs * 0.025); + int ptile25 = int(numSeqs * 0.250); + int ptile50 = int(numSeqs * 0.500); + int ptile75 = int(numSeqs * 0.750); + int ptile97_5 = int(numSeqs * 0.975); + int ptile100 = numSeqs - 1; cout << endl; - if(db->get(0).getIsAligned() == 1){ - sort(startPosition.begin(), startPosition.end()); - sort(endPosition.begin(), endPosition.end()); - - cout << "\t\tStart\tEnd\tLength\tN's\tPolymer" << endl; - cout << "Minimum:\t" << startPosition[0] << '\t' << endPosition[0] << '\t' << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl; - cout << "2.5%-tile:\t" << startPosition[lowestPtile] << '\t' << endPosition[lowestPtile] << '\t' << seqLength[lowestPtile] << '\t' << ambigBases[lowestPtile] << '\t' << longHomoPolymer[lowestPtile] << endl; - cout << "25%-tile:\t" << startPosition[lowPtile] << '\t' << endPosition[lowPtile] << '\t' << seqLength[lowPtile] << '\t' << ambigBases[lowPtile] << '\t' << longHomoPolymer[lowPtile] << endl; - cout << "Median: \t" << startPosition[median] << '\t' << endPosition[median] << '\t' << seqLength[median] << '\t' << ambigBases[median] << '\t' << longHomoPolymer[median] << endl; - cout << "75%-tile:\t" << startPosition[highPtile] << '\t' << endPosition[highPtile] << '\t' << seqLength[highPtile] << '\t' << ambigBases[highPtile] << '\t' << longHomoPolymer[highPtile] << endl; - cout << "97.5%-tile:\t" << startPosition[highestPtile] << '\t' << endPosition[highestPtile] << '\t' << seqLength[highestPtile] << '\t' << ambigBases[highestPtile] << '\t' << longHomoPolymer[highestPtile] << endl; - cout << "Maximum:\t" << startPosition[max] << '\t' << endPosition[max] << '\t' << seqLength[max] << '\t' << ambigBases[max] << '\t' << longHomoPolymer[max] << endl; - } - else{ - cout << "\t\tLength\tN's\tPolymer" << endl; - cout << "Minimum:\t" << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl; - cout << "2.5%-tile:\t" << seqLength[lowestPtile] << '\t' << ambigBases[lowestPtile] << '\t' << longHomoPolymer[lowestPtile] << endl; - cout << "25%-tile:\t" << seqLength[lowPtile] << '\t' << ambigBases[lowPtile] << '\t' << longHomoPolymer[lowPtile] << endl; - cout << "Median: \t" << seqLength[median] << '\t' << ambigBases[median] << '\t' << longHomoPolymer[median] << endl; - cout << "75%-tile:\t"<< seqLength[highPtile] << '\t' << ambigBases[highPtile] << '\t' << longHomoPolymer[highPtile] << endl; - cout << "97.5%-tile:\t"<< seqLength[highestPtile] << '\t' << ambigBases[highestPtile] << '\t' << longHomoPolymer[highestPtile] << endl; - cout << "Maximum:\t" << seqLength[max] << '\t' << ambigBases[max] << '\t' << longHomoPolymer[max] << endl; - } + cout << "\t\tStart\tEnd\tNBases\tAmbigs\tPolymer" << endl; + cout << "Minimum:\t" << startPosition[0] << '\t' << endPosition[0] << '\t' << seqLength[0] << '\t' << ambigBases[0] << '\t' << longHomoPolymer[0] << endl; + cout << "2.5%-tile:\t" << startPosition[ptile0_25] << '\t' << endPosition[ptile0_25] << '\t' << seqLength[ptile0_25] << '\t' << ambigBases[ptile0_25] << '\t' << longHomoPolymer[ptile0_25] << endl; + cout << "25%-tile:\t" << startPosition[ptile25] << '\t' << endPosition[ptile25] << '\t' << seqLength[ptile25] << '\t' << ambigBases[ptile25] << '\t' << longHomoPolymer[ptile25] << endl; + cout << "Median: \t" << startPosition[ptile50] << '\t' << endPosition[ptile50] << '\t' << seqLength[ptile50] << '\t' << ambigBases[ptile50] << '\t' << longHomoPolymer[ptile50] << endl; + cout << "75%-tile:\t" << startPosition[ptile75] << '\t' << endPosition[ptile75] << '\t' << seqLength[ptile75] << '\t' << ambigBases[ptile75] << '\t' << longHomoPolymer[ptile75] << endl; + cout << "97.5%-tile:\t" << startPosition[ptile97_5] << '\t' << endPosition[ptile97_5] << '\t' << seqLength[ptile97_5] << '\t' << ambigBases[ptile97_5] << '\t' << longHomoPolymer[ptile97_5] << endl; + cout << "Maximum:\t" << startPosition[ptile100] << '\t' << endPosition[ptile100] << '\t' << seqLength[ptile100] << '\t' << ambigBases[ptile100] << '\t' << longHomoPolymer[ptile100] << endl; cout << "# of Seqs:\t" << numSeqs << endl; return 0; diff --git a/seqsummarycommand.h b/seqsummarycommand.h index 02103e7..9b2be27 100644 --- a/seqsummarycommand.h +++ b/seqsummarycommand.h @@ -27,10 +27,8 @@ public: int execute(); private: - int numSeqs; GlobalData* globaldata; - ReadSeqs* readSeqs; - SequenceDB* db; + }; #endif diff --git a/sequence.cpp b/sequence.cpp index e9487c5..98aa7b0 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -25,7 +25,6 @@ Sequence::Sequence(string newName, string sequence) { name = newName; if(sequence.find_first_of('-') != string::npos) { setAligned(sequence); - isAligned = 1; } setUnaligned(sequence); @@ -33,6 +32,7 @@ Sequence::Sequence(string newName, string sequence) { //******************************************************************************************************************** Sequence::Sequence(ifstream& fastaFile){ + initialize(); string accession; // provided a file handle to a fasta-formatted sequence file, read in the next fastaFile >> accession; // accession number and sequence we find... @@ -131,7 +131,7 @@ void Sequence::setAligned(string sequence){ } } } - + isAligned = 1; } //******************************************************************************************************************** @@ -249,11 +249,13 @@ int Sequence::getStartPos(){ if(endPos == -1){ for(int j = 0; j < alignmentLength; j++) { if(aligned[j] != '.'){ - startPos = j; + startPos = j + 1; break; } } - } + } + if(isAligned == 0){ startPos = 1; } + return startPos; } @@ -263,11 +265,13 @@ int Sequence::getEndPos(){ if(endPos == -1){ for(int j=alignmentLength-1;j>=0;j--){ if(aligned[j] != '.'){ - endPos = j; + endPos = j + 1; break; } } } + if(isAligned == 0){ endPos = numBases; } + return endPos; } @@ -278,3 +282,19 @@ bool Sequence::getIsAligned(){ } //******************************************************************************************************************** + +void Sequence::reverseComplement(){ + + string temp; + for(int i=numBases-1;i>=0;i--){ + if(unaligned[i] == 'A') { temp += 'T'; } + else if(unaligned[i] == 'T'){ temp += 'A'; } + else if(unaligned[i] == 'G'){ temp += 'C'; } + else if(unaligned[i] == 'C'){ temp += 'G'; } + else { temp += 'N'; } + } + unaligned = temp; + +} + +//******************************************************************************************************************** diff --git a/sequence.hpp b/sequence.hpp index 929ca78..904e25a 100644 --- a/sequence.hpp +++ b/sequence.hpp @@ -31,6 +31,7 @@ public: void setPairwise(string); void setAligned(string); void setLength(); + void reverseComplement(); string convert2ints(); string getName(); diff --git a/validparameter.cpp b/validparameter.cpp index b67ee80..cd3b054 100644 --- a/validparameter.cpp +++ b/validparameter.cpp @@ -279,7 +279,7 @@ void ValidParameters::initCommandParameters() { string summaryseqsArray[] = {"fasta","phylip","clustal","nexus"}; commandParameters["summary.seqs"] = addParameters(summaryseqsArray, sizeof(summaryseqsArray)/sizeof(string)); - string screenseqsArray[] = {"fasta","phylip","clustal","nexus", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"}; + string screenseqsArray[] = {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"}; commandParameters["screen.seqs"] = addParameters(screenseqsArray, sizeof(screenseqsArray)/sizeof(string)); string vennArray[] = {"groups","line","label","calc"}; -- 2.39.2