From e0fbf58358a72f20352cf2a43922ab6b5bdf0cf8 Mon Sep 17 00:00:00 2001 From: westcott Date: Thu, 22 Jul 2010 18:52:22 +0000 Subject: [PATCH] fixes while testing 1.12.0 --- Mothur.xcodeproj/project.pbxproj | 6 +- bayesian.cpp | 185 ++--- chimera.h | 35 +- chimeracheckcommand.cpp | 1 - chimeracheckcommand.h | 1 + chimeracheckrdp.cpp | 17 +- consensuscommand.cpp | 8 +- endiannessmacros.h | 36 +- fullmatrix.cpp | 6 +- mgclustercommand.cpp | 15 +- mothur.cpp | 11 + onegapdist.h | 7 +- qualityscores.cpp | 27 +- qualityscores.h | 2 +- sensspeccommand.cpp | 3 + sffinfocommand.cpp | 1329 +++++++++++++++--------------- sffinfocommand.h | 2 +- trimseqscommand.cpp | 13 +- 18 files changed, 855 insertions(+), 849 deletions(-) diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 74520c4..919bc12 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -827,8 +827,6 @@ A7CB594211402F430010EB83 /* containers */ = { isa = PBXGroup; children = ( - 7E85BD1C11EB5E9B00FD37C0 /* qualityscores.h */, - 7E85BD1D11EB5E9B00FD37C0 /* qualityscores.cpp */, A7DA1FF3113FECD400BF472F /* alignmentcell.hpp */, A7DA1FF2113FECD400BF472F /* alignmentcell.cpp */, A7DA1FF4113FECD400BF472F /* alignmentdb.cpp */, @@ -856,8 +854,10 @@ A7DA20A3113FECD400BF472F /* nameassignment.hpp */, A7DA20B6113FECD400BF472F /* ordervector.cpp */, A7DA20B7113FECD400BF472F /* ordervector.hpp */, - A7DA20D2113FECD400BF472F /* rabundvector.cpp */, + 7E85BD1C11EB5E9B00FD37C0 /* qualityscores.h */, + 7E85BD1D11EB5E9B00FD37C0 /* qualityscores.cpp */, A7DA20D3113FECD400BF472F /* rabundvector.hpp */, + A7DA20D2113FECD400BF472F /* rabundvector.cpp */, A7DA20F6113FECD400BF472F /* sabundvector.cpp */, A7DA20F7113FECD400BF472F /* sabundvector.hpp */, A7DA20FE113FECD400BF472F /* sequence.cpp */, diff --git a/bayesian.cpp b/bayesian.cpp index 655d702..d0c0a16 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -51,95 +51,108 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0); //prevents errors caused by creating shortcut files if you had an error in the sanity check. - if (m->control_pressed) { remove(phyloTreeName.c_str()); } - - genusNodes = phyloTree->getGenusNodes(); - genusTotals = phyloTree->getGenusTotals(); - - m->mothurOut("Calculating template taxonomy tree... "); cout.flush(); - - phyloTree->printTreeNodes(phyloTreeName); - - m->mothurOut("DONE."); m->mothurOutEndLine(); - - m->mothurOut("Calculating template probabilities... "); cout.flush(); - - numKmers = database->getMaxKmer() + 1; - - //initialze probabilities - wordGenusProb.resize(numKmers); - - for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } - - - #ifdef USE_MPI - int pid; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - - if (pid == 0) { - #endif - - ofstream out; - openOutputFile(probFileName, out); - - out << numKmers << endl; - - ofstream out2; - openOutputFile(probFileName2, out2); + if (m->control_pressed) { remove(phyloTreeName.c_str()); remove(probFileName.c_str()); remove(probFileName2.c_str()); } + else{ + genusNodes = phyloTree->getGenusNodes(); + genusTotals = phyloTree->getGenusTotals(); + + m->mothurOut("Calculating template taxonomy tree... "); cout.flush(); + + phyloTree->printTreeNodes(phyloTreeName); + + m->mothurOut("DONE."); m->mothurOutEndLine(); + + m->mothurOut("Calculating template probabilities... "); cout.flush(); + + numKmers = database->getMaxKmer() + 1; - #ifdef USE_MPI - } - #endif - + //initialze probabilities + wordGenusProb.resize(numKmers); - //for each word - for (int i = 0; i < numKmers; i++) { - if (m->control_pressed) { break; } + for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } + #ifdef USE_MPI + int pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are if (pid == 0) { #endif - out << i << '\t'; + ofstream out; + openOutputFile(probFileName, out); + + out << numKmers << endl; + + ofstream out2; + openOutputFile(probFileName2, out2); #ifdef USE_MPI } #endif + - vector seqsWithWordi = database->getSequencesWithKmer(i); - - map count; - for (int k = 0; k < genusNodes.size(); k++) { count[genusNodes[k]] = 0; } - - //for each sequence with that word - for (int j = 0; j < seqsWithWordi.size(); j++) { - int temp = phyloTree->getIndex(names[seqsWithWordi[j]]); - count[temp]++; //increment count of seq in this genus who have this word - } - - //probabilityInTemplate = (# of seqs with that word in template + 0.05) / (total number of seqs in template + 1); - float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1); - - int numNotZero = 0; - for (int k = 0; k < genusNodes.size(); k++) { - //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1); - wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1)); - if (count[genusNodes[k]] != 0) { - #ifdef USE_MPI - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - if (pid == 0) { - #endif + //for each word + for (int i = 0; i < numKmers; i++) { + if (m->control_pressed) { break; } + + #ifdef USE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - out << k << '\t' << wordGenusProb[i][k] << '\t'; - - #ifdef USE_MPI - } - #endif + if (pid == 0) { + #endif - numNotZero++; + out << i << '\t'; + + #ifdef USE_MPI + } + #endif + + vector seqsWithWordi = database->getSequencesWithKmer(i); + + map count; + for (int k = 0; k < genusNodes.size(); k++) { count[genusNodes[k]] = 0; } + + //for each sequence with that word + for (int j = 0; j < seqsWithWordi.size(); j++) { + int temp = phyloTree->getIndex(names[seqsWithWordi[j]]); + count[temp]++; //increment count of seq in this genus who have this word } + + //probabilityInTemplate = (# of seqs with that word in template + 0.05) / (total number of seqs in template + 1); + float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1); + + int numNotZero = 0; + for (int k = 0; k < genusNodes.size(); k++) { + //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1); + wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1)); + if (count[genusNodes[k]] != 0) { + #ifdef USE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + if (pid == 0) { + #endif + + out << k << '\t' << wordGenusProb[i][k] << '\t'; + + #ifdef USE_MPI + } + #endif + + numNotZero++; + } + } + + #ifdef USE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + if (pid == 0) { + #endif + + out << endl; + out2 << probabilityInTemplate << '\t' << numNotZero << endl; + + #ifdef USE_MPI + } + #endif } #ifdef USE_MPI @@ -147,31 +160,19 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { if (pid == 0) { #endif - out << endl; - out2 << probabilityInTemplate << '\t' << numNotZero << endl; + out.close(); + out2.close(); #ifdef USE_MPI } #endif + + //read in new phylotree with less info. - its faster + ifstream phyloTreeTest(phyloTreeName.c_str()); + delete phyloTree; + + phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName); } - - #ifdef USE_MPI - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - if (pid == 0) { - #endif - - out.close(); - out2.close(); - - #ifdef USE_MPI - } - #endif - - //read in new phylotree with less info. - its faster - ifstream phyloTreeTest(phyloTreeName.c_str()); - delete phyloTree; - - phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName); } m->mothurOut("DONE."); m->mothurOutEndLine(); diff --git a/chimera.h b/chimera.h index 7120478..24ab3f7 100644 --- a/chimera.h +++ b/chimera.h @@ -90,41 +90,13 @@ class Chimera { public: Chimera(){ m = MothurOut::getInstance(); length = 0; unaligned = false; } - //Chimera(string) { m = MothurOut::getInstance(); } - //Chimera(string, bool, string) { m = MothurOut::getInstance(); } - //Chimera(string, string) { m = MothurOut::getInstance(); } virtual ~Chimera(){ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } }; - //virtual void setFilter(bool f) { filter = f; } - //virtual void setCorrection(bool c) { correction = c; } - //virtual void setProcessors(int p) { processors = p; } - //virtual void setWindow(int w) { window = w; } - //virtual void setIncrement(int i) { increment = i; } - //virtual void setNumWanted(int n) { numWanted = n; } - //virtual void setKmerSize(int k) { kmerSize = k; } - //virtual void setSVG(int s) { svg = s; } - //virtual void setName(string n) { name = n; } - //virtual void setMatch(int m) { match = m; } - //virtual void setMisMatch(int m) { misMatch = m; } - //virtual void setDivR(float d) { divR = d; } - //virtual void setParents(int p) { parents = p; } - //virtual void setMinSim(int s) { minSim = s; } - //virtual void setMinCoverage(int c) { minCov = c; } - //virtual void setMinBS(int b) { minBS = b; } - //virtual void setMinSNP(int s) { minSNP = s; } - //virtual void setIters(int i) { iters = i; } virtual bool getUnaligned() { return unaligned; } - //virtual void setTemplateFile(string t) { templateFileName = t; templateSeqs = readSeqs(t); } virtual int getLength() { return length; } - - //virtual void setCons(string){}; - //virtual void setQuantiles(string){}; - //virtual int doPrep(){ return 0; } virtual vector readSeqs(string); - //virtual vector< vector > readQuantiles(); virtual void setMask(string); virtual map runFilter(Sequence*); virtual string createFilter(vector, float); - virtual void printHeader(ostream&){}; virtual int getChimeras(Sequence*){ return 0; } virtual int getChimeras(){ return 0; } @@ -138,10 +110,9 @@ class Chimera { protected: vector templateSeqs; - bool filter, unaligned; // correction, svg, - int length; //processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters, - //float divR; - string seqMask, filterString, outputDir, templateFileName; //quanfile, name, + bool filter, unaligned; + int length; + string seqMask, filterString, outputDir, templateFileName; Sequence* getSequence(string); //find sequence from name MothurOut* m; }; diff --git a/chimeracheckcommand.cpp b/chimeracheckcommand.cpp index f002a02..44696cb 100644 --- a/chimeracheckcommand.cpp +++ b/chimeracheckcommand.cpp @@ -8,7 +8,6 @@ */ #include "chimeracheckcommand.h" -#include "chimeracheckrdp.h" //*************************************************************************************************************** diff --git a/chimeracheckcommand.h b/chimeracheckcommand.h index 7c49344..89dc6aa 100644 --- a/chimeracheckcommand.h +++ b/chimeracheckcommand.h @@ -13,6 +13,7 @@ #include "mothur.h" #include "command.hpp" #include "chimera.h" +#include "chimeracheckrdp.h" /***********************************************************/ diff --git a/chimeracheckrdp.cpp b/chimeracheckrdp.cpp index 4c93c1b..e89277b 100644 --- a/chimeracheckrdp.cpp +++ b/chimeracheckrdp.cpp @@ -315,25 +315,26 @@ int ChimeraCheckRDP::calcKmers(map query, map subject) { try{ int common = 0; - map::iterator small; - map::iterator large; + map::iterator smallone; + map::iterator largeone; + if (query.size() < subject.size()) { - for (small = query.begin(); small != query.end(); small++) { - large = subject.find(small->first); + for (smallone = query.begin(); smallone != query.end(); smallone++) { + largeone = subject.find(smallone->first); //if you found it they have that kmer in common - if (large != subject.end()) { common++; } + if (largeone != subject.end()) { common++; } } }else { - for (small = subject.begin(); small != subject.end(); small++) { - large = query.find(small->first); + for (smallone = subject.begin(); smallone != subject.end(); smallone++) { + largeone = query.find(smallone->first); //if you found it they have that kmer in common - if (large != query.end()) { common++; } + if (largeone != query.end()) { common++; } } } diff --git a/consensuscommand.cpp b/consensuscommand.cpp index 37cd395..2fef5b1 100644 --- a/consensuscommand.cpp +++ b/consensuscommand.cpp @@ -262,13 +262,13 @@ int ConcensusCommand::getSets() { while (nodePairsCopy.size() != 0) { if (m->control_pressed) { return 1; } - vector small = getSmallest(nodePairsCopy); + vector smallOne = getSmallest(nodePairsCopy); - int subgrouprate = getSubgroupRating(small); + int subgrouprate = getSubgroupRating(smallOne); - nodePairsInitialRate[small] = nodePairs[small] + subgrouprate; + nodePairsInitialRate[smallOne] = nodePairs[smallOne] + subgrouprate; - nodePairsCopy.erase(small); + nodePairsCopy.erase(smallOne); } return 0; diff --git a/endiannessmacros.h b/endiannessmacros.h index 1e997f5..581fd6b 100644 --- a/endiannessmacros.h +++ b/endiannessmacros.h @@ -27,40 +27,21 @@ * */ -#include -#include - -/*----------------------------------------------------------------------------- - * Detection of endianness. The main part of this is done in autoconf, but - * for the case of MacOS FAT binaries we fall back on auto-sensing based on - * processor type too. - */ - -/* Set by autoconf */ -#define SP_LITTLE_ENDIAN /* Mac FAT binaries or unknown. Auto detect based on CPU type */ #if !defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN) - + /* * x86 equivalents */ -#if defined(__i386__) || defined(__i386) || defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__i686__) || defined(__i686) -# if defined(SP_BIG_ENDIAN) -# undef SP_BIG_ENDIAN -# endif -# define SP_LITTLE_ENDIAN +#if defined(__i386) || defined(__i386__) || defined(__ia64__) || defined(WIN32) || defined(__arm__) || (defined(__mips__) && defined(__MIPSEL__)) || defined(__SYMBIAN32__) || \ + defined(__x86_64__) || defined(__x86_64) || defined(__i686__) || defined(__i686) || defined(__amd64__) || defined(__amd64) || defined(__LITTLE_ENDIAN__) +#define SP_LITTLE_ENDIAN +#else +#define SP_BIG_ENDIAN #endif -/* - * DEC Alpha - */ -#if defined(__alpha__) || defined(__alpha) -# if defined(SP_LITTLE_ENDIAN) -# undef SP_LITTLE_ENDIAN -# endif -# define SP_BIG_ENDIAN -#endif + /* * SUN Sparc @@ -190,4 +171,5 @@ #define le_int1(x) (x) #endif -#endif \ No newline at end of file +#endif + diff --git a/fullmatrix.cpp b/fullmatrix.cpp index 8091554..fe61307 100644 --- a/fullmatrix.cpp +++ b/fullmatrix.cpp @@ -56,7 +56,7 @@ FullMatrix::FullMatrix(ifstream& filehandle) { break; } } - cout << "here" << endl; + //read rest of matrix if (square == true) { readSquareMatrix(filehandle); } else { readLTMatrix(filehandle); } @@ -74,7 +74,7 @@ FullMatrix::FullMatrix(ifstream& filehandle) { /**************************************************************************/ int FullMatrix::readSquareMatrix(ifstream& filehandle) { try { - cout << "square" << endl; + Progress* reading; reading = new Progress("Reading matrix: ", numSeqs * numSeqs); @@ -117,7 +117,7 @@ int FullMatrix::readSquareMatrix(ifstream& filehandle) { /**************************************************************************/ int FullMatrix::readLTMatrix(ifstream& filehandle) { try { - cout << "lt" << endl; + Progress* reading; reading = new Progress("Reading matrix: ", numSeqs * (numSeqs - 1) / 2); diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index b92927b..269e088 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -517,14 +517,17 @@ ListVector* MGClusterCommand::mergeOPFs(map binInfo, float dist){ string name2 = nameMap->get(overlapNode.seq2); //use binInfo to find out if they are already in the same bin - map::iterator itBin1 = binInfo.find(name1); - map::iterator itBin2 = binInfo.find(name2); + //map::iterator itBin1 = binInfo.find(name1); + //map::iterator itBin2 = binInfo.find(name2); - if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); } - if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); } + //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); } + //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); } - int binKeep = itBin1->second; - int binRemove = itBin2->second; + //int binKeep = itBin1->second; + //int binRemove = itBin2->second; + + int binKeep = binInfo[name1]; + int binRemove = binInfo[name2]; //if not merge bins and update binInfo if(binKeep != binRemove) { diff --git a/mothur.cpp b/mothur.cpp index 6e61f87..a95c381 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -73,7 +73,18 @@ int main(int argc, char *argv[]){ #ifdef MOTHUR_FILES string temp = MOTHUR_FILES; + + //add / to name if needed + string lastChar = temp.substr(temp.length()-1); + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if (lastChar != "/") { temp += "/"; } + #else + if (lastChar != "\\") { temp += "\\"; } + #endif + + temp = getFullPathName(temp); m->setDefaultPath(temp); + m->mothurOutJustToLog("Using default file location " + temp); m->mothurOutEndLine(); m->mothurOutEndLine(); #endif diff --git a/onegapdist.h b/onegapdist.h index c466c53..cb63ae6 100644 --- a/onegapdist.h +++ b/onegapdist.h @@ -38,6 +38,9 @@ public: for(int i=start;i> seqName; seqName = seqName.substr(1); - getline(qFile, line); - istringstream qualStream(line); + //getline(qFile, line); + //istringstream qualStream(line); - while(qualStream){ - qualStream >> score; + //while(qualStream){ + // qualStream >> score; + // qScores.push_back(score); + //} + //qScores.pop_back(); + + //seqLength = qScores.size(); + + for(int i=0;i> score; qScores.push_back(score); } - qScores.pop_back(); - - seqLength = qScores.size(); + gobble(qFile); + } catch(exception& e) { m->errorOut(e, "QualityScores", "QualityScores"); diff --git a/qualityscores.h b/qualityscores.h index 710200a..5db4311 100644 --- a/qualityscores.h +++ b/qualityscores.h @@ -20,7 +20,7 @@ class QualityScores { public: QualityScores(); - QualityScores(ifstream&); + QualityScores(ifstream&, int); string getName(); void printQScores(ofstream&); void trimQScores(int, int); diff --git a/sensspeccommand.cpp b/sensspeccommand.cpp index 3a147c5..e647bf2 100644 --- a/sensspeccommand.cpp +++ b/sensspeccommand.cpp @@ -457,3 +457,6 @@ void SensSpecCommand::outputStatistics(string label, string cutoff){ } //*************************************************************************************************************** + + + diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp index 01338ca..e2527ff 100644 --- a/sffinfocommand.cpp +++ b/sffinfocommand.cpp @@ -1,655 +1,674 @@ -/* - * sffinfocommand.cpp - * Mothur - * - * Created by westcott on 7/7/10. - * Copyright 2010 Schloss Lab. All rights reserved. - * - */ - -#include "sffinfocommand.h" -#include "endiannessmacros.h" - -//********************************************************************************************************************** - -SffInfoCommand::SffInfoCommand(string option) { - try { - abort = false; - hasAccnos = false; - - //allow user to run help - if(option == "help") { help(); abort = true; } - - else { - //valid paramters for this command - string Array[] = {"sff","qfile","fasta","flow","trim","accnos","sfftxt","outputdir","inputdir", "outputdir"}; - vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); - - OptionParser parser(option); - map parameters = parser.getParameters(); - - ValidParameters validParameter; - //check to make sure all parameters are valid for command - for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { - if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } - } - - //if the user changes the output directory command factory will send this info to us in the output parameter - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } - - //if the user changes the input directory command factory will send this info to us in the output parameter - string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } - - sffFilename = validParameter.validFile(parameters, "sff", false); - if (sffFilename == "not found") { m->mothurOut("sff is a required parameter for the sffinfo command."); m->mothurOutEndLine(); abort = true; } - else { - splitAtDash(sffFilename, filenames); - - //go through files and make sure they are good, if not, then disregard them - for (int i = 0; i < filenames.size(); i++) { - if (inputDir != "") { - string path = hasPath(filenames[i]); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { filenames[i] = inputDir + filenames[i]; } - } - - ifstream in; - int ableToOpen = openInputFile(filenames[i], in, "noerror"); - - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getDefaultPath() != "") { //default path is set - string tryPath = m->getDefaultPath() + getSimpleName(filenames[i]); - m->mothurOut("Unable to open " + filenames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ableToOpen = openInputFile(tryPath, in, "noerror"); - filenames[i] = tryPath; - } - } - in.close(); - - if (ableToOpen == 1) { - m->mothurOut("Unable to open " + filenames[i] + ". It will be disregarded."); m->mothurOutEndLine(); - //erase from file list - filenames.erase(filenames.begin()+i); - i--; - } - } - - //make sure there is at least one valid file left - if (filenames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } - } - - accnosName = validParameter.validFile(parameters, "accnos", false); - if (accnosName == "not found") { accnosName = ""; } - else { - hasAccnos = true; - splitAtDash(accnosName, accnosFileNames); - - //go through files and make sure they are good, if not, then disregard them - for (int i = 0; i < accnosFileNames.size(); i++) { - if (inputDir != "") { - string path = hasPath(accnosFileNames[i]); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { accnosFileNames[i] = inputDir + accnosFileNames[i]; } - } - - ifstream in; - int ableToOpen = openInputFile(accnosFileNames[i], in, "noerror"); - - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getDefaultPath() != "") { //default path is set - string tryPath = m->getDefaultPath() + getSimpleName(accnosFileNames[i]); - m->mothurOut("Unable to open " + accnosFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ableToOpen = openInputFile(tryPath, in, "noerror"); - accnosFileNames[i] = tryPath; - } - } - in.close(); - - if (ableToOpen == 1) { - m->mothurOut("Unable to open " + accnosFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); - //erase from file list - accnosFileNames.erase(accnosFileNames.begin()+i); - i--; - } - } - - //make sure there is at least one valid file left - if (accnosFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } - } - - if (hasAccnos) { - if (accnosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a accnos file, you must have one for each sff file."); m->mothurOutEndLine(); } - } - - string temp = validParameter.validFile(parameters, "qfile", false); if (temp == "not found"){ temp = "T"; } - qual = isTrue(temp); - - temp = validParameter.validFile(parameters, "fasta", false); if (temp == "not found"){ temp = "T"; } - fasta = isTrue(temp); - - temp = validParameter.validFile(parameters, "flow", false); if (temp == "not found"){ temp = "F"; } - flow = isTrue(temp); - - temp = validParameter.validFile(parameters, "trim", false); if (temp == "not found"){ temp = "T"; } - trim = isTrue(temp); - - temp = validParameter.validFile(parameters, "sfftxt", false); if (temp == "not found"){ temp = "F"; } - sfftxt = isTrue(temp); - } - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "SffInfoCommand"); - exit(1); - } -} -//********************************************************************************************************************** - -void SffInfoCommand::help(){ - try { - m->mothurOut("The sffinfo command reads a sff file and extracts the sequence data.\n"); - m->mothurOut("The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, and trim. sff is required. \n"); - m->mothurOut("The sff parameter allows you to enter the sff file you would like to extract data from. You may enter multiple files by separating them by -'s.\n"); - m->mothurOut("The fasta parameter allows you to indicate if you would like a fasta formatted file generated. Default=True. \n"); - m->mothurOut("The qfile parameter allows you to indicate if you would like a quality file generated. Default=True. \n"); - m->mothurOut("The flow parameter allows you to indicate if you would like a flowgram file generated. Default=False. \n"); - m->mothurOut("The sfftxt parameter allows you to indicate if you would like a sff.txt file generated. Default=False. \n"); - m->mothurOut("The trim parameter allows you to indicate if you would like a sequences and quality scores trimmed to the clipQualLeft and clipQualRight values. Default=True. \n"); - m->mothurOut("The accnos parameter allows you to provide a accnos file containing the names of the sequences you would like extracted. You may enter multiple files by separating them by -'s. \n"); - m->mothurOut("Example sffinfo(sff=mySffFile.sff, trim=F).\n"); - m->mothurOut("Note: No spaces between parameter labels (i.e. sff), '=' and parameters (i.e.yourSffFileName).\n\n"); - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "help"); - exit(1); - } -} -//********************************************************************************************************************** - -SffInfoCommand::~SffInfoCommand(){} - -//********************************************************************************************************************** -int SffInfoCommand::execute(){ - try { - - if (abort == true) { return 0; } - - for (int s = 0; s < filenames.size(); s++) { - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } - - int start = time(NULL); - - m->mothurOut("Extracting info from " + filenames[s] + " ..." ); m->mothurOutEndLine(); - - string accnos = ""; - if (hasAccnos) { accnos = accnosFileNames[s]; } - - int numReads = extractSffInfo(filenames[s], accnos); - - m->mothurOut("It took " + toString(time(NULL) - start) + " secs to extract " + toString(numReads) + "."); - } - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } - - //report output filenames - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } - m->mothurOutEndLine(); - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "execute"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::extractSffInfo(string input, string accnos){ - try { - - if (outputDir == "") { outputDir += hasPath(input); } - - if (accnos != "") { readAccnosFile(accnos); } - else { seqNames.clear(); } - - ofstream outSfftxt, outFasta, outQual, outFlow; - string outFastaFileName, outQualFileName; - string sfftxtFileName = outputDir + getRootName(getSimpleName(input)) + "sff.txt"; - string outFlowFileName = outputDir + getRootName(getSimpleName(input)) + "flow"; - if (trim) { - outFastaFileName = outputDir + getRootName(getSimpleName(input)) + "fasta"; - outQualFileName = outputDir + getRootName(getSimpleName(input)) + "qual"; - }else{ - outFastaFileName = outputDir + getRootName(getSimpleName(input)) + "raw.fasta"; - outQualFileName = outputDir + getRootName(getSimpleName(input)) + "raw.qual"; - } - - if (sfftxt) { openOutputFile(sfftxtFileName, outSfftxt); outSfftxt.setf(ios::fixed, ios::floatfield); outSfftxt.setf(ios::showpoint); outputNames.push_back(sfftxtFileName); } - if (fasta) { openOutputFile(outFastaFileName, outFasta); outputNames.push_back(outFastaFileName); } - if (qual) { openOutputFile(outQualFileName, outQual); outputNames.push_back(outQualFileName); } - if (flow) { openOutputFile(outFlowFileName, outFlow); outputNames.push_back(outFlowFileName); } - - ifstream in; - in.open(input.c_str(), ios::binary); - - CommonHeader header; - readCommonHeader(in, header); - - int count = 0; - - //check magic number and version - if (header.magicNumber != 779314790) { m->mothurOut("Magic Number is not correct, not a valid .sff file"); m->mothurOutEndLine(); return count; } - if (header.version != "0001") { m->mothurOut("Version is not supported, only support version 0001."); m->mothurOutEndLine(); return count; } - - //print common header - if (sfftxt) { printCommonHeader(outSfftxt, header); } - - //read through the sff file - while (!in.eof()) { - - bool print = true; - - //read header - Header readheader; - readHeader(in, readheader); - - //read data - seqRead read; - readSeqData(in, read, header.numFlowsPerRead, readheader.numBases); - - //if you have provided an accosfile and this seq is not in it, then dont print - if (seqNames.size() != 0) { if (seqNames.count(readheader.name) == 0) { print = false; } } - - //print - if (print) { - if (sfftxt) { printHeader(outSfftxt, readheader); printSffTxtSeqData(outSfftxt, read); } - if (fasta) { printFastaSeqData(outFasta, read, readheader); } - if (qual) { printQualSeqData(outQual, read, readheader); } - if (flow) { printFlowSeqData(outFlow, read, readheader); } - } - - count++; - - //report progress - if((count+1) % 500 == 0){ m->mothurOut(toString(count+1)); m->mothurOutEndLine(); } - - if (m->control_pressed) { count = 0; break; } - - if (count >= header.numReads) { break; } - } - - //report progress - if (!m->control_pressed) { if((count) % 500 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } - - in.close(); - - if (sfftxt) { outSfftxt.close(); } - if (fasta) { outFasta.close(); } - if (qual) { outQual.close(); } - if (flow) { outFlow.close(); } - - return count; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "extractSffInfo"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){ - try { - - if (!in.eof()) { - - //read magic number - char buffer[sizeof(header.magicNumber)]; - in.read(buffer, sizeof(header.magicNumber)); - header.magicNumber = be_int4(*(unsigned int *)(&buffer)); - - //read version - char buffer9[4]; - in.read(buffer9, 4); - header.version = ""; - for (int i = 0; i < 4; i++) { header.version += toString((int)(buffer9[i])); } - - //read offset - char buffer2 [sizeof(header.indexOffset)]; - in.read(buffer2, sizeof(header.indexOffset)); - header.indexOffset = be_int8(*(unsigned long int *)(&buffer2)); - - //read index length - char buffer3 [sizeof(header.indexLength)]; - in.read(buffer3, sizeof(header.indexLength)); - header.indexLength = be_int4(*(unsigned int *)(&buffer3)); - - //read num reads - char buffer4 [sizeof(header.numReads)]; - in.read(buffer4, sizeof(header.numReads)); - header.numReads = be_int4(*(unsigned int *)(&buffer4)); - - //read header length - char buffer5 [sizeof(header.headerLength)]; - in.read(buffer5, sizeof(header.headerLength)); - header.headerLength = be_int2(*(unsigned short *)(&buffer5)); - - //read key length - char buffer6 [sizeof(header.keyLength)]; - in.read(buffer6, sizeof(header.keyLength)); - header.keyLength = be_int2(*(unsigned short *)(&buffer6)); - - //read number of flow reads - char buffer7 [sizeof(header.numFlowsPerRead)]; - in.read(buffer7, sizeof(header.numFlowsPerRead)); - header.numFlowsPerRead = be_int2(*(unsigned short *)(&buffer7)); - - //read format code - char buffer8 [1]; - in.read(buffer8, 1); - header.flogramFormatCode = (int)(buffer8[0]); - - //read flow chars - char tempBuffer [header.numFlowsPerRead]; - in.read(tempBuffer, header.numFlowsPerRead); - header.flowChars = tempBuffer; - if (header.flowChars.length() > header.numFlowsPerRead) { header.flowChars = header.flowChars.substr(0, header.numFlowsPerRead); } - - //read key - char tempBuffer2 [header.keyLength]; - in.read(tempBuffer2, header.keyLength); - header.keySequence = tempBuffer2; - if (header.keySequence.length() > header.keyLength) { header.keySequence = header.keySequence.substr(0, header.keyLength); } - - /* Pad to 8 chars */ - unsigned long int spotInFile = in.tellg(); - unsigned long int spot = (spotInFile + 7)& ~7; // ~ inverts - in.seekg(spot); - - }else{ - m->mothurOut("Error reading sff common header."); m->mothurOutEndLine(); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "readCommonHeader"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::readHeader(ifstream& in, Header& header){ - try { - - if (!in.eof()) { - - //read header length - char buffer [sizeof(header.headerLength)]; - in.read(buffer, sizeof(header.headerLength)); - header.headerLength = be_int2(*(unsigned short *)(&buffer)); - - //read name length - char buffer2 [sizeof(header.nameLength)]; - in.read(buffer2, sizeof(header.nameLength)); - header.nameLength = be_int2(*(unsigned short *)(&buffer2)); - - //read num bases - char buffer3 [sizeof(header.numBases)]; - in.read(buffer3, sizeof(header.numBases)); - header.numBases = be_int4(*(unsigned int *)(&buffer3)); - - //read clip qual left - char buffer4 [sizeof(header.clipQualLeft)]; - in.read(buffer4, sizeof(header.clipQualLeft)); - header.clipQualLeft = be_int2(*(unsigned short *)(&buffer4)); - - //read clip qual right - char buffer5 [sizeof(header.clipQualRight)]; - in.read(buffer5, sizeof(header.clipQualRight)); - header.clipQualRight = be_int2(*(unsigned short *)(&buffer5)); - - //read clipAdapterLeft - char buffer6 [sizeof(header.clipAdapterLeft)]; - in.read(buffer6, sizeof(header.clipAdapterLeft)); - header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6)); - - //read clipAdapterRight - char buffer7 [sizeof(header.clipAdapterRight)]; - in.read(buffer7, sizeof(header.clipAdapterRight)); - header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7)); - - //read name - char tempBuffer [header.nameLength]; - in.read(tempBuffer, header.nameLength); - header.name = tempBuffer; - if (header.name.length() > header.nameLength) { header.name = header.name.substr(0, header.nameLength); } - - /* Pad to 8 chars */ - unsigned long int spotInFile = in.tellg(); - unsigned long int spot = (spotInFile + 7)& ~7; - in.seekg(spot); - - }else{ - m->mothurOut("Error reading sff header info."); m->mothurOutEndLine(); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "readHeader"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, int numBases){ - try { - - if (!in.eof()) { - - //read flowgram - read.flowgram.resize(numFlowReads); - for (int i = 0; i < numFlowReads; i++) { - char buffer [sizeof(unsigned short)]; - in.read(buffer, (sizeof(unsigned short))); - read.flowgram[i] = be_int2(*(unsigned short *)(&buffer)); - } - - //read flowIndex - read.flowIndex.resize(numBases); - for (int i = 0; i < numBases; i++) { - char temp[1]; - in.read(temp, 1); - read.flowIndex[i] = be_int1(*(unsigned char *)(&temp)); - } - - //read bases - char tempBuffer[numBases]; - in.read(tempBuffer, numBases); - read.bases = tempBuffer; - if (read.bases.length() > numBases) { read.bases = read.bases.substr(0, numBases); } - - //read flowgram - read.qualScores.resize(numBases); - for (int i = 0; i < numBases; i++) { - char temp[1]; - in.read(temp, 1); - read.qualScores[i] = be_int1(*(unsigned char *)(&temp)); - } - - /* Pad to 8 chars */ - unsigned long int spotInFile = in.tellg(); - unsigned long int spot = (spotInFile + 7)& ~7; - in.seekg(spot); - - }else{ - m->mothurOut("Error reading."); m->mothurOutEndLine(); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "readSeqData"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::printCommonHeader(ofstream& out, CommonHeader& header) { - try { - - out << "Common Header:\nMagic Number: " << header.magicNumber << endl; - out << "Version: " << header.version << endl; - out << "Index Offset: " << header.indexOffset << endl; - out << "Index Length: " << header.indexLength << endl; - out << "Number of Reads: " << header.numReads << endl; - out << "Header Length: " << header.headerLength << endl; - out << "Key Length: " << header.keyLength << endl; - out << "Number of Flows: " << header.numFlowsPerRead << endl; - out << "Format Code: " << header.flogramFormatCode << endl; - out << "Flow Chars: " << header.flowChars << endl; - out << "Key Sequence: " << header.keySequence << endl << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printCommonHeader"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::printHeader(ofstream& out, Header& header) { - try { - - out << ">" << header.name << endl; - out << "Run Prefix: " << endl; - out << "Region #: " << endl; - out << "XY Location: " << endl << endl; - - out << "Run Name: " << endl; - out << "Analysis Name: " << endl; - out << "Full Path: " << endl << endl; - - out << "Read Header Len: " << header.headerLength << endl; - out << "Name Length: " << header.nameLength << endl; - out << "# of Bases: " << header.numBases << endl; - out << "Clip Qual Left: " << header.clipQualLeft << endl; - out << "Clip Qual Right: " << header.clipQualRight << endl; - out << "Clip Adap Left: " << header.clipAdapterLeft << endl; - out << "Clip Adap Right: " << header.clipAdapterRight << endl << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printHeader"); - exit(1); - } -} - -//********************************************************************************************************************** -int SffInfoCommand::printSffTxtSeqData(ofstream& out, seqRead& read) { - try { - - out << "FlowGram: "; - for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << (read.flowgram[i]/(float)100) << '\t'; } - - out << endl << "Flow Indexes: "; - int sum = 0; - for (int i = 0; i < read.flowIndex.size(); i++) { sum += read.flowIndex[i]; out << sum << '\t'; } - - out << endl << "Bases: " << read.bases << endl << "Quality Scores: "; - for (int i = 0; i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } - out << endl << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printSffTxtSeqData"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::printFastaSeqData(ofstream& out, seqRead& read, Header& header) { - try { - - string seq = read.bases; - - - if (trim) { - seq = seq.substr(header.clipQualLeft, (header.clipQualRight-header.clipQualLeft)); - } - - out << ">" << header.name << endl; - out << seq << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printFastaSeqData"); - exit(1); - } -} - -//********************************************************************************************************************** -int SffInfoCommand::printQualSeqData(ofstream& out, seqRead& read, Header& header) { - try { - - if (trim) { - out << ">" << header.name << " length=" << (header.clipQualRight-header.clipQualLeft) << endl; - for (int i = header.clipQualLeft; i < (header.clipQualRight-header.clipQualLeft); i++) { out << read.qualScores[i] << '\t'; } - }else{ - out << ">" << header.name << " length=" << read.qualScores.size() << endl; - for (int i = 0; i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } - } - - out << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printQualSeqData"); - exit(1); - } -} - -//********************************************************************************************************************** -int SffInfoCommand::printFlowSeqData(ofstream& out, seqRead& read, Header& header) { - try { - - out << ">" << header.name << endl; - for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << (read.flowgram[i]/(float)100) << '\t'; } - out << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printFlowSeqData"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::readAccnosFile(string filename) { - try { - //remove old names - seqNames.clear(); - - ifstream in; - openInputFile(filename, in); - string name; - - while(!in.eof()){ - in >> name; gobble(in); - - seqNames.insert(name); - - if (m->control_pressed) { seqNames.clear(); break; } - } - in.close(); - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "readAccnosFile"); - exit(1); - } -} -//**********************************************************************************************************************/ +/* + * sffinfocommand.cpp + * Mothur + * + * Created by westcott on 7/7/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "sffinfocommand.h" +#include "endiannessmacros.h" + +//********************************************************************************************************************** + +SffInfoCommand::SffInfoCommand(string option) { + try { + abort = false; + hasAccnos = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"sff","qfile","fasta","flow","trim","accnos","sfftxt","outputdir","inputdir", "outputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } + + sffFilename = validParameter.validFile(parameters, "sff", false); + if (sffFilename == "not found") { m->mothurOut("sff is a required parameter for the sffinfo command."); m->mothurOutEndLine(); abort = true; } + else { + splitAtDash(sffFilename, filenames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < filenames.size(); i++) { + if (inputDir != "") { + string path = hasPath(filenames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { filenames[i] = inputDir + filenames[i]; } + } + + ifstream in; + int ableToOpen = openInputFile(filenames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + getSimpleName(filenames[i]); + m->mothurOut("Unable to open " + filenames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ableToOpen = openInputFile(tryPath, in, "noerror"); + filenames[i] = tryPath; + } + } + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + filenames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + filenames.erase(filenames.begin()+i); + i--; + } + } + + //make sure there is at least one valid file left + if (filenames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } + } + + accnosName = validParameter.validFile(parameters, "accnos", false); + if (accnosName == "not found") { accnosName = ""; } + else { + hasAccnos = true; + splitAtDash(accnosName, accnosFileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < accnosFileNames.size(); i++) { + if (inputDir != "") { + string path = hasPath(accnosFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { accnosFileNames[i] = inputDir + accnosFileNames[i]; } + } + + ifstream in; + int ableToOpen = openInputFile(accnosFileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + getSimpleName(accnosFileNames[i]); + m->mothurOut("Unable to open " + accnosFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ableToOpen = openInputFile(tryPath, in, "noerror"); + accnosFileNames[i] = tryPath; + } + } + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + accnosFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + accnosFileNames.erase(accnosFileNames.begin()+i); + i--; + } + } + + //make sure there is at least one valid file left + if (accnosFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } + } + + if (hasAccnos) { + if (accnosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a accnos file, you must have one for each sff file."); m->mothurOutEndLine(); } + } + + string temp = validParameter.validFile(parameters, "qfile", false); if (temp == "not found"){ temp = "T"; } + qual = isTrue(temp); + + temp = validParameter.validFile(parameters, "fasta", false); if (temp == "not found"){ temp = "T"; } + fasta = isTrue(temp); + + temp = validParameter.validFile(parameters, "flow", false); if (temp == "not found"){ temp = "F"; } + flow = isTrue(temp); + + temp = validParameter.validFile(parameters, "trim", false); if (temp == "not found"){ temp = "T"; } + trim = isTrue(temp); + + temp = validParameter.validFile(parameters, "sfftxt", false); if (temp == "not found"){ temp = "F"; } + sfftxt = isTrue(temp); + } + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "SffInfoCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void SffInfoCommand::help(){ + try { + m->mothurOut("The sffinfo command reads a sff file and extracts the sequence data.\n"); + m->mothurOut("The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, and trim. sff is required. \n"); + m->mothurOut("The sff parameter allows you to enter the sff file you would like to extract data from. You may enter multiple files by separating them by -'s.\n"); + m->mothurOut("The fasta parameter allows you to indicate if you would like a fasta formatted file generated. Default=True. \n"); + m->mothurOut("The qfile parameter allows you to indicate if you would like a quality file generated. Default=True. \n"); + m->mothurOut("The flow parameter allows you to indicate if you would like a flowgram file generated. Default=False. \n"); + m->mothurOut("The sfftxt parameter allows you to indicate if you would like a sff.txt file generated. Default=False. \n"); + m->mothurOut("The trim parameter allows you to indicate if you would like a sequences and quality scores trimmed to the clipQualLeft and clipQualRight values. Default=True. \n"); + m->mothurOut("The accnos parameter allows you to provide a accnos file containing the names of the sequences you would like extracted. You may enter multiple files by separating them by -'s. \n"); + m->mothurOut("Example sffinfo(sff=mySffFile.sff, trim=F).\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. sff), '=' and parameters (i.e.yourSffFileName).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "help"); + exit(1); + } +} +//********************************************************************************************************************** + +SffInfoCommand::~SffInfoCommand(){} + +//********************************************************************************************************************** +int SffInfoCommand::execute(){ + try { + + if (abort == true) { return 0; } + + for (int s = 0; s < filenames.size(); s++) { + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + int start = time(NULL); + + m->mothurOut("Extracting info from " + filenames[s] + " ..." ); m->mothurOutEndLine(); + + string accnos = ""; + if (hasAccnos) { accnos = accnosFileNames[s]; } + + int numReads = extractSffInfo(filenames[s], accnos); + + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to extract " + toString(numReads) + "."); + } + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + //report output filenames + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::extractSffInfo(string input, string accnos){ + try { + + if (outputDir == "") { outputDir += hasPath(input); } + + if (accnos != "") { readAccnosFile(accnos); } + else { seqNames.clear(); } + + ofstream outSfftxt, outFasta, outQual, outFlow; + string outFastaFileName, outQualFileName; + string sfftxtFileName = outputDir + getRootName(getSimpleName(input)) + "sff.txt"; + string outFlowFileName = outputDir + getRootName(getSimpleName(input)) + "flow"; + if (trim) { + outFastaFileName = outputDir + getRootName(getSimpleName(input)) + "fasta"; + outQualFileName = outputDir + getRootName(getSimpleName(input)) + "qual"; + }else{ + outFastaFileName = outputDir + getRootName(getSimpleName(input)) + "raw.fasta"; + outQualFileName = outputDir + getRootName(getSimpleName(input)) + "raw.qual"; + } + + if (sfftxt) { openOutputFile(sfftxtFileName, outSfftxt); outSfftxt.setf(ios::fixed, ios::floatfield); outSfftxt.setf(ios::showpoint); outputNames.push_back(sfftxtFileName); } + if (fasta) { openOutputFile(outFastaFileName, outFasta); outputNames.push_back(outFastaFileName); } + if (qual) { openOutputFile(outQualFileName, outQual); outputNames.push_back(outQualFileName); } + if (flow) { openOutputFile(outFlowFileName, outFlow); outputNames.push_back(outFlowFileName); } + + ifstream in; + in.open(input.c_str(), ios::binary); + + CommonHeader header; + readCommonHeader(in, header); + + int count = 0; + + //check magic number and version + if (header.magicNumber != 779314790) { m->mothurOut("Magic Number is not correct, not a valid .sff file"); m->mothurOutEndLine(); return count; } + if (header.version != "0001") { m->mothurOut("Version is not supported, only support version 0001."); m->mothurOutEndLine(); return count; } + + //print common header + if (sfftxt) { printCommonHeader(outSfftxt, header); } + + //read through the sff file + while (!in.eof()) { + + bool print = true; + + //read header + Header readheader; + readHeader(in, readheader); + + //read data + seqRead read; + readSeqData(in, read, header.numFlowsPerRead, readheader.numBases); + + //if you have provided an accosfile and this seq is not in it, then dont print + if (seqNames.size() != 0) { if (seqNames.count(readheader.name) == 0) { print = false; } } + + //print + if (print) { + if (sfftxt) { printHeader(outSfftxt, readheader); printSffTxtSeqData(outSfftxt, read, readheader); } + if (fasta) { printFastaSeqData(outFasta, read, readheader); } + if (qual) { printQualSeqData(outQual, read, readheader); } + if (flow) { printFlowSeqData(outFlow, read, readheader); } + } + + count++; + + //report progress + if((count+1) % 10000 == 0){ m->mothurOut(toString(count+1)); m->mothurOutEndLine(); } + + if (m->control_pressed) { count = 0; break; } + + if (count >= header.numReads) { break; } + } + + //report progress + if (!m->control_pressed) { if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } + + in.close(); + + if (sfftxt) { outSfftxt.close(); } + if (fasta) { outFasta.close(); } + if (qual) { outQual.close(); } + if (flow) { outFlow.close(); } + + return count; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "extractSffInfo"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){ + try { + + if (!in.eof()) { + + //read magic number + char buffer[4]; + in.read(buffer, 4); + header.magicNumber = be_int4(*(unsigned int *)(&buffer)); + + //read version + char buffer9[4]; + in.read(buffer9, 4); + header.version = ""; + for (int i = 0; i < 4; i++) { header.version += toString((int)(buffer9[i])); } + + //read offset + char buffer2 [8]; + in.read(buffer2, 8); + header.indexOffset = be_int8(*(unsigned long int *)(&buffer2)); + + //read index length + char buffer3 [4]; + in.read(buffer3, 4); + header.indexLength = be_int4(*(unsigned int *)(&buffer3)); + + //read num reads + char buffer4 [4]; + in.read(buffer4, 4); + header.numReads = be_int4(*(unsigned int *)(&buffer4)); + + //read header length + char buffer5 [2]; + in.read(buffer5, 2); + header.headerLength = be_int2(*(unsigned short *)(&buffer5)); + + //read key length + char buffer6 [2]; + in.read(buffer6, 2); + header.keyLength = be_int2(*(unsigned short *)(&buffer6)); + + //read number of flow reads + char buffer7 [2]; + in.read(buffer7, 2); + header.numFlowsPerRead = be_int2(*(unsigned short *)(&buffer7)); + + //read format code + char buffer8 [1]; + in.read(buffer8, 1); + header.flogramFormatCode = (int)(buffer8[0]); + + //read flow chars + char* tempBuffer = new char[header.numFlowsPerRead]; + in.read(&(*tempBuffer), header.numFlowsPerRead); + header.flowChars = tempBuffer; + if (header.flowChars.length() > header.numFlowsPerRead) { header.flowChars = header.flowChars.substr(0, header.numFlowsPerRead); } + delete[] tempBuffer; + + //read key + char* tempBuffer2 = new char[header.keyLength]; + in.read(&(*tempBuffer2), header.keyLength); + header.keySequence = tempBuffer2; + if (header.keySequence.length() > header.keyLength) { header.keySequence = header.keySequence.substr(0, header.keyLength); } + delete[] tempBuffer2; + + /* Pad to 8 chars */ + unsigned long int spotInFile = in.tellg(); + unsigned long int spot = (spotInFile + 7)& ~7; // ~ inverts + in.seekg(spot); + + }else{ + m->mothurOut("Error reading sff common header."); m->mothurOutEndLine(); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readCommonHeader"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::readHeader(ifstream& in, Header& header){ + try { + + if (!in.eof()) { + + //read header length + char buffer [2]; + in.read(buffer, 2); + header.headerLength = be_int2(*(unsigned short *)(&buffer)); + + //read name length + char buffer2 [2]; + in.read(buffer2, 2); + header.nameLength = be_int2(*(unsigned short *)(&buffer2)); + + //read num bases + char buffer3 [4]; + in.read(buffer3, 4); + header.numBases = be_int4(*(unsigned int *)(&buffer3)); + + //read clip qual left + char buffer4 [2]; + in.read(buffer4, 2); + header.clipQualLeft = be_int2(*(unsigned short *)(&buffer4)); + + //read clip qual right + char buffer5 [2]; + in.read(buffer5, 2); + header.clipQualRight = be_int2(*(unsigned short *)(&buffer5)); + + //read clipAdapterLeft + char buffer6 [2]; + in.read(buffer6, 2); + header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6)); + + //read clipAdapterRight + char buffer7 [2]; + in.read(buffer7, 2); + header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7)); + + //read name + char* tempBuffer = new char[header.nameLength]; + in.read(&(*tempBuffer), header.nameLength); + header.name = tempBuffer; + if (header.name.length() > header.nameLength) { header.name = header.name.substr(0, header.nameLength); } + delete[] tempBuffer; + + /* Pad to 8 chars */ + unsigned long int spotInFile = in.tellg(); + unsigned long int spot = (spotInFile + 7)& ~7; + in.seekg(spot); + + }else{ + m->mothurOut("Error reading sff header info."); m->mothurOutEndLine(); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readHeader"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, int numBases){ + try { + + if (!in.eof()) { + + //read flowgram + read.flowgram.resize(numFlowReads); + for (int i = 0; i < numFlowReads; i++) { + char buffer [2]; + in.read(buffer, 2); + read.flowgram[i] = be_int2(*(unsigned short *)(&buffer)); + } + + //read flowIndex + read.flowIndex.resize(numBases); + for (int i = 0; i < numBases; i++) { + char temp[1]; + in.read(temp, 1); + read.flowIndex[i] = be_int1(*(unsigned char *)(&temp)); + } + + //read bases + char* tempBuffer = new char[numBases]; + in.read(&(*tempBuffer), numBases); + read.bases = tempBuffer; + if (read.bases.length() > numBases) { read.bases = read.bases.substr(0, numBases); } + delete[] tempBuffer; + + //read qual scores + read.qualScores.resize(numBases); + for (int i = 0; i < numBases; i++) { + char temp[1]; + in.read(temp, 1); + read.qualScores[i] = be_int1(*(unsigned char *)(&temp)); + } + + /* Pad to 8 chars */ + unsigned long int spotInFile = in.tellg(); + unsigned long int spot = (spotInFile + 7)& ~7; + in.seekg(spot); + + }else{ + m->mothurOut("Error reading."); m->mothurOutEndLine(); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readSeqData"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::printCommonHeader(ofstream& out, CommonHeader& header) { + try { + + out << "Common Header:\nMagic Number: " << header.magicNumber << endl; + out << "Version: " << header.version << endl; + out << "Index Offset: " << header.indexOffset << endl; + out << "Index Length: " << header.indexLength << endl; + out << "Number of Reads: " << header.numReads << endl; + out << "Header Length: " << header.headerLength << endl; + out << "Key Length: " << header.keyLength << endl; + out << "Number of Flows: " << header.numFlowsPerRead << endl; + out << "Format Code: " << header.flogramFormatCode << endl; + out << "Flow Chars: " << header.flowChars << endl; + out << "Key Sequence: " << header.keySequence << endl << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printCommonHeader"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::printHeader(ofstream& out, Header& header) { + try { + + out << ">" << header.name << endl; + out << "Run Prefix: " << endl; + out << "Region #: " << endl; + out << "XY Location: " << endl << endl; + + out << "Run Name: " << endl; + out << "Analysis Name: " << endl; + out << "Full Path: " << endl << endl; + + out << "Read Header Len: " << header.headerLength << endl; + out << "Name Length: " << header.nameLength << endl; + out << "# of Bases: " << header.numBases << endl; + out << "Clip Qual Left: " << header.clipQualLeft << endl; + out << "Clip Qual Right: " << header.clipQualRight << endl; + out << "Clip Adap Left: " << header.clipAdapterLeft << endl; + out << "Clip Adap Right: " << header.clipAdapterRight << endl << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printHeader"); + exit(1); + } +} + +//********************************************************************************************************************** +int SffInfoCommand::printSffTxtSeqData(ofstream& out, seqRead& read, Header& header) { + try { + + out << "FlowGram: "; + for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << (read.flowgram[i]/(float)100) << '\t'; } + + out << endl << "Flow Indexes: "; + int sum = 0; + for (int i = 0; i < read.flowIndex.size(); i++) { sum += read.flowIndex[i]; out << sum << '\t'; } + + //make the bases you want to clip lowercase and the bases you want to keep upper case + for (int i = 0; i < header.clipQualLeft; i++) { read.bases[i] = tolower(read.bases[i]); } + for (int i = header.clipQualLeft; i < (header.clipQualRight-header.clipQualLeft); i++) { read.bases[i] = toupper(read.bases[i]); } + for (int i = (header.clipQualRight-header.clipQualLeft); i < read.bases.length(); i++) { read.bases[i] = tolower(read.bases[i]); } + + out << endl << "Bases: " << read.bases << endl << "Quality Scores: "; + for (int i = 0; i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } + + + out << endl << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printSffTxtSeqData"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::printFastaSeqData(ofstream& out, seqRead& read, Header& header) { + try { + + string seq = read.bases; + + + if (trim) { + seq = seq.substr(header.clipQualLeft, (header.clipQualRight-header.clipQualLeft)); + }else{ + //if you wanted the sfftxt then you already converted the bases to the right case + if (!sfftxt) { + //make the bases you want to clip lowercase and the bases you want to keep upper case + for (int i = 0; i < header.clipQualLeft; i++) { seq[i] = tolower(seq[i]); } + for (int i = header.clipQualLeft; i < (header.clipQualRight-header.clipQualLeft); i++) { seq[i] = toupper(seq[i]); } + for (int i = (header.clipQualRight-header.clipQualLeft); i < seq.length(); i++) { seq[i] = tolower(seq[i]); } + } + } + + out << ">" << header.name << endl; + out << seq << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printFastaSeqData"); + exit(1); + } +} + +//********************************************************************************************************************** +int SffInfoCommand::printQualSeqData(ofstream& out, seqRead& read, Header& header) { + try { + + if (trim) { + out << ">" << header.name << " length=" << (header.clipQualRight-header.clipQualLeft) << endl; + for (int i = header.clipQualLeft; i < (header.clipQualRight-header.clipQualLeft); i++) { out << read.qualScores[i] << '\t'; } + }else{ + out << ">" << header.name << " length=" << read.qualScores.size() << endl; + for (int i = 0; i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } + } + + out << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printQualSeqData"); + exit(1); + } +} + +//********************************************************************************************************************** +int SffInfoCommand::printFlowSeqData(ofstream& out, seqRead& read, Header& header) { + try { + + out << ">" << header.name << endl; + for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << (read.flowgram[i]/(float)100) << '\t'; } + out << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printFlowSeqData"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::readAccnosFile(string filename) { + try { + //remove old names + seqNames.clear(); + + ifstream in; + openInputFile(filename, in); + string name; + + while(!in.eof()){ + in >> name; gobble(in); + + seqNames.insert(name); + + if (m->control_pressed) { seqNames.clear(); break; } + } + in.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readAccnosFile"); + exit(1); + } +} +//**********************************************************************************************************************/ diff --git a/sffinfocommand.h b/sffinfocommand.h index b533f28..939a1c7 100644 --- a/sffinfocommand.h +++ b/sffinfocommand.h @@ -77,7 +77,7 @@ private: int printCommonHeader(ofstream&, CommonHeader&); int printHeader(ofstream&, Header&); - int printSffTxtSeqData(ofstream&, seqRead&); + int printSffTxtSeqData(ofstream&, seqRead&, Header&); int printFlowSeqData(ofstream&, seqRead&, Header&); int printFastaSeqData(ofstream&, seqRead&, Header&); int printQualSeqData(ofstream&, seqRead&, Header&); diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 5a01249..a3abd59 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -322,11 +322,20 @@ int TrimSeqsCommand::execute(){ openInputFile(fastaFile, inFASTA); getNumSeqs(inFASTA, numSeqs); inFASTA.close(); - qFile.close(); + lines.push_back(new linePair(0, numSeqs)); driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], lines[0]); + for (int j = 0; j < fastaFileNames.size(); j++) { + rename((fastaFileNames[j] + toString(j) + ".temp").c_str(), fastaFileNames[j].c_str()); + } + if(qFileName != ""){ + for (int j = 0; j < qualFileNames.size(); j++) { + rename((qualFileNames[j] + toString(j) + ".temp").c_str(), qualFileNames[j].c_str()); + } + } + if (m->control_pressed) { return 0; } #endif @@ -479,7 +488,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string Sequence currSeq(inFASTA); QualityScores currQual; if(qFileName != ""){ - currQual = QualityScores(qFile); + currQual = QualityScores(qFile, currSeq.getNumBases()); } string origSeq = currSeq.getUnaligned(); -- 2.39.2