From 956cdff34f2d609a7736838b1631cd7957580b8b Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 14 Jul 2010 11:45:40 +0000 Subject: [PATCH] started work on sffinfo command. fixed bug across all paralellized commands if the file position was too big to fit in an int. --- Mothur.xcodeproj/project.pbxproj | 10 +- aligncommand.cpp | 6 +- aligncommand.h | 4 +- alignmentdb.cpp | 2 +- bayesian.cpp | 4 +- bellerophon.cpp | 4 +- bellerophon.h | 4 +- chimera.cpp | 2 +- chimera.h | 6 +- chimeraccodecommand.cpp | 6 +- chimeraccodecommand.h | 4 +- chimeracheckcommand.cpp | 6 +- chimeracheckcommand.h | 4 +- chimerapintailcommand.cpp | 6 +- chimerapintailcommand.h | 4 +- chimeraslayercommand.cpp | 6 +- chimeraslayercommand.h | 4 +- classify.cpp | 4 +- classifyseqscommand.cpp | 14 +- classifyseqscommand.h | 4 +- commandfactory.cpp | 3 + endiannessmacros.h | 189 +++++++++++++ filterseqscommand.cpp | 10 +- filterseqscommand.h | 4 +- makefile | 1 + mothur.cpp | 8 + mothur.h | 18 +- phylosummary.cpp | 3 + phylotree.cpp | 2 +- screenseqscommand.cpp | 13 +- screenseqscommand.h | 4 +- seqsummarycommand.cpp | 10 +- seqsummarycommand.h | 4 +- sffinfocommand.cpp | 469 +++++++++++++++++++++++++++++++ sffinfocommand.h | 84 ++++++ trimseqscommand.cpp | 10 +- trimseqscommand.h | 4 +- 37 files changed, 852 insertions(+), 88 deletions(-) create mode 100644 endiannessmacros.h create mode 100644 sffinfocommand.cpp create mode 100644 sffinfocommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 6770cec..dae9fe3 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -26,6 +26,8 @@ A72B3A63118B37FD004B9F8D /* phylodiversitycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylodiversitycommand.cpp; sourceTree = ""; }; A72B3A7B118B4D1B004B9F8D /* phylodiversity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylodiversity.h; sourceTree = ""; }; A72B3A7C118B4D1B004B9F8D /* phylodiversity.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylodiversity.cpp; sourceTree = ""; }; + A732505E11E49EF100484B90 /* sffinfocommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sffinfocommand.h; sourceTree = ""; }; + A732505F11E49EF100484B90 /* sffinfocommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sffinfocommand.cpp; sourceTree = ""; }; A73953DA11987ED100B0B160 /* chopseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chopseqscommand.h; sourceTree = ""; }; A73953DB11987ED100B0B160 /* chopseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chopseqscommand.cpp; sourceTree = ""; }; A747E79B1163442A00FB9042 /* chimeracheckcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckcommand.h; sourceTree = ""; }; @@ -37,6 +39,7 @@ A76AAD03117F322B003D8DA1 /* phylosummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylosummary.cpp; sourceTree = ""; }; A76C4A1011876BAF0009460B /* setlogfilecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = setlogfilecommand.h; sourceTree = SOURCE_ROOT; }; A76C4A1111876BAF0009460B /* setlogfilecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = setlogfilecommand.cpp; sourceTree = SOURCE_ROOT; }; + A780E6CB11E7745D00BB5718 /* endiannessmacros.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = endiannessmacros.h; sourceTree = ""; }; A78254461164D7790002E2DD /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = ""; }; A78254471164D7790002E2DD /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = ""; }; A7825502116519F70002E2DD /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = ""; }; @@ -480,6 +483,7 @@ A7DA203C113FECD400BF472F /* display.h */, A7DA2042113FECD400BF472F /* dlibshuff.cpp */, A7DA2043113FECD400BF472F /* dlibshuff.h */, + A780E6CB11E7745D00BB5718 /* endiannessmacros.h */, A7DA2048113FECD400BF472F /* engine.cpp */, A7DA2049113FECD400BF472F /* engine.hpp */, A7DA204C113FECD400BF472F /* fileoutput.cpp */, @@ -657,8 +661,6 @@ A7CB593E11402F110010EB83 /* commands */ = { isa = PBXGroup; children = ( - 7EA299BA11E384940022D8D3 /* sensspeccommand.h */, - 7EA299BB11E384940022D8D3 /* sensspeccommand.cpp */, A7DA202B113FECD400BF472F /* command.hpp */, A7DA1FEF113FECD400BF472F /* aligncommand.h */, A7DA1FEE113FECD400BF472F /* aligncommand.cpp */, @@ -782,12 +784,16 @@ A7DA20F8113FECD400BF472F /* screenseqscommand.cpp */, A7DA20FB113FECD400BF472F /* secondarystructurecommand.h */, A7DA20FA113FECD400BF472F /* secondarystructurecommand.cpp */, + 7EA299BA11E384940022D8D3 /* sensspeccommand.h */, + 7EA299BB11E384940022D8D3 /* sensspeccommand.cpp */, A7DA20FD113FECD400BF472F /* seqsummarycommand.h */, A7DA20FC113FECD400BF472F /* seqsummarycommand.cpp */, A7DA2103113FECD400BF472F /* setdircommand.h */, A7DA2102113FECD400BF472F /* setdircommand.cpp */, A76C4A1011876BAF0009460B /* setlogfilecommand.h */, A76C4A1111876BAF0009460B /* setlogfilecommand.cpp */, + A732505E11E49EF100484B90 /* sffinfocommand.h */, + A732505F11E49EF100484B90 /* sffinfocommand.cpp */, A7DA2110113FECD400BF472F /* sharedcommand.h */, A7DA210F113FECD400BF472F /* sharedcommand.cpp */, A71D924511AEB42400D00CBC /* splitabundcommand.h */, diff --git a/aligncommand.cpp b/aligncommand.cpp index 16006f7..8d1b40f 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -386,7 +386,7 @@ int AlignCommand::execute(){ } } else{ - vector positions; + vector positions; processIDS.resize(0); ifstream inFASTA; @@ -396,7 +396,7 @@ int AlignCommand::execute(){ while(!inFASTA.eof()){ input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); @@ -406,7 +406,7 @@ int AlignCommand::execute(){ int numSeqsPerProcessor = numFastaSeqs / processors; for (int i = 0; i < processors; i++) { - long int startPos = positions[ i * numSeqsPerProcessor ]; + unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } diff --git a/aligncommand.h b/aligncommand.h index b100287..186ca63 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -26,9 +26,9 @@ public: private: struct linePair { - int start; + unsigned long int start; int numSeqs; - linePair(long int i, int j) : start(i), numSeqs(j) {} + linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} }; vector processIDS; //processid vector lines; diff --git a/alignmentdb.cpp b/alignmentdb.cpp index 9b2977d..bf46bec 100644 --- a/alignmentdb.cpp +++ b/alignmentdb.cpp @@ -26,7 +26,7 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap #ifdef USE_MPI int pid, processors; - vector positions; + vector positions; MPI_Status status; MPI_File inMPI; diff --git a/bayesian.cpp b/bayesian.cpp index 1726477..655d702 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -368,8 +368,8 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string #ifdef USE_MPI int pid, num, num2, processors; - vector positions; - vector positions2; + vector positions; + vector positions2; MPI_Status status; MPI_File inMPI; diff --git a/bellerophon.cpp b/bellerophon.cpp index aa26d29..743c6d7 100644 --- a/bellerophon.cpp +++ b/bellerophon.cpp @@ -246,7 +246,7 @@ int Bellerophon::getChimeras() { numSeqsPerProcessor = iters / processors; //each process hits this only once - int startPos = pid * numSeqsPerProcessor; + unsigned long int startPos = pid * numSeqsPerProcessor; if(pid == processors - 1){ numSeqsPerProcessor = iters - pid * numSeqsPerProcessor; } @@ -326,7 +326,7 @@ int Bellerophon::getChimeras() { int numSeqsPerProcessor = iters / processors; for (int i = 0; i < processors; i++) { - int startPos = i * numSeqsPerProcessor; + unsigned long int startPos = i * numSeqsPerProcessor; if(i == processors - 1){ numSeqsPerProcessor = iters - i * numSeqsPerProcessor; } diff --git a/bellerophon.h b/bellerophon.h index 1333ec8..41c1019 100644 --- a/bellerophon.h +++ b/bellerophon.h @@ -36,9 +36,9 @@ class Bellerophon : public Chimera { private: struct linePair { - int start; + unsigned long int start; int num; - linePair(long int i, int j) : start(i), num(j) {} + linePair(unsigned long int i, int j) : start(i), num(j) {} }; vector lines; diff --git a/chimera.cpp b/chimera.cpp index 3ab9589..685d6db 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -104,7 +104,7 @@ vector Chimera::readSeqs(string file) { #ifdef USE_MPI int pid, processors; - vector positions; + vector positions; int numSeqs; int tag = 2001; diff --git a/chimera.h b/chimera.h index 11bc435..7120478 100644 --- a/chimera.h +++ b/chimera.h @@ -77,9 +77,9 @@ struct sim { }; struct linePair { - int start; - int end; - linePair(int i, int j) : start(i), end(j) {} + unsigned long int start; + unsigned long int end; + linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {} linePair(){} }; diff --git a/chimeraccodecommand.cpp b/chimeraccodecommand.cpp index 3d3c9ae..578c7d6 100644 --- a/chimeraccodecommand.cpp +++ b/chimeraccodecommand.cpp @@ -332,7 +332,7 @@ int ChimeraCcodeCommand::execute(){ } }else{ - vector positions; + vector positions; processIDS.resize(0); ifstream inFASTA; @@ -342,7 +342,7 @@ int ChimeraCcodeCommand::execute(){ while(!inFASTA.eof()){ input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); @@ -352,7 +352,7 @@ int ChimeraCcodeCommand::execute(){ int numSeqsPerProcessor = numSeqs / processors; for (int i = 0; i < processors; i++) { - long int startPos = positions[ i * numSeqsPerProcessor ]; + unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; } diff --git a/chimeraccodecommand.h b/chimeraccodecommand.h index c3d71c9..c740a2f 100644 --- a/chimeraccodecommand.h +++ b/chimeraccodecommand.h @@ -28,9 +28,9 @@ public: private: struct linePair { - int start; + unsigned long int start; int numSeqs; - linePair(long int i, int j) : start(i), numSeqs(j) {} + linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} }; vector processIDS; //processid vector lines; diff --git a/chimeracheckcommand.cpp b/chimeracheckcommand.cpp index 25f9c0f..098896e 100644 --- a/chimeracheckcommand.cpp +++ b/chimeracheckcommand.cpp @@ -338,7 +338,7 @@ int ChimeraCheckCommand::execute(){ } }else{ - vector positions; + vector positions; processIDS.resize(0); ifstream inFASTA; @@ -348,7 +348,7 @@ int ChimeraCheckCommand::execute(){ while(!inFASTA.eof()){ input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); @@ -358,7 +358,7 @@ int ChimeraCheckCommand::execute(){ int numSeqsPerProcessor = numSeqs / processors; for (int j = 0; j < processors; j++) { - long int startPos = positions[ j * numSeqsPerProcessor ]; + unsigned long int startPos = positions[ j * numSeqsPerProcessor ]; if(j == processors - 1){ numSeqsPerProcessor = numSeqs - j * numSeqsPerProcessor; } diff --git a/chimeracheckcommand.h b/chimeracheckcommand.h index b3b49d3..7c49344 100644 --- a/chimeracheckcommand.h +++ b/chimeracheckcommand.h @@ -28,9 +28,9 @@ public: private: struct linePair { - int start; + unsigned long int start; int numSeqs; - linePair(long int i, int j) : start(i), numSeqs(j) {} + linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} }; vector processIDS; //processid vector lines; diff --git a/chimerapintailcommand.cpp b/chimerapintailcommand.cpp index 6cf581b..8dffd3a 100644 --- a/chimerapintailcommand.cpp +++ b/chimerapintailcommand.cpp @@ -359,7 +359,7 @@ int ChimeraPintailCommand::execute(){ } }else{ - vector positions; + vector positions; processIDS.resize(0); ifstream inFASTA; @@ -369,7 +369,7 @@ int ChimeraPintailCommand::execute(){ while(!inFASTA.eof()){ input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); @@ -379,7 +379,7 @@ int ChimeraPintailCommand::execute(){ int numSeqsPerProcessor = numSeqs / processors; for (int i = 0; i < processors; i++) { - long int startPos = positions[ i * numSeqsPerProcessor ]; + unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; } diff --git a/chimerapintailcommand.h b/chimerapintailcommand.h index 65e67fe..9fc6f43 100644 --- a/chimerapintailcommand.h +++ b/chimerapintailcommand.h @@ -29,9 +29,9 @@ public: private: struct linePair { - int start; + unsigned long int start; int numSeqs; - linePair(long int i, int j) : start(i), numSeqs(j) {} + linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} }; vector processIDS; //processid vector lines; diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index ae98ff3..74fd3aa 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -357,7 +357,7 @@ int ChimeraSlayerCommand::execute(){ } }else{ - vector positions; + vector positions; processIDS.resize(0); ifstream inFASTA; @@ -367,7 +367,7 @@ int ChimeraSlayerCommand::execute(){ while(!inFASTA.eof()){ input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); @@ -377,7 +377,7 @@ int ChimeraSlayerCommand::execute(){ int numSeqsPerProcessor = numSeqs / processors; for (int i = 0; i < processors; i++) { - long int startPos = positions[ i * numSeqsPerProcessor ]; + unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor; } diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h index 7c20fb9..c2d9c45 100644 --- a/chimeraslayercommand.h +++ b/chimeraslayercommand.h @@ -28,9 +28,9 @@ public: private: struct linePair { - int start; + unsigned long int start; int numSeqs; - linePair(long int i, int j) : start(i), numSeqs(j) {} + linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} }; vector processIDS; //processid vector lines; diff --git a/classify.cpp b/classify.cpp index 2eb3ec1..9a2de1f 100644 --- a/classify.cpp +++ b/classify.cpp @@ -28,7 +28,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me m->mothurOut("Generating search database... "); cout.flush(); #ifdef USE_MPI int pid, processors; - vector positions; + vector positions; int tag = 2001; MPI_Status status; @@ -183,7 +183,7 @@ int Classify::readTaxonomy(string file) { #ifdef USE_MPI int pid, num, processors; - vector positions; + vector positions; int tag = 2001; MPI_Status status; diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 6c5d827..2a210b0 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -478,7 +478,7 @@ int ClassifySeqsCommand::execute(){ driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); } else{ - vector positions; + vector positions; processIDS.resize(0); ifstream inFASTA; @@ -488,7 +488,7 @@ int ClassifySeqsCommand::execute(){ while(!inFASTA.eof()){ input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); @@ -496,9 +496,9 @@ int ClassifySeqsCommand::execute(){ numFastaSeqs = positions.size(); int numSeqsPerProcessor = numFastaSeqs / processors; - + for (int i = 0; i < processors; i++) { - int startPos = positions[ i * numSeqsPerProcessor ]; + unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } @@ -761,9 +761,9 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa for(int i=0;inumSeqs;i++){ if (m->control_pressed) { return 0; } - - Sequence* candidateSeq = new Sequence(inFASTA); - + + Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); + if (candidateSeq->getName() != "") { taxonomy = classify->getTaxonomy(candidateSeq); diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 1dc4c1b..3294118 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -35,9 +35,9 @@ public: private: struct linePair { - int start; + unsigned long int start; int numSeqs; - linePair(int i, int j) : start(i), numSeqs(j) {} + linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} }; vector processIDS; //processid vector lines; diff --git a/commandfactory.cpp b/commandfactory.cpp index 725ce8c..fc20ee5 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -83,6 +83,7 @@ #include "degapseqscommand.h" #include "getrelabundcommand.h" #include "sensspeccommand.h" +#include "sffinfocommand.h" /*******************************************************/ @@ -171,6 +172,7 @@ CommandFactory::CommandFactory(){ commands["classify.otu"] = "classify.otu"; commands["degap.seqs"] = "degap.seqs"; commands["get.relabund"] = "get.relabund"; + commands["sffinfo"] = "sffinfo"; commands["classify.seqs"] = "MPIEnabled"; commands["dist.seqs"] = "MPIEnabled"; commands["filter.seqs"] = "MPIEnabled"; @@ -300,6 +302,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "degap.seqs") { command = new DegapSeqsCommand(optionString); } else if(commandName == "get.relabund") { command = new GetRelAbundCommand(optionString); } else if(commandName == "sens.spec") { command = new SensSpecCommand(optionString); } + else if(commandName == "sffinfo") { command = new SffInfoCommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/endiannessmacros.h b/endiannessmacros.h new file mode 100644 index 0000000..07981ba --- /dev/null +++ b/endiannessmacros.h @@ -0,0 +1,189 @@ +#ifndef EDIANNESSMACROS_H +#define EDIANNESSMACROS_H + +/* + * endiannessmacros.h + * Mothur + * + * Created by westcott on 7/9/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +/*********************************************************************/ +// The following is copied from the staden io_lib-1.12.4 os.h +/* + * Author: + * MRC Laboratory of Molecular Biology + * Hills Road + * Cambridge CB2 2QH + * United Kingdom + * + * Description: operating system specific type definitions + * + */ + +#include +#include + +/*----------------------------------------------------------------------------- + * Detection of endianness. The main part of this is done in autoconf, but + * for the case of MacOS FAT binaries we fall back on auto-sensing based on + * processor type too. + */ + +/* Set by autoconf */ +#define SP_LITTLE_ENDIAN + +/* Mac FAT binaries or unknown. Auto detect based on CPU type */ +#if !defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN) + +/* + * x86 equivalents + */ +#if defined(__i386__) || defined(__i386) || defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__i686__) || defined(__i686) +# if defined(SP_BIG_ENDIAN) +# undef SP_BIG_ENDIAN +# endif +# define SP_LITTLE_ENDIAN +#endif + +/* + * DEC Alpha + */ +#if defined(__alpha__) || defined(__alpha) +# if defined(SP_LITTLE_ENDIAN) +# undef SP_LITTLE_ENDIAN +# endif +# define SP_BIG_ENDIAN +#endif + +/* + * SUN Sparc + */ +#if defined(__sparc__) || defined(__sparc) +# if defined(SP_LITTLE_ENDIAN) +# undef SP_LITTLE_ENDIAN +# endif +# define SP_BIG_ENDIAN +#endif + +/* + * PowerPC + */ +#if defined(__ppc__) || defined(__ppc) +# if defined(SP_LITTLE_ENDIAN) +# undef SP_LITTLE_ENDIAN +# endif +# define SP_BIG_ENDIAN +#endif + +/* Some catch-alls */ +#if defined(__LITTLE_ENDIAN__) || defined(__LITTLEENDIAN__) +# define SP_LITTLE_ENDIAN +#endif + +#if defined(__BIG_ENDIAN__) || defined(__BIGENDIAN__) +# define SP_BIG_ENDIAN +#endif + +#if defined(SP_BIG_ENDIAN) && defined(SP_LITTLE_ENDIAN) +# error Both BIG and LITTLE endian defined. Fix os.h and/or Makefile +#endif + +#if !defined(SP_BIG_ENDIAN) && !defined(SP_LITTLE_ENDIAN) +# error Neither BIG nor LITTLE endian defined. Fix os.h and/or Makefile +#endif + +#endif + +/*----------------------------------------------------------------------------- + * Byte swapping macros + */ + +/* + * Our new swap runs at the same speed on Ultrix, but substantially faster + * (300% for swap_int4, ~50% for swap_int2) on an Alpha (due to the lack of + * decent 'char' support). + * + * They also have the ability to swap in situ (src == dst). Newer code now + * relies on this so don't change back! + */ +#define iswap_int8(x) \ + (((x & 0x00000000000000ffLL) << 56) + \ + ((x & 0x000000000000ff00LL) << 40) + \ + ((x & 0x0000000000ff0000LL) << 24) + \ + ((x & 0x00000000ff000000LL) << 8) + \ + ((x & 0x000000ff00000000LL) >> 8) + \ + ((x & 0x0000ff0000000000LL) >> 24) + \ + ((x & 0x00ff000000000000LL) >> 40) + \ + ((x & 0xff00000000000000LL) >> 56)) + +#define iswap_int4(x) \ + (((x & 0x000000ff) << 24) + \ + ((x & 0x0000ff00) << 8) + \ + ((x & 0x00ff0000) >> 8) + \ + ((x & 0xff000000) >> 24)) + +#define iswap_int2(x) \ + (((x & 0x00ff) << 8) + \ + ((x & 0xff00) >> 8)) + +#define swap_int8(src, dst) ((dst) = iswap_int8(src)) +#define swap_int4(src, dst) ((dst) = iswap_int4(src)) +#define swap_int2(src, dst) ((dst) = iswap_int2(src)) + + +/* + * Linux systems may use byteswap.h to get assembly versions of byte-swap + * on intel systems. This can be as trivial as the bswap opcode, which works + * out at over 2-times faster than iswap_int4 above. + */ +#if 0 +#if defined(__linux__) +# include +# undef iswap_int8 +# undef iswap_int4 +# undef iswap_int2 +# define iswap_int8 bswap_64 +# define iswap_int4 bswap_32 +# define iswap_int2 bswap_16 +#endif +#endif + + +/* + * Macros to specify that data read in is of a particular endianness. + * The macros here swap to the appropriate order for the particular machine + * running the macro and return the new answer. These may also be used when + * writing to a file to specify that we wish to write in (eg) big endian + * format. + * + * This leads to efficient code as most of the time these macros are + * trivial. + */ +#ifdef SP_BIG_ENDIAN +#define be_int8(x) (x) +#define be_int4(x) (x) +#define be_int2(x) (x) +#define be_int1(x) (x) + +#define le_int8(x) iswap_int8((x)) +#define le_int4(x) iswap_int4((x)) +#define le_int2(x) iswap_int2((x)) +#define le_int1(x) (x) +#endif + +#ifdef SP_LITTLE_ENDIAN +#define be_int8(x) iswap_int8((x)) +#define be_int4(x) iswap_int4((x)) +#define be_int2(x) iswap_int2((x)) +#define be_int1(x) (x) + +#define le_int8(x) (x) +#define le_int4(x) (x) +#define le_int2(x) (x) +#define le_int1(x) (x) +#endif + +#endif \ No newline at end of file diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index 33790fe..fada19a 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -872,7 +872,7 @@ int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) int FilterSeqsCommand::setLines(string filename) { try { - vector positions; + vector positions; bufferSizes.clear(); ifstream inFASTA; @@ -883,7 +883,7 @@ int FilterSeqsCommand::setLines(string filename) { input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); @@ -891,7 +891,7 @@ int FilterSeqsCommand::setLines(string filename) { int numFastaSeqs = positions.size(); FILE * pFile; - long size; + unsigned long int size; //get num bytes in file pFile = fopen (filename.c_str(),"rb"); @@ -908,12 +908,12 @@ int FilterSeqsCommand::setLines(string filename) { for (int i = 0; i < processors; i++) { - long int startPos = positions[ i * numSeqsPerProcessor ]; + unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; bufferSizes.push_back(size - startPos); }else{ - long int myEnd = positions[ (i+1) * numSeqsPerProcessor ]; + unsigned long int myEnd = positions[ (i+1) * numSeqsPerProcessor ]; bufferSizes.push_back(myEnd-startPos); } lines.push_back(new linePair(startPos, numSeqsPerProcessor)); diff --git a/filterseqscommand.h b/filterseqscommand.h index 848d08a..2ec152b 100644 --- a/filterseqscommand.h +++ b/filterseqscommand.h @@ -24,9 +24,9 @@ public: private: struct linePair { - int start; + unsigned long int start; int num; - linePair(long int i, long int j) : start(i), num(j) {} + linePair(unsigned long int i, long int j) : start(i), num(j) {} }; vector lines; vector processIDS; diff --git a/makefile b/makefile index 33cce4d..efac3fb 100644 --- a/makefile +++ b/makefile @@ -23,6 +23,7 @@ endif ifeq ($(strip $(64BIT_VERSION)),yes) TARGET_ARCH += -arch x86_64 + CXXFLAGS += -DBIT_VERSION endif # if you do not want to use the readline library, set this to no. diff --git a/mothur.cpp b/mothur.cpp index d5f8387..1e5b41b 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -72,6 +72,14 @@ int main(int argc, char *argv[]){ m->mothurOutEndLine(); m->mothurOutEndLine(); #endif + #ifdef BIT_VERSION + m->mothurOutJustToLog("Running 64Bit Version"); + m->mothurOutEndLine(); m->mothurOutEndLine(); + #else + m->mothurOutJustToLog("Running 32Bit Version"); + m->mothurOutEndLine(); m->mothurOutEndLine(); + #endif + //header m->mothurOut("mothur v.1.11.0"); m->mothurOutEndLine(); diff --git a/mothur.h b/mothur.h index 0a73f1c..1434482 100644 --- a/mothur.h +++ b/mothur.h @@ -992,9 +992,9 @@ inline string sortFile(string distFile, string outputDir){ } } /**************************************************************************************************/ -inline vector setFilePosFasta(string filename, int& num) { +inline vector setFilePosFasta(string filename, int& num) { - vector positions; + vector positions; ifstream inFASTA; openInputFile(filename, inFASTA); @@ -1002,7 +1002,7 @@ inline vector setFilePosFasta(string filename, int& num) { while(!inFASTA.eof()){ input = getline(inFASTA); gobble(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ long pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); @@ -1021,7 +1021,7 @@ inline vector setFilePosFasta(string filename, int& num) { fclose (pFile); }*/ - long size = positions[(positions.size()-1)]; + unsigned long int size = positions[(positions.size()-1)]; ifstream in; openInputFile(filename, in); @@ -1038,18 +1038,18 @@ inline vector setFilePosFasta(string filename, int& num) { return positions; } /**************************************************************************************************/ -inline vector setFilePosEachLine(string filename, int& num) { +inline vector setFilePosEachLine(string filename, int& num) { - vector positions; + vector positions; ifstream in; openInputFile(filename, in); string input; while(!in.eof()){ - long lastpos = in.tellg(); + unsigned long int lastpos = in.tellg(); input = getline(in); gobble(in); if (input.length() != 0) { - long pos = in.tellg(); + unsigned long int pos = in.tellg(); if (pos != -1) { positions.push_back(pos - input.length() - 1); } else { positions.push_back(lastpos); } } @@ -1059,7 +1059,7 @@ inline vector setFilePosEachLine(string filename, int& num) { num = positions.size(); FILE * pFile; - long size; + unsigned long int size; //get num bytes in file pFile = fopen (filename.c_str(),"rb"); diff --git a/phylosummary.cpp b/phylosummary.cpp index 84b0861..004c131 100644 --- a/phylosummary.cpp +++ b/phylosummary.cpp @@ -172,6 +172,9 @@ void PhyloSummary::print(ofstream& out){ //print labels out << "taxlevel\t rankID\t taxon\t daughterlevels\t total\t"; if (groupmap != NULL) { + //so the labels match the counts below, since the map sorts them automatically... + sort(groupmap->namesOfGroups.begin(), groupmap->namesOfGroups.end()); + for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { out << groupmap->namesOfGroups[i] << '\t'; } diff --git a/phylotree.cpp b/phylotree.cpp index d6c740a..d085f6c 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -125,7 +125,7 @@ PhyloTree::PhyloTree(string tfile){ #ifdef USE_MPI int pid, num, processors; - vector positions; + vector positions; MPI_Status status; MPI_File inMPI; diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 66e0cb0..09b82ce 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -305,7 +305,7 @@ int ScreenSeqsCommand::execute(){ if (m->control_pressed) { remove(goodSeqFile.c_str()); remove(badSeqFile.c_str()); return 0; } }else{ - vector positions; + vector positions; processIDS.resize(0); ifstream inFASTA; @@ -315,21 +315,22 @@ int ScreenSeqsCommand::execute(){ while(!inFASTA.eof()){ input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); numFastaSeqs = positions.size(); - + int numSeqsPerProcessor = numFastaSeqs / processors; - + for (int i = 0; i < processors; i++) { - long int startPos = positions[ i * numSeqsPerProcessor ]; + unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } createProcesses(goodSeqFile, badSeqFile, badAccnosFile, fastafile, badSeqNames); @@ -708,7 +709,7 @@ int ScreenSeqsCommand::driver(linePair* line, string goodFName, string badFName, inFASTA.seekg(line->start); for(int i=0;inumSeqs;i++){ - + if (m->control_pressed) { return 0; } Sequence currSeq(inFASTA); diff --git a/screenseqscommand.h b/screenseqscommand.h index f1b7205..314e661 100644 --- a/screenseqscommand.h +++ b/screenseqscommand.h @@ -23,9 +23,9 @@ public: private: struct linePair { - int start; + unsigned long int start; int numSeqs; - linePair(long int i, int j) : start(i), numSeqs(j) {} + linePair(unsigned long int i, int j) : start(i), numSeqs(j) {} }; vector processIDS; //processid vector lines; diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index 7226f9d..6640539 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -453,7 +453,7 @@ int SeqSummaryCommand::createProcessesCreateSummary(vector& startPosition, int SeqSummaryCommand::setLines(string filename) { try { - vector positions; + vector positions; ifstream inFASTA; openInputFile(filename, inFASTA); @@ -463,7 +463,7 @@ int SeqSummaryCommand::setLines(string filename) { input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); @@ -471,7 +471,7 @@ int SeqSummaryCommand::setLines(string filename) { int numFastaSeqs = positions.size(); FILE * pFile; - long size; + unsigned long int size; //get num bytes in file pFile = fopen (filename.c_str(),"rb"); @@ -486,11 +486,11 @@ int SeqSummaryCommand::setLines(string filename) { for (int i = 0; i < processors; i++) { - long int startPos = positions[ i * numSeqsPerProcessor ]; + unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }else{ - long int myEnd = positions[ (i+1) * numSeqsPerProcessor ]; + unsigned long int myEnd = positions[ (i+1) * numSeqsPerProcessor ]; } lines.push_back(new linePair(startPos, numSeqsPerProcessor)); } diff --git a/seqsummarycommand.h b/seqsummarycommand.h index 58fbc0a..3e576a0 100644 --- a/seqsummarycommand.h +++ b/seqsummarycommand.h @@ -26,9 +26,9 @@ private: int processors; struct linePair { - int start; + unsigned long int start; int num; - linePair(long int i, long int j) : start(i), num(j) {} + linePair(unsigned long int i, long int j) : start(i), num(j) {} }; vector lines; vector processIDS; diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp new file mode 100644 index 0000000..2d6af87 --- /dev/null +++ b/sffinfocommand.cpp @@ -0,0 +1,469 @@ +/* + * sffinfocommand.cpp + * Mothur + * + * Created by westcott on 7/7/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "sffinfocommand.h" +#include "endiannessmacros.h" + +//********************************************************************************************************************** + +SffInfoCommand::SffInfoCommand(string option) { + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"sff","outputdir","inputdir", "outputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } + + sffFilename = validParameter.validFile(parameters, "sff", false); + if (sffFilename == "not found") { m->mothurOut("sff is a required parameter for the sffinfo command."); m->mothurOutEndLine(); abort = true; } + else { + splitAtDash(sffFilename, filenames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < filenames.size(); i++) { + if (inputDir != "") { + string path = hasPath(filenames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { filenames[i] = inputDir + filenames[i]; } + } + + ifstream in; + int ableToOpen = openInputFile(filenames[i], in); + in.close(); + + if (ableToOpen == 1) { + m->mothurOut(filenames[i] + " will be disregarded."); m->mothurOutEndLine(); + //erase from file list + filenames.erase(filenames.begin()+i); + i--; + } + } + + //make sure there is at least one valid file left + if (filenames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } + } + } + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "SffInfoCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void SffInfoCommand::help(){ + try { + m->mothurOut("The sffinfo command reads a sff file and outputs a .sff.txt file.\n"); + + m->mothurOut("Example sffinfo(sff=...).\n"); + m->mothurOut("Note: No spaces between parameter labels (i.e. sff), '=' and parameters (i.e.yourSffFileName).\n\n"); + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "help"); + exit(1); + } +} +//********************************************************************************************************************** + +SffInfoCommand::~SffInfoCommand(){} + +//********************************************************************************************************************** +int SffInfoCommand::execute(){ + try { + + if (abort == true) { return 0; } + + for (int s = 0; s < filenames.size(); s++) { + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + m->mothurOut("Extracting info from " + filenames[s] + " ..." ); m->mothurOutEndLine(); + + if (outputDir == "") { outputDir += hasPath(filenames[s]); } + string outputFileName = outputDir + getRootName(getSimpleName(filenames[s])) + "sff.txt"; + + extractSffInfo(filenames[s], outputFileName); + + outputNames.push_back(outputFileName); + + m->mothurOut("Done."); m->mothurOutEndLine(); + } + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + + //report output filenames + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::extractSffInfo(string input, string output){ + try { + ofstream out; + openOutputFile(output, out); + + ifstream in; + in.open(input.c_str(), ios::binary); + + CommonHeader* header = readCommonHeader(in); + + //cout << "magic = " << header->magicNumber << endl << "version = " << header->version << endl << "index offset = " << header->indexOffset << endl << "index length = "<< header->indexLength << endl << "numreads = " << header->numReads << endl << "header length = " << header->headerLength << endl << "key length = " << header->keyLength << endl; +//cout << "numflowreads = "<< header->numFlowsPerRead << endl << "flow format code = "<< header->flogramFormatCode << endl << "flow chars = " << header->flowChars << endl << "key sequence = " << header->keySequence << endl << endl; + cout << in.tellg() << endl; + //read through the sff file + while (!in.eof()) { + + //read header + Header* readheader = readHeader(in); + + //read data + seqRead* read = readSeqData(in, header->numFlowsPerRead, readheader->numBases); + + cout << in.tellg() << endl; + + //print common header + printCommonHeader(out, header, true); + + //print header + printHeader(out, readheader, true); + + //print data + printSeqData(out, read, true); + + } + + + in.close(); + out.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "extractSffInfo"); + exit(1); + } +} +//********************************************************************************************************************** +CommonHeader* SffInfoCommand::readCommonHeader(ifstream& in){ + try { + CommonHeader* header = new CommonHeader(); + + if (!in.eof()) { + string tempBuf = ""; + + //read magic number + char* buffer = new char(sizeof(header->magicNumber)); + in.read(buffer, sizeof(header->magicNumber)); + header->magicNumber = be_int4(*(uint32_t *)(buffer)); + delete[] buffer; + + //read version + header->version = new char(4); + in.read(header->version, 4); + tempBuf = buffer; + if (tempBuf.length() > 4) { tempBuf = tempBuf.substr(0, 4); strcpy(header->version, tempBuf.c_str()); } + + //read offset + buffer = new char(sizeof(header->indexOffset)); + in.read(buffer, sizeof(header->indexOffset)); + header->indexOffset = be_int8(*(uint64_t *)(buffer)); + delete[] buffer; + + //read index length + buffer = new char(sizeof(header->indexLength)); + in.read(buffer, sizeof(header->indexLength)); + header->indexLength = be_int4(*(uint32_t *)(buffer)); + delete[] buffer; + + //read num reads + buffer = new char(sizeof(header->numReads)); + in.read(buffer, sizeof(header->numReads)); + header->numReads = be_int4(*(uint32_t *)(buffer)); + delete[] buffer; + + //read header length + buffer = new char(sizeof(header->headerLength)); + in.read(buffer, sizeof(header->headerLength)); + header->headerLength = be_int2(*(uint16_t *)(buffer)); + delete[] buffer; + + //read key length + buffer = new char(sizeof(header->keyLength)); + in.read(buffer, sizeof(header->keyLength)); + header->keyLength = be_int2(*(uint16_t *)(buffer)); + delete[] buffer; + + //read number of flow reads + buffer = new char(sizeof(header->numFlowsPerRead)); + in.read(buffer, sizeof(header->numFlowsPerRead)); + header->numFlowsPerRead = be_int2(*(uint16_t *)(buffer)); + delete[] buffer; + + //read format code + buffer = new char(sizeof(header->flogramFormatCode)); + in.read(buffer, sizeof(header->flogramFormatCode)); + header->flogramFormatCode = be_int1(*(uint8_t *)(buffer)); + delete[] buffer; + + //read flow chars + header->flowChars = new char(header->numFlowsPerRead); + in.read(header->flowChars, header->numFlowsPerRead); + tempBuf = buffer; + if (tempBuf.length() > header->numFlowsPerRead) { tempBuf = tempBuf.substr(0, header->numFlowsPerRead); strcpy(header->flowChars, tempBuf.c_str()); } + + //read key + header->keySequence = new char(header->keyLength); + in.read(header->keySequence, header->keyLength); + tempBuf = header->keySequence; + if (tempBuf.length() > header->keyLength) { tempBuf = tempBuf.substr(0, header->keyLength); strcpy(header->keySequence, tempBuf.c_str()); } + + }else{ + m->mothurOut("Error reading sff common header."); m->mothurOutEndLine(); + } + + return header; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readCommonHeader"); + exit(1); + } +} +//********************************************************************************************************************** +Header* SffInfoCommand::readHeader(ifstream& in){ + try { + Header* header = new Header(); + + if (!in.eof()) { + string tempBuf = ""; + + //read header length + char* buffer = new char(sizeof(header->headerLength)); + in.read(buffer, sizeof(header->headerLength)); + header->headerLength = be_int2(*(unsigned short *)(buffer)); + delete[] buffer; + + //read name length + buffer = new char(sizeof(header->nameLength)); + in.read(buffer, sizeof(header->nameLength)); + header->nameLength = be_int2(*(unsigned short *)(buffer)); + delete[] buffer; + + //read num bases + buffer = new char(sizeof(header->numBases)); + in.read(buffer, sizeof(header->numBases)); + header->numBases = be_int4(*(unsigned int *)(buffer)); + delete[] buffer; + + //read clip qual left + buffer = new char(sizeof(header->clipQualLeft)); + in.read(buffer, sizeof(header->clipQualLeft)); + header->clipQualLeft = be_int2(*(unsigned short *)(buffer)); + delete[] buffer; + + //read clip qual right + buffer = new char(sizeof(header->clipQualRight)); + in.read(buffer, sizeof(header->clipQualRight)); + header->clipQualRight = be_int2(*(unsigned short *)(buffer)); + delete[] buffer; + + //read clipAdapterLeft + buffer = new char(sizeof(header->clipAdapterLeft)); + in.read(buffer, sizeof(header->clipAdapterLeft)); + header->clipAdapterLeft = be_int2(*(unsigned short *)(buffer)); + delete[] buffer; + + //read clipAdapterRight + buffer = new char(sizeof(header->clipAdapterRight)); + in.read(buffer, sizeof(header->clipAdapterRight)); + header->clipAdapterRight = be_int2(*(unsigned short *)(buffer)); + delete[] buffer; + + //read name + header->name = new char(header->nameLength); + in.read(header->name, header->nameLength); + tempBuf = header->name; + if (tempBuf.length() > header->nameLength) { tempBuf = tempBuf.substr(0, header->nameLength); strcpy(header->name, tempBuf.c_str()); } + + }else{ + m->mothurOut("Error reading sff header info."); m->mothurOutEndLine(); + } + + return header; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readHeader"); + exit(1); + } +} +//********************************************************************************************************************** +seqRead* SffInfoCommand::readSeqData(ifstream& in, int numFlowReads, int numBases){ + try { + seqRead* read = new seqRead(); + + if (!in.eof()) { + + string tempBuf = ""; + char* buffer; + + //read flowgram + read->flowgram.resize(numFlowReads); + for (int i = 0; i < numFlowReads; i++) { + buffer = new char((sizeof(unsigned short))); + in.read(buffer, (sizeof(unsigned short))); + read->flowgram[i] = be_int2(*(unsigned short *)(buffer)); + delete[] buffer; + } + + //read flowgram + read->flowIndex.resize(numBases); + for (int i = 0; i < numBases; i++) { + buffer = new char(1); + in.read(buffer, 1); + read->flowgram[i] = be_int1(*(unsigned int *)(buffer)); + delete[] buffer; + } + + //read bases + read->bases = new char(numBases); + in.read(read->bases, numBases); + tempBuf = buffer; + if (tempBuf.length() > numBases) { tempBuf = tempBuf.substr(0, numBases); strcpy(read->bases, tempBuf.c_str()); } + + + //read flowgram + read->qualScores.resize(numBases); + for (int i = 0; i < numBases; i++) { + buffer = new char(1); + in.read(buffer, 1); + read->qualScores[i] = be_int1(*(unsigned int *)(buffer)); + delete[] buffer; + } + + /* Pad to 8 chars */ + int spotInFile = in.tellg(); + cout << spotInFile << endl; + int spot = floor((spotInFile + 7) /(float) 8) * 8; + cout << spot << endl; + in.seekg(spot); + + }else{ + m->mothurOut("Error reading."); m->mothurOutEndLine(); + } + + return read; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readSeqData"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::printCommonHeader(ofstream& out, CommonHeader* header, bool debug) { + try { + + string output = "Common Header:\nMagic Number: "; + output += toString(header->magicNumber) + '\n'; + output += "Version: " + toString(header->version) + '\n'; + output += "Index Offset: " + toString(header->indexOffset) + '\n'; + output += "Index Length: " + toString(header->indexLength) + '\n'; + output += "Number of Reads: " + toString(header->numReads) + '\n'; + output += "Header Length: " + toString(header->headerLength) + '\n'; + output += "Key Length: " + toString(header->keyLength) + '\n'; + output += "Number of Flows: " + toString(header->numFlowsPerRead) + '\n'; + output += "Format Code: " + toString(header->flogramFormatCode) + '\n'; + output += "Flow Chars: " + toString(header->flowChars) + '\n'; + output += "Key Sequence: " + toString(header->keySequence) + '\n'; + + out << output << endl; + + if (debug) { cout << output << endl; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printCommonHeader"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::printHeader(ofstream& out, Header* header, bool debug) { + try { + string name = header->name; + string output = ">" + name + "\nRead Header Length: " + toString(header->headerLength) + '\n'; + output += "Name Length: " + toString(header->nameLength) + '\n'; + output += "Number of Bases: " + toString(header->numBases) + '\n'; + output += "Clip Qual Left: " + toString(header->clipQualLeft) + '\n'; + output += "Clip Qual Right: " + toString(header->clipQualLeft) + '\n'; + output += "Clip Adap Left: " + toString(header->clipQualLeft) + '\n'; + output += "Clip Adap Right: " + toString(header->clipQualLeft) + '\n'; + + out << output << endl; + + if (debug) { cout << output << endl; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printHeader"); + exit(1); + } +} + +//********************************************************************************************************************** +int SffInfoCommand::printSeqData(ofstream& out, seqRead* read, bool debug) { + try { + + string output = "FlowGram: "; + for (int i = 0; i < read->flowgram.size(); i++) { output += toString(read->flowgram[i]) +'\t'; } + output += "\nFlow Indexes: "; + for (int i = 0; i < read->flowIndex.size(); i++) { output += toString(read->flowIndex[i]) +'\t'; } + string bases = read->bases; + output += "\nBases: " + bases + '\n'; + for (int i = 0; i < read->qualScores.size(); i++) { output += toString(read->qualScores[i]) +'\t'; } + output += '\n'; + + out << output << endl; + + if (debug) { cout << output << endl; } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printSeqData"); + exit(1); + } +} +//**********************************************************************************************************************/ diff --git a/sffinfocommand.h b/sffinfocommand.h new file mode 100644 index 0000000..2385d58 --- /dev/null +++ b/sffinfocommand.h @@ -0,0 +1,84 @@ +#ifndef SFFINFOCOMMAND_H +#define SFFINFOCOMMAND_H + +/* + * sffinfocommand.h + * Mothur + * + * Created by westcott on 7/7/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" + +#define SFF_MAGIC 0x2e736666 /* ".sff" */ +#define SFF_VERSION "\0\0\0\1" + +/**********************************************************/ +struct CommonHeader { + uint32_t magicNumber; + char* version; + uint64_t indexOffset; + uint32_t indexLength; + uint32_t numReads; + uint16_t headerLength; + uint16_t keyLength; + uint16_t numFlowsPerRead; + uint8_t flogramFormatCode; + char* flowChars; //length depends on number flow reads + char* keySequence; //length depends on key length + + CommonHeader() { magicNumber=0; indexOffset=0; indexLength=0; numReads=0; headerLength=0; keyLength=0; numFlowsPerRead=0; flogramFormatCode='s'; } +}; +/**********************************************************/ +struct Header { + unsigned short headerLength; + unsigned short nameLength; + unsigned int numBases; + unsigned short clipQualLeft; + unsigned short clipQualRight; + unsigned short clipAdapterLeft; + unsigned short clipAdapterRight; + char* name; //length depends on nameLength + + Header() { headerLength=0; nameLength=0; numBases=0; clipQualLeft=0; clipQualRight=0; clipAdapterLeft=0; clipAdapterRight=0; } +}; +/**********************************************************/ +struct seqRead { + vector flowgram; + vector flowIndex; + char* bases; + vector qualScores; +}; +/**********************************************************/ + +class SffInfoCommand : public Command { + +public: + SffInfoCommand(string); + ~SffInfoCommand(); + int execute(); + void help(); + +private: + string sffFilename, outputDir; + vector filenames, outputNames; + bool abort; + + int extractSffInfo(string, string); + CommonHeader* readCommonHeader(ifstream&); + Header* readHeader(ifstream&); + seqRead* readSeqData(ifstream&, int, int); + + int printCommonHeader(ofstream&, CommonHeader*, bool); //bool is debug mode + int printHeader(ofstream&, Header*, bool); + int printSeqData(ofstream&, seqRead*, bool); + +}; + +/**********************************************************/ + +#endif + + diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index cbf5e13..af3bfc8 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -519,7 +519,7 @@ int TrimSeqsCommand::setLines(string filename, vector& lines) { lines.clear(); - vector positions; + vector positions; ifstream inFASTA; openInputFile(filename, inFASTA); @@ -529,7 +529,7 @@ int TrimSeqsCommand::setLines(string filename, vector& lines) { input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); @@ -537,7 +537,7 @@ int TrimSeqsCommand::setLines(string filename, vector& lines) { int numFastaSeqs = positions.size(); FILE * pFile; - long size; + unsigned long int size; //get num bytes in file pFile = fopen (filename.c_str(),"rb"); @@ -552,11 +552,11 @@ int TrimSeqsCommand::setLines(string filename, vector& lines) { for (int i = 0; i < processors; i++) { - long int startPos = positions[ i * numSeqsPerProcessor ]; + unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }else{ - long int myEnd = positions[ (i+1) * numSeqsPerProcessor ]; + unsigned long int myEnd = positions[ (i+1) * numSeqsPerProcessor ]; } lines.push_back(new linePair(startPos, numSeqsPerProcessor)); } diff --git a/trimseqscommand.h b/trimseqscommand.h index fefe534..ab692d3 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -24,9 +24,9 @@ public: private: struct linePair { - int start; + unsigned long int start; int num; - linePair(long int i, int j) : start(i), num(j) {} + linePair(unsigned long int i, int j) : start(i), num(j) {} }; void getOligos(vector&); -- 2.39.2