working on windows paralellization, added trimOligos class to be used by trim.flows...

author westcott <westcott>

Tue, 13 Sep 2011 10:08:39 +0000 (10:08 +0000)

committer westcott <westcott>

Tue, 13 Sep 2011 10:08:39 +0000 (10:08 +0000)
author westcott <westcott>
Tue, 13 Sep 2011 10:08:39 +0000 (10:08 +0000)
committer westcott <westcott>
Tue, 13 Sep 2011 10:08:39 +0000 (10:08 +0000)
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index c9681af0c64348e67d4fa25375ee6205e330520e..3ccc0b91d1f48140d421358f3a2983c13b3cb9aa 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -317,11 +317,13 @@
                 A7E9B98D12D37EC400DA6239 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87C12D37EC400DA6239 /* weighted.cpp */; };
                 A7E9B98E12D37EC400DA6239 /* weightedlinkage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */; };
                 A7E9B98F12D37EC400DA6239 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87F12D37EC400DA6239 /* whittaker.cpp */; };
+               A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */; };
                 A7FA10021302E097003860FE /* mantelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FA10011302E096003860FE /* mantelcommand.cpp */; };
                 A7FC480E12D788F20055BC5C /* linearalgebra.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FC480D12D788F20055BC5C /* linearalgebra.cpp */; };
                 A7FC486712D795D60055BC5C /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FC486612D795D60055BC5C /* pcacommand.cpp */; };
                 A7FE7C401330EA1000F7B327 /* getcurrentcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FE7C3F1330EA1000F7B327 /* getcurrentcommand.cpp */; };
                 A7FE7E6D13311EA400F7B327 /* setcurrentcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FE7E6C13311EA400F7B327 /* setcurrentcommand.cpp */; };
+               A7FF19F2140FFDA500AD216D /* trimoligos.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FF19F1140FFDA500AD216D /* trimoligos.cpp */; };
  /* End PBXBuildFile section */
  
  /* Begin PBXCopyFilesBuildPhase section */
@@ -985,6 +987,8 @@
                 A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = weightedlinkage.cpp; sourceTree = SOURCE_ROOT; };
                 A7E9B87F12D37EC400DA6239 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = "<group>"; };
                 A7E9B88012D37EC400DA6239 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = "<group>"; };
+               A7F9F5CD141A5E500032F693 /* sequenceparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequenceparser.h; sourceTree = "<group>"; };
+               A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequenceparser.cpp; sourceTree = "<group>"; };
                 A7FA10001302E096003860FE /* mantelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mantelcommand.h; sourceTree = "<group>"; };
                 A7FA10011302E096003860FE /* mantelcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mantelcommand.cpp; sourceTree = "<group>"; };
                 A7FC480C12D788F20055BC5C /* linearalgebra.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = linearalgebra.h; sourceTree = "<group>"; };
@@ -995,6 +999,8 @@
                 A7FE7C3F1330EA1000F7B327 /* getcurrentcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcurrentcommand.cpp; sourceTree = "<group>"; };
                 A7FE7E6B13311EA400F7B327 /* setcurrentcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = setcurrentcommand.h; sourceTree = "<group>"; };
                 A7FE7E6C13311EA400F7B327 /* setcurrentcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = setcurrentcommand.cpp; sourceTree = "<group>"; };
+               A7FF19F0140FFDA500AD216D /* trimoligos.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trimoligos.h; sourceTree = "<group>"; };
+               A7FF19F1140FFDA500AD216D /* trimoligos.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trimoligos.cpp; sourceTree = "<group>"; };
                 C6A0FF2C0290799A04C91782 /* mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = mothur.1; sourceTree = "<group>"; };
  /* End PBXFileReference section */
  
@@ -1103,6 +1109,8 @@
                                 A7E9B82D12D37EC400DA6239 /* singlelinkage.cpp */,
                                 A7E9B83012D37EC400DA6239 /* slibshuff.cpp */,
                                 A7E9B83112D37EC400DA6239 /* slibshuff.h */,
+                               A7FF19F0140FFDA500AD216D /* trimoligos.h */,
+                               A7FF19F1140FFDA500AD216D /* trimoligos.cpp */,
                                 A7E9B87412D37EC400DA6239 /* validcalculator.cpp */,
                                 A7E9B87512D37EC400DA6239 /* validcalculator.h */,
                                 A7E9B87612D37EC400DA6239 /* validparameter.cpp */,
@@ -1382,9 +1390,9 @@
                                 A7E9B7E412D37EC400DA6239 /* sffinfocommand.h */,
                                 A7E9B7E312D37EC400DA6239 /* sffinfocommand.cpp */,
                                 A7E9B7F312D37EC400DA6239 /* sharedcommand.h */,
-                               A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */,
                                 A7E9B82812D37EC400DA6239 /* shhhercommand.h */,
                                 A7E9B82712D37EC400DA6239 /* shhhercommand.cpp */,
+                               A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */,
                                 A7E9B84012D37EC400DA6239 /* splitabundcommand.h */,
                                 A7E9B83F12D37EC400DA6239 /* splitabundcommand.cpp */,
                                 A7E9B84212D37EC400DA6239 /* splitgroupscommand.h */,
@@ -1625,6 +1633,8 @@
                                 A7E9B7DC12D37EC400DA6239 /* sequence.hpp */,
                                 A7E9B7DD12D37EC400DA6239 /* sequencedb.cpp */,
                                 A7E9B7DE12D37EC400DA6239 /* sequencedb.h */,
+                               A7F9F5CD141A5E500032F693 /* sequenceparser.h */,
+                               A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */,
                                 A7E9B80412D37EC400DA6239 /* sharedlistvector.cpp */,
                                 A7E9B80512D37EC400DA6239 /* sharedlistvector.h */,
                                 A7E9B80E12D37EC400DA6239 /* sharedordervector.h */,
@@ -2137,6 +2147,8 @@
                                 A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */,
                                 A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */,
                                 A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */,
+                               A7FF19F2140FFDA500AD216D /* trimoligos.cpp in Sources */,
+                               A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */,
                         );
                         runOnlyForDeploymentPostprocessing = 0;
                 };
diff --git a/aligncommand.cpp b/aligncommand.cpp

index 05ecd46ac6a3e192d834c6f26cdb5e1aa533cb71..960d70229a084923fd1cc8dc368ff4b71d815cbd 100644 (file)
--- a/aligncommand.cpp
+++ b/aligncommand.cpp
@@ -322,7 +322,7 @@ int AlignCommand::execute(){
  #ifdef USE_MPI 
                                 int pid, numSeqsPerProcessor; 
                                 int tag = 2001;
-                               vector<unsigned long int> MPIPos;
+                               vector<unsigned long long> MPIPos;
                                 MPIWroteAccnos = false;
                         
                                 MPI_Status status; 
@@ -426,7 +426,7 @@ int AlignCommand::execute(){
                                 
  #else
  
-                       vector<unsigned long int> positions; 
+                       vector<unsigned long long> positions; 
                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                         positions = m->divideFile(candidateFileNames[s], processors);
                         for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(new linePair(positions[i], positions[(i+1)]));  }
@@ -623,7 +623,7 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam
                         delete candidateSeq;
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = inFASTA.tellg();
+                               unsigned long long pos = inFASTA.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (inFASTA.eof()) { break; }
@@ -650,7 +650,7 @@ int AlignCommand::driver(linePair* filePos, string alignFName, string reportFNam
  }
  //**********************************************************************************************************************
  #ifdef USE_MPI
-int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& alignFile, MPI_File& reportFile, MPI_File& accnosFile, vector<unsigned long int>& MPIPos){
+int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& alignFile, MPI_File& reportFile, MPI_File& accnosFile, vector<unsigned long long>& MPIPos){
         try {
                 string outputString = "";
                 MPI_Status statusReport; 
diff --git a/aligncommand.h b/aligncommand.h

index 4a5eb11bf320a500ddfca34f0504130637fda0f0..6236958a42a886bf5178aab8571f1513c596e00f 100644 (file)
--- a/aligncommand.h
+++ b/aligncommand.h
@@ -45,9 +45,9 @@ public:
         
  private:
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
         vector<int> processIDS;   //processid
         vector<linePair*> lines;
@@ -61,7 +61,7 @@ private:
         void appendReportFiles(string, string);
         
         #ifdef USE_MPI
-       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
         #endif
         
         string candidateFileName, templateFileName, distanceFileName, search, align, outputDir;
@@ -86,15 +86,15 @@ typedef struct alignData {
         string align;
         string search;
         bool flip;
-       unsigned long int start;
-       unsigned long int end;
+       unsigned long long start;
+       unsigned long long end;
         MothurOut* m;
         //AlignmentDB* templateDB;
         float match, misMatch, gapOpen, gapExtend, threshold;
         int count, kmerSize, threadID;
         
         alignData(){}
-       alignData(string a, string r, string ac, string f, string al, string se, int ks, MothurOut* mout, unsigned long int st, unsigned long int en, bool fl, float ma, float misMa, float gapO, float gapE, float thr, int tid) {
+       alignData(string a, string r, string ac, string f, string al, string se, int ks, MothurOut* mout, unsigned long long st, unsigned long long en, bool fl, float ma, float misMa, float gapO, float gapE, float thr, int tid) {
                 alignFName = a;
                 reportFName = r;
                 accnosFName = ac;
diff --git a/alignmentdb.cpp b/alignmentdb.cpp

index 57975d6d774a97b1d171b468c7a238158a7b0ebe..df4eaec783dbb81e33c09c3d97473c02425ba1ce 100644 (file)
--- a/alignmentdb.cpp
+++ b/alignmentdb.cpp
@@ -77,7 +77,7 @@ AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gap
                         
                         #ifdef USE_MPI  
                                 int pid, processors;
-                               vector<unsigned long int> positions;
+                               vector<unsigned long long> positions;
                         
                                 MPI_Status status; 
                                 MPI_File inMPI;
diff --git a/amovacommand.cpp b/amovacommand.cpp

index 141673375fb8bb1a37e67980df9708fe6d2da126..de277ab6edad4828ee5a35f37474cd4a3a8eee55 100644 (file)
--- a/amovacommand.cpp
+++ b/amovacommand.cpp
@@ -174,10 +174,17 @@ int AmovaCommand::execute(){
                 //link designMap to rows/columns in distance matrix
                 map<string, vector<int> > origGroupSampleMap;
                 for(int i=0;i<sampleNames.size();i++){
-                       origGroupSampleMap[designMap->getGroup(sampleNames[i])].push_back(i);
+                       string group = designMap->getGroup(sampleNames[i]);
+                       
+                       if (group == "not found") {
+                               m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+                       }else { origGroupSampleMap[group].push_back(i); }
+                       
                 }
                 int numGroups = origGroupSampleMap.size();
                 
+               if (m->control_pressed) { delete designMap; return 0; }
+               
                 //create a new filename
                 ofstream AMOVAFile;
                 string AMOVAFileName = outputDir + m->getRootName(m->getSimpleName(phylipFileName))  + "amova";                         
diff --git a/anosimcommand.cpp b/anosimcommand.cpp

index c5cb4d60c751d6614b82cd961c29d9e645fcbaa8..50b6f283d8d58de276b0a08f573894628a98295f 100644 (file)
--- a/anosimcommand.cpp
+++ b/anosimcommand.cpp
@@ -176,10 +176,16 @@ int AnosimCommand::execute(){
                 //link designMap to rows/columns in distance matrix
                 map<string, vector<int> > origGroupSampleMap;
                 for(int i=0;i<sampleNames.size();i++){
-                       origGroupSampleMap[designMap->getGroup(sampleNames[i])].push_back(i);
+                       string group = designMap->getGroup(sampleNames[i]);
+                       
+                       if (group == "not found") {
+                               m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+                       }else { origGroupSampleMap[group].push_back(i); }
                 }
                 int numGroups = origGroupSampleMap.size();
                 
+               if (m->control_pressed) { delete designMap; return 0; }
+               
                 //create a new filename
                 ofstream ANOSIMFile;
                 string ANOSIMFileName = outputDir + m->getRootName(m->getSimpleName(phylipFileName))  + "anosim";                               
diff --git a/bayesian.cpp b/bayesian.cpp

index a12afed8a0d0c0efa8448a55fb3e1d3e0fa450b9..d62bb23e3e23ae1c98b3757ddd4075b4edf5a806 100644 (file)
--- a/bayesian.cpp
+++ b/bayesian.cpp
@@ -19,6 +19,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
                 
                 threadID = tid;
                 string baseName = tempFile;
+                       
                 if (baseName == "saved") { baseName = rdb->getSavedReference(); }
                 
                 string baseTName = tfile;
@@ -49,7 +50,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
                 }
                 
                 //if you want to save, but you dont need to calculate then just read
-               if (rdb->save && probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood) {  
+               if (rdb->save && probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood && (tempFile != "saved")) {  
                         ifstream saveIn;
                         m->openInputFile(tempFile, saveIn);
                         
@@ -445,8 +446,8 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string
                 #ifdef USE_MPI
                         
                         int pid, num, num2, processors;
-                       vector<unsigned long int> positions;
-                       vector<unsigned long int> positions2;
+                       vector<unsigned long long> positions;
+                       vector<unsigned long long> positions2;
                         
                         MPI_Status status; 
                         MPI_File inMPI;
@@ -575,7 +576,7 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string
                         string line = m->getline(in); m->gobble(in);
                         
                         in >> numKmers; m->gobble(in);
-                       
+                       //cout << threadID << '\t' << line << '\t' << numKmers << &in << '\t' << &inNum << '\t' << genusNodes.size() << endl;
                         //initialze probabilities
                         wordGenusProb.resize(numKmers);
                         
@@ -588,22 +589,25 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string
                         
                         //read version
                         string line2 = m->getline(inNum); m->gobble(inNum);
-                       
+               //cout << threadID << '\t' << line2 << '\t' << this << endl;    
                         while (inNum) {
                                 inNum >> zeroCountProb[count] >> num[count];  
                                 count++;
                                 m->gobble(inNum);
+                               //cout << threadID << '\t' << count << endl;
                         }
                         inNum.close();
-               
+               //cout << threadID << '\t' << "here1 " << &wordGenusProb << '\t' << &num << endl; //
+               //cout << threadID << '\t' << &genusTotals << '\t' << endl; 
+               //cout << threadID << '\t' << genusNodes.size() << endl;
                         while(in) {
                                 in >> kmer;
-                               
+                       //cout << threadID << '\t' << kmer << endl;
                                 //set them all to zero value
                                 for (int i = 0; i < genusNodes.size(); i++) {
                                         wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
                                 }
-                               
+                       //cout << threadID << '\t' << num[kmer] << "here" << endl;      
                                 //get probs for nonzero values
                                 for (int i = 0; i < num[kmer]; i++) {
                                         in >> name >> prob;
@@ -613,7 +617,7 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string
                                 m->gobble(in);
                         }
                         in.close();
-                       
+               //cout << threadID << '\t' << "here" << endl;   
                 #endif
         }
         catch(exception& e) {
diff --git a/bellerophon.cpp b/bellerophon.cpp

index 207fa51583b0835810ffaf1e52db88e95252346b..5a0a444b76146f94ada0daccf025c6d6d327d5d0 100644 (file)
--- a/bellerophon.cpp
+++ b/bellerophon.cpp
@@ -246,7 +246,7 @@ int Bellerophon::getChimeras() {
                 numSeqsPerProcessor = iters / processors;
                 
                 //each process hits this only once
-               unsigned long int startPos = pid * numSeqsPerProcessor;
+               unsigned long long startPos = pid * numSeqsPerProcessor;
                 if(pid == processors - 1){
                                 numSeqsPerProcessor = iters - pid * numSeqsPerProcessor;
                 }
@@ -326,7 +326,7 @@ int Bellerophon::getChimeras() {
                                 int numSeqsPerProcessor = iters / processors;
                                 
                                 for (int i = 0; i < processors; i++) {
-                                       unsigned long int startPos = i * numSeqsPerProcessor;
+                                       unsigned long long startPos = i * numSeqsPerProcessor;
                                         if(i == processors - 1){
                                                 numSeqsPerProcessor = iters - i * numSeqsPerProcessor;
                                         }
diff --git a/bellerophon.h b/bellerophon.h

index 08dca2b6cbbd296438b8575d0ad1d4e1919fa975..36555bf0d406addb3cc7685ee55fdac341dffe68 100644 (file)
--- a/bellerophon.h
+++ b/bellerophon.h
@@ -36,9 +36,9 @@ class Bellerophon : public Chimera {
                 
         private:
                 struct linePair {
-                       unsigned long int start;
+                       unsigned long long start;
                         int num;
-                       linePair(unsigned long int i, int j) : start(i), num(j) {}
+                       linePair(unsigned long long i, int j) : start(i), num(j) {}
                 };
                 
                 vector<linePair> lines;
diff --git a/chimera.cpp b/chimera.cpp

index ed843971ac7d4ab229595262631d548494f27d1c..715841662d9de15b3ab40d581e5ab1a4619c99bf 100644 (file)
--- a/chimera.cpp
+++ b/chimera.cpp
@@ -124,7 +124,7 @@ vector<Sequence*> Chimera::readSeqs(string file) {
                         
                         #ifdef USE_MPI  
                                 int pid, processors;
-                               vector<unsigned long int> positions;
+                               vector<unsigned long long> positions;
                                 int numSeqs;
                                 int tag = 2001;
                         
diff --git a/chimera.h b/chimera.h

index 7e327dd86bbe92a9ff6280847829134ed059f24a..1df4d6b2122d88318193a74d906e601e9cd214a1 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -133,9 +133,9 @@ struct sim {
  };
  
  struct linePair {
-                       unsigned long int start;
-                       unsigned long int end;
-                       linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+                       unsigned long long start;
+                       unsigned long long end;
+                       linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
                         linePair(){}
  };
  
diff --git a/chimeraccodecommand.cpp b/chimeraccodecommand.cpp

index 7c8485adfedc0a0c39dc928e50b45fc6bfef2a11..321ae77cf743420284e4dfac274d9c88b9b4557f 100644 (file)
--- a/chimeraccodecommand.cpp
+++ b/chimeraccodecommand.cpp
@@ -299,7 +299,7 @@ int ChimeraCcodeCommand::execute(){
                 
                                 int pid, numSeqsPerProcessor; 
                                 int tag = 2001;
-                               vector<unsigned long int> MPIPos;
+                               vector<unsigned long long> MPIPos;
                                                                 
                                 MPI_Status status; 
                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
@@ -390,7 +390,7 @@ int ChimeraCcodeCommand::execute(){
  
                         outHeader.close();
                         
-                       vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
+                       vector<unsigned long long> positions = m->divideFile(fastaFileNames[s], processors);
                                 
                         for (int i = 0; i < (positions.size()-1); i++) {
                                 lines.push_back(new linePair(positions[i], positions[(i+1)]));
@@ -522,7 +522,7 @@ int ChimeraCcodeCommand::driver(linePair* filePos, string outputFName, string fi
                         delete candidateSeq;
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = inFASTA.tellg();
+                               unsigned long long pos = inFASTA.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (inFASTA.eof()) { break; }
@@ -547,7 +547,7 @@ int ChimeraCcodeCommand::driver(linePair* filePos, string outputFName, string fi
  }
  //**********************************************************************************************************************
  #ifdef USE_MPI
-int ChimeraCcodeCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long int>& MPIPos){
+int ChimeraCcodeCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long long>& MPIPos){
         try {
                                 
                 MPI_Status status; 
diff --git a/chimeraccodecommand.h b/chimeraccodecommand.h

index 87f5ee87f7df50625be6a7a6407070e73dfc5b8c..5b8092b6f7075111c7fa8c5c58caf14ef3ebaeca 100644 (file)
--- a/chimeraccodecommand.h
+++ b/chimeraccodecommand.h
@@ -36,9 +36,9 @@ public:
                 
  private:
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
         vector<int> processIDS;   //processid
         vector<linePair*> lines;
@@ -47,7 +47,7 @@ private:
         int createProcesses(string, string, string);
         
         #ifdef USE_MPI
-       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
         #endif
  
         bool abort, filter, save;
diff --git a/chimeracheckcommand.cpp b/chimeracheckcommand.cpp

index 2b88eb5736bfd37364461facb043b0db76aa3cfc..3a530fde0ab5b31c71cbe19437984bb8a75578c1 100644 (file)
--- a/chimeracheckcommand.cpp
+++ b/chimeracheckcommand.cpp
@@ -349,7 +349,7 @@ int ChimeraCheckCommand::execute(){
                 
                                 int pid, numSeqsPerProcessor; 
                                 int tag = 2001;
-                               vector<unsigned long int> MPIPos;
+                               vector<unsigned long long> MPIPos;
                                 
                                 MPI_Status status; 
                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
@@ -424,7 +424,7 @@ int ChimeraCheckCommand::execute(){
                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
                 #else
                         
-                       vector<unsigned long int> positions = m->divideFile(fastaFileNames[i], processors);
+                       vector<unsigned long long> positions = m->divideFile(fastaFileNames[i], processors);
                                 
                         for (int s = 0; s < (positions.size()-1); s++) {
                                 lines.push_back(new linePair(positions[s], positions[(s+1)]));
@@ -520,7 +520,7 @@ int ChimeraCheckCommand::driver(linePair* filePos, string outputFName, string fi
                         delete candidateSeq;
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = inFASTA.tellg();
+                               unsigned long long pos = inFASTA.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (inFASTA.eof()) { break; }
@@ -544,7 +544,7 @@ int ChimeraCheckCommand::driver(linePair* filePos, string outputFName, string fi
  }
  //**********************************************************************************************************************
  #ifdef USE_MPI
-int ChimeraCheckCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long int>& MPIPos){
+int ChimeraCheckCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long long>& MPIPos){
         try {
                 MPI_File outAccMPI;
                 MPI_Status status; 
diff --git a/chimeracheckcommand.h b/chimeracheckcommand.h

index 350e10ea4bbbfc5bee07906e93edb029e996dc5f..918b4e5e9bf8aa4a16a1e0ba0716e09753a29cea 100644 (file)
--- a/chimeracheckcommand.h
+++ b/chimeracheckcommand.h
@@ -38,9 +38,9 @@ public:
  private:
  
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
  
         vector<int> processIDS;   //processid
@@ -50,7 +50,7 @@ private:
         int createProcesses(string, string);
                 
         #ifdef USE_MPI
-       int driverMPI(int, int, MPI_File&, MPI_File&, vector<unsigned long int>&);
+       int driverMPI(int, int, MPI_File&, MPI_File&, vector<unsigned long long>&);
         #endif
  
         bool abort, svg, save;
diff --git a/chimerapintailcommand.cpp b/chimerapintailcommand.cpp

index 33ac69a37a33e6cb3b6e2993850ae7611f5f5a74..76dd0dab8efbb18d0f721328d60477ab96c24847 100644 (file)
--- a/chimerapintailcommand.cpp
+++ b/chimerapintailcommand.cpp
@@ -416,7 +416,7 @@ int ChimeraPintailCommand::execute(){
                 #ifdef USE_MPI
                         int pid, numSeqsPerProcessor; 
                                 int tag = 2001;
-                               vector<unsigned long int> MPIPos;
+                               vector<unsigned long long> MPIPos;
                                 
                                 MPI_Status status; 
                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
@@ -486,7 +486,7 @@ int ChimeraPintailCommand::execute(){
                                 MPI_File_close(&outMPIAccnos);
                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
                 #else
-                       vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
+                       vector<unsigned long long> positions = m->divideFile(fastaFileNames[s], processors);
                                 
                         for (int i = 0; i < (positions.size()-1); i++) {
                                 lines.push_back(new linePair(positions[i], positions[(i+1)]));
@@ -610,7 +610,7 @@ int ChimeraPintailCommand::driver(linePair* filePos, string outputFName, string
                         delete candidateSeq;
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = inFASTA.tellg();
+                               unsigned long long pos = inFASTA.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (inFASTA.eof()) { break; }
@@ -635,7 +635,7 @@ int ChimeraPintailCommand::driver(linePair* filePos, string outputFName, string
  }
  //**********************************************************************************************************************
  #ifdef USE_MPI
-int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long int>& MPIPos){
+int ChimeraPintailCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, vector<unsigned long long>& MPIPos){
         try {
                                 
                 MPI_Status status; 
diff --git a/chimerapintailcommand.h b/chimerapintailcommand.h

index 9e1d6f8d253e61cc22e208eaf4d68259c06d7b02..ddbb648593c349bc52ca13bf990a5bc7f31bf50a 100644 (file)
--- a/chimerapintailcommand.h
+++ b/chimerapintailcommand.h
@@ -38,9 +38,9 @@ private:
         ReferenceDB* rdb;
         
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
  
         vector<int> processIDS;   //processid
@@ -50,7 +50,7 @@ private:
         int createProcesses(string, string, string);
         
         #ifdef USE_MPI
-       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
         #endif
  
         bool abort, filter, save;
diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp

index 0732bc20c3bf534c249a0193409dbf72d874aa0f..0958630862ebf3e61c9d2ec23e5618ddaa868ef1 100644 (file)
--- a/chimeraslayercommand.cpp
+++ b/chimeraslayercommand.cpp
@@ -509,7 +509,7 @@ int ChimeraSlayerCommand::execute(){
                         string trimFastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s]))  + "slayer.fasta";
                         
                         //create chimera here if you are mac or linux because fork will copy for you. Create in create processes instead if you are windows.
-                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || USE_MPI
                         if (templatefile != "self") { //you want to run slayer with a reference template
                                 chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign, blastlocation, rand());      
                         }else {
@@ -546,7 +546,7 @@ int ChimeraSlayerCommand::execute(){
                 #ifdef USE_MPI  
                         int pid, numSeqsPerProcessor; 
                                 int tag = 2001;
-                               vector<unsigned long int> MPIPos;
+                               vector<unsigned long long> MPIPos;
                                 
                                 MPI_Status status; 
                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
@@ -659,7 +659,7 @@ int ChimeraSlayerCommand::execute(){
                                 
                 #else
                         //break up file
-                       vector<unsigned long int> positions; 
+                       vector<unsigned long long> positions; 
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                         positions = m->divideFile(fastaFileNames[s], processors);
                         for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(new linePair(positions[i], positions[(i+1)]));  }
@@ -716,7 +716,7 @@ int ChimeraSlayerCommand::execute(){
                         
                 #endif
                         
-                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || USE_MPI
                         delete chimera;
                         #endif
                         
@@ -849,7 +849,7 @@ int ChimeraSlayerCommand::driver(linePair* filePos, string outputFName, string f
                         }
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = inFASTA.tellg();
+                               unsigned long long pos = inFASTA.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (inFASTA.eof()) { break; }
@@ -876,7 +876,7 @@ int ChimeraSlayerCommand::driver(linePair* filePos, string outputFName, string f
  }
  //**********************************************************************************************************************
  #ifdef USE_MPI
-int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, vector<unsigned long int>& MPIPos){
+int ChimeraSlayerCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& outMPI, MPI_File& outAccMPI, MPI_File& outFastaMPI, vector<unsigned long long>& MPIPos){
         try {                           
                 MPI_Status status; 
                 int pid;
diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h

index a54b24f4471e5d7aab5c185952f65f500d6cec7a..b37d4c04ac397fb2ede7ba1d2e0501b9508e5b9d 100644 (file)
--- a/chimeraslayercommand.h
+++ b/chimeraslayercommand.h
@@ -36,9 +36,9 @@ public:
  private:
  
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
  
         vector<int> processIDS;   //processid
@@ -51,7 +51,7 @@ private:
         map<string, int> sortFastaFile(string, string);
                 
         #ifdef USE_MPI
-       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
         #endif
  
         bool abort, realign, trim, trimera, save;
@@ -81,8 +81,8 @@ typedef struct slayerData {
         string blastlocation;
         bool trimera;
         bool trim, realign;
-       unsigned long int start;
-       unsigned long int end;
+       unsigned long long start;
+       unsigned long long end;
         int ksize, match, mismatch, window, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted;
         MothurOut* m;
         float divR;
@@ -92,7 +92,7 @@ typedef struct slayerData {
         int threadId;
         
         slayerData(){}
-       slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long int st, unsigned long int en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
+       slayerData(string o, string fa, string ac, string f, string te, string se, string bl, bool tri, bool trm, bool re, MothurOut* mout, unsigned long long st, unsigned long long en, int ks, int ma, int mis, int win, int minS, int minC, int miBS, int minSN, int par, int it, int inc, int numw, float div, map<string, int> prior, int tid) {
                 outputFName = o;
                 fasta = fa;
                 accnos = ac;
diff --git a/classify.cpp b/classify.cpp

index e31e8ccf9fe209809f2521d0556704ebfc114b3e..eb0865c2744acfe120505a5a533cd482af8b9b19 100644 (file)
--- a/classify.cpp
+++ b/classify.cpp
@@ -89,7 +89,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
                         m->mothurOut("Generating search database...    "); cout.flush();
         #ifdef USE_MPI  
                                 int pid, processors;
-                               vector<unsigned long int> positions;
+                               vector<unsigned long long> positions;
                                 int tag = 2001;
                         
                                 MPI_Status status; 
@@ -252,7 +252,7 @@ int Classify::readTaxonomy(string file) {
  
  #ifdef USE_MPI 
                 int pid, num, processors;
-               vector<unsigned long int> positions;
+               vector<unsigned long long> positions;
                 int tag = 2001;
                 
                 MPI_Status status; 
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp

index 4291132c74ea9537e9abfdea7779ac404d636c2a..3b7b4a8c595cfeb9d6149a05a2b5347e6adfe6cc 100644 (file)
--- a/classifyseqscommand.cpp
+++ b/classifyseqscommand.cpp
@@ -8,11 +8,6 @@
   */
  
  #include "classifyseqscommand.h"
-#include "sequence.hpp"
-#include "bayesian.h"
-#include "phylotree.h"
-#include "phylosummary.h"
-#include "knn.h"
  
  
  
@@ -383,6 +378,11 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                         
                         temp = validParameter.validFile(parameters, "save", false);                     if (temp == "not found"){       temp = "f";                             }
                         save = m->isTrue(temp); 
+                       //this is so the threads can quickly load the reference data
+                       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       #else
+                       if (processors != 1) { save = true; }
+                       #endif
                         rdb->save = save; 
                         if (save) { //clear out old references
                                 rdb->clearMemory();     
@@ -484,7 +484,6 @@ int ClassifySeqsCommand::execute(){
                 }
                 
                 if (m->control_pressed) { delete classify; return 0; }
-               
                                 
                 for (int s = 0; s < fastaFileNames.size(); s++) {
                 
@@ -518,7 +517,7 @@ int ClassifySeqsCommand::execute(){
  #ifdef USE_MPI 
                                 int pid, numSeqsPerProcessor; 
                                 int tag = 2001;
-                               vector<unsigned long int> MPIPos;
+                               vector<unsigned long long> MPIPos;
                                 
                                 MPI_Status status; 
                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
@@ -544,7 +543,7 @@ int ClassifySeqsCommand::execute(){
                                 MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax);
                                 MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax);
                                 
-                               if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify; return 0;  }
+                               if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI);  MPI_File_close(&outMPINewTax);   MPI_File_close(&outMPITempTax);  delete classify;  return 0;  }
                                 
                                 if (pid == 0) { //you are the root process 
                                         
@@ -599,25 +598,30 @@ int ClassifySeqsCommand::execute(){
                                 
  #else
                 
-                       vector<unsigned long int> positions = m->divideFile(fastaFileNames[s], processors);
+                       vector<unsigned long long> positions; 
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       positions = m->divideFile(fastaFileNames[s], processors);
+                       for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(new linePair(positions[i], positions[(i+1)]));  }
+#else
+                       if (processors == 1) {
+                               lines.push_back(new linePair(0, 1000));
+                       }else {
+                               positions = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); 
                                 
-                       for (int i = 0; i < (positions.size()-1); i++) {
-                               lines.push_back(new linePair(positions[i], positions[(i+1)]));
-                       }       
-                       
-               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                               //figure out how many sequences you have to process
+                               int numSeqsPerProcessor = numFastaSeqs / processors;
+                               for (int i = 0; i < processors; i++) {
+                                       int startIndex =  i * numSeqsPerProcessor;
+                                       if(i == (processors - 1)){      numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
+                                       lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor));
+                               }
+                       }
+#endif
                         if(processors == 1){
                                 numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
-                       }
-                       else{
-                               processIDS.resize(0);
-                               
+                       }else{
                                 numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); 
-                               
                         }
-       #else
-                       numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
-       #endif  
  #endif
  
                 m->mothurOutEndLine();
@@ -746,6 +750,7 @@ int ClassifySeqsCommand::execute(){
                 }
                 
                 delete classify;
+               
                 return 0;
         }
         catch(exception& e) {
@@ -787,9 +792,12 @@ string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
  
  int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) {
         try {
+               
+               int num = 0;
+               processIDS.clear();
+               
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                 int process = 1;
-               int num = 0;
                 
                 //loop through and create all the processes you want
                 while (process != processors) {
@@ -832,6 +840,46 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile,
                         if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
                         in.close(); m->mothurRemove(m->getFullPathName(tempFile));
                 }
+#else
+               //////////////////////////////////////////////////////////////////////////////////////////////////////
+               //Windows version shared memory, so be careful when passing variables through the alignData struct. 
+               //Above fork() will clone, so memory is separate, but that's not the case with windows, 
+               //////////////////////////////////////////////////////////////////////////////////////////////////////
+               
+               vector<classifyData*> pDataArray; 
+               DWORD   dwThreadIdArray[processors-1];
+               HANDLE  hThreadArray[processors-1]; 
+               
+               //Create processor worker threads.
+               for( int i=0; i<processors-1; i++ ){
+                       // Allocate memory for thread data.
+                       string extension = "";
+                       if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
+                       
+                       classifyData* tempclass = new classifyData(probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i);
+                       pDataArray.push_back(tempclass);
+                       
+                       //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+                       //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+                       hThreadArray[i] = CreateThread(NULL, 0, MyClassThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);  
+                       
+               }
+               
+               //parent does its part
+               num = driver(lines[processors-1], taxFileName + toString(processors-1) + ".temp", tempTaxFile + toString(processors-1) + ".temp", filename);
+               processIDS.push_back((processors-1));
+               
+               //Wait until all threads have terminated.
+               WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+               
+               //Close all thread handles and free memory allocations.
+               for(int i=0; i < pDataArray.size(); i++){
+                       num += pDataArray[i]->count;
+                       CloseHandle(hThreadArray[i]);
+                       delete pDataArray[i];
+               }
+               
+       #endif  
                 
                 for(int i=0;i<processIDS.size();i++){
                         appendTaxFiles((taxFileName + toString(processIDS[i]) + ".temp"), taxFileName);
@@ -841,7 +889,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile,
                 }
                 
                 return num;
-#endif         
+               
         }
         catch(exception& e) {
                 m->errorOut(e, "ClassifySeqsCommand", "createProcesses");
@@ -918,7 +966,7 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT
                         delete candidateSeq;
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = inFASTA.tellg();
+                               unsigned long long pos = inFASTA.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (inFASTA.eof()) { break; }
@@ -944,7 +992,7 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT
  }
  //**********************************************************************************************************************
  #ifdef USE_MPI
-int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector<unsigned long int>& MPIPos){
+int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector<unsigned long long>& MPIPos){
         try {
                 MPI_Status statusNew; 
                 MPI_Status statusTemp; 
diff --git a/classifyseqscommand.h b/classifyseqscommand.h

index 3dbf06ee5bf18c5f6c5ee96e717ec4348c919310..58de96937d995d5eb8a2169fcda3950a5f34b4d2 100644 (file)
--- a/classifyseqscommand.h
+++ b/classifyseqscommand.h
@@ -14,6 +14,12 @@
  #include "command.hpp"
  #include "classify.h"
  #include "referencedb.h"
+#include "sequence.hpp"
+#include "bayesian.h"
+#include "phylotree.h"
+#include "phylosummary.h"
+#include "knn.h"
+
  
  //KNN and Bayesian methods modeled from algorithms in
  //Naı¨ve Bayesian Classiﬁer for Rapid Assignment of rRNA Sequences 
@@ -46,9 +52,9 @@ public:
         
  private:
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
  
         vector<int> processIDS;   //processid
@@ -75,9 +81,140 @@ private:
         
         int MPIReadNamesFile(string);
         #ifdef USE_MPI
-       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
+       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&);
         #endif
  };
  
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+typedef struct classifyData {
+       string taxFName; 
+       string tempTFName; 
+       string filename;
+       string search, taxonomyFileName, templateFileName, method;
+       unsigned long long start;
+       unsigned long long end;
+       MothurOut* m;
+       float match, misMatch, gapOpen, gapExtend;
+       int count, kmerSize, threadID, cutoff, iters, numWanted;
+       bool probs;
+        
+       classifyData(){}
+       classifyData(bool p, string me, string te, string tx, string a, string r, string f, string se, int ks, int i, int numW, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int cut, int tid) {
+               taxonomyFileName = tx;
+               templateFileName = te;
+               taxFName = a;
+               tempTFName = r;
+               filename = f;
+               search = se;
+               method = me;
+               m = mout;
+               start = st;
+               end = en;
+               match = ma; 
+               misMatch = misMa;
+               gapOpen = gapO; 
+               gapExtend = gapE; 
+               kmerSize = ks;
+               cutoff = cut;
+               iters = i;
+               numWanted = numW;
+               threadID = tid;
+               probs = p;
+               count = 0;
+       }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ 
+       classifyData* pDataArray;
+       pDataArray = (classifyData*)lpParam;
+       
+       try {
+               ofstream outTax;
+               pDataArray->m->openOutputFile(pDataArray->taxFName, outTax);
+               
+               ofstream outTaxSimple;
+               pDataArray->m->openOutputFile(pDataArray->tempTFName, outTaxSimple);
+               
+               ifstream inFASTA;
+               pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
+               
+               string taxonomy;
+                               
+               //print header if you are process 0
+               if ((pDataArray->start == 0) || (pDataArray->start == 1)) {
+                       inFASTA.seekg(0);
+               }else { //this accounts for the difference in line endings. 
+                       inFASTA.seekg(pDataArray->start-1); pDataArray->m->gobble(inFASTA); 
+               }
+               
+               pDataArray->count = pDataArray->end;
+               
+               //make classify
+               Classify* myclassify;
+               if(pDataArray->method == "bayesian"){   myclassify = new Bayesian("saved", "saved", pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID);             }
+               else if(pDataArray->method == "knn"){   myclassify = new Knn("saved", "saved", pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID);                             }
+               else {
+                       pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
+                       pDataArray->m->mothurOutEndLine();
+                       myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID);   
+               }
+               
+               if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
+               
+               int count = 0;
+               for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+                       
+                       if (pDataArray->m->control_pressed) { delete myclassify; return 0; }
+                       
+                       Sequence* candidateSeq = new Sequence(inFASTA); pDataArray->m->gobble(inFASTA);
+                       
+                       if (candidateSeq->getName() != "") {
+                               
+                               taxonomy = myclassify->getTaxonomy(candidateSeq);
+                               
+                               if (pDataArray->m->control_pressed) { delete candidateSeq; return 0; }
+                               
+                               if (taxonomy != "bad seq") {
+                                       //output confidence scores or not
+                                       if (pDataArray->probs) {
+                                               outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+                                       }else{
+                                               outTax << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
+                                       }
+                                       
+                                       outTaxSimple << candidateSeq->getName() << '\t' << myclassify->getSimpleTax() << endl;
+                               }
+                               count++;
+                       }
+                       delete candidateSeq;
+                       //report progress
+                       if((count) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine();         }
+                       
+               }
+               //report progress
+               if((count) % 100 != 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(count)); pDataArray->m->mothurOutEndLine();         }
+               
+               delete myclassify;
+               inFASTA.close();
+               outTax.close();
+               outTaxSimple.close();
+               
+       }
+       catch(exception& e) {
+               pDataArray->m->errorOut(e, "ClassifySeqsCommand", "MyClassThreadFunction");
+               exit(1);
+       }
+} 
+#endif
+
+
+
+
  #endif
  
diff --git a/clustercommand.cpp b/clustercommand.cpp

index 059f277db6856e2b8c360dbb03b1929d7fa39de5..678912c2076a10af0580f9b778238e3134542c44 100644 (file)
--- a/clustercommand.cpp
+++ b/clustercommand.cpp
@@ -326,7 +326,7 @@ int ClusterCommand::execute(){
                         cout.flush();
                         print_start = false;
                 }
-       
+               
                 if(previousDist <= 0.0000){
                         printData("unique");
                 }
diff --git a/clusterfragmentscommand.cpp b/clusterfragmentscommand.cpp

index 5c8d12b1d9bbb2a32798bacdb7c762110b8cf343..8d2d3eac9ba398dbce884d61c99dd81b3f2442b6 100644 (file)
--- a/clusterfragmentscommand.cpp
+++ b/clusterfragmentscommand.cpp
@@ -226,7 +226,7 @@ int ClusterFragmentsCommand::execute(){
                 string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
                 
                 string newFastaFile = fileroot + "fragclust.fasta";
-               string newNamesFile = fileroot + "names";
+               string newNamesFile = fileroot + "fragclust.names";
                 
                 if (m->control_pressed) { return 0; }
                 
diff --git a/deconvolutecommand.cpp b/deconvolutecommand.cpp

index bc2dd481e5589ea5ba8b319aa707ecf3a3eea463..3fa622f01ff2fc334f5cbb9d79851a9ca7f8b30c 100644 (file)
--- a/deconvolutecommand.cpp
+++ b/deconvolutecommand.cpp
@@ -162,6 +162,7 @@ int DeconvoluteCommand::execute() {
                 map<string, string>::iterator itStrings;
                 set<string> nameInFastaFile; //for sanity checking
                 set<string>::iterator itname;
+               vector<string> nameFileOrder;
                 int count = 0;
                 while (!in.eof()) {
                         
@@ -189,8 +190,9 @@ int DeconvoluteCommand::execute() {
                                                         m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file, and not in your namefile, please correct."); m->mothurOutEndLine();
                                                 }else {
                                                         sequenceStrings[seq.getAligned()] = itNames->second;
+                                                       nameFileOrder.push_back(seq.getAligned());
                                                 }
-                                       }else { sequenceStrings[seq.getAligned()] = seq.getName();      }
+                                       }else { sequenceStrings[seq.getAligned()] = seq.getName();      nameFileOrder.push_back(seq.getAligned()); }
                                 }else { //this is a dup
                                         if (oldNameMapFName != "") {
                                                 itNames = nameMap.find(seq.getName());
@@ -222,17 +224,22 @@ int DeconvoluteCommand::execute() {
                 ofstream outNames;
                 m->openOutputFile(outNameFile, outNames);
                 
-               for (itStrings = sequenceStrings.begin(); itStrings != sequenceStrings.end(); itStrings++) {
+               for (int i = 0; i < nameFileOrder.size(); i++) {
+               //for (itStrings = sequenceStrings.begin(); itStrings != sequenceStrings.end(); itStrings++) {
                         if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); outNames.close(); m->mothurRemove(outNameFile); return 0; }
                         
-                       //get rep name
-                       int pos = (itStrings->second).find_first_of(',');
+                       itStrings = sequenceStrings.find(nameFileOrder[i]);
                         
-                       if (pos == string::npos) { // only reps itself
-                               outNames << itStrings->second << '\t' << itStrings->second << endl;
-                       }else {
-                               outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl;
-                       }
+                       if (itStrings != sequenceStrings.end()) {
+                               //get rep name
+                               int pos = (itStrings->second).find_first_of(',');
+                       
+                               if (pos == string::npos) { // only reps itself
+                                       outNames << itStrings->second << '\t' << itStrings->second << endl;
+                               }else {
+                                       outNames << (itStrings->second).substr(0, pos) << '\t' << itStrings->second << endl;
+                               }
+                       }else{ m->mothurOut("[ERROR]: mismatch in namefile print."); m->mothurOutEndLine(); m->control_pressed = true; }
                 }
                 outNames.close();
                 
diff --git a/distancecommand.cpp b/distancecommand.cpp

index f7dff2e3cb3fb5e81cc8d17af0bcd8d303a5ab6a..beea1bf915fccffa6b1f92e212e8125db6010af9 100644 (file)
--- a/distancecommand.cpp
+++ b/distancecommand.cpp
@@ -317,7 +317,7 @@ int DistanceCommand::execute(){
                         
                                 //do your part
                                 string outputMyPart;
-                               unsigned long int mySize;
+                               unsigned long long mySize;
                                 
                                 if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
                                 else { driverMPI(start, end, outputFile, mySize, output); }
@@ -339,7 +339,7 @@ int DistanceCommand::execute(){
  
                                 //wait on chidren
                                 for(int b = 1; b < processors; b++) { 
-                                       unsigned long int fileSize;
+                                       unsigned long long fileSize;
                                         
                                         if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  delete distCalculator;  return 0; }
                                         
@@ -367,7 +367,7 @@ int DistanceCommand::execute(){
                                 MPI_File_close(&outMPI);
                         }else { //you are a child process
                                 //do your part
-                               unsigned long int size;
+                               unsigned long long size;
                                 if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
                                 else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
                                 
@@ -387,7 +387,7 @@ int DistanceCommand::execute(){
                         else { driver(0, numSeqs, outputFile, "square");  }
                 }else{ //you have multiple processors
                         
-                       unsigned long int numDists = 0;
+                       unsigned long long numDists = 0;
                         
                         if (output == "square") {
                                  numDists = numSeqs * numSeqs;
@@ -817,7 +817,7 @@ int DistanceCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI, flo
  }
  /**************************************************************************************************/
  /////// need to fix to work with calcs and sequencedb
-int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
+int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size){
         try {
                 ValidCalculators validCalculator;
                 Dist* distCalculator;
@@ -913,7 +913,7 @@ int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned
  }
  /**************************************************************************************************/
  /////// need to fix to work with calcs and sequencedb
-int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
+int DistanceCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size, string square){
         try {
                 ValidCalculators validCalculator;
                 Dist* distCalculator;
diff --git a/distancecommand.h b/distancecommand.h

index 9d5477d7ee1dca3d715189db8d61fab432c87cf3..66d8af15d1c3b2ffc67e747bb14c9ee7ca0f2480 100644 (file)
--- a/distancecommand.h
+++ b/distancecommand.h
@@ -213,8 +213,8 @@ private:
         
         #ifdef USE_MPI 
         int driverMPI(int, int, MPI_File&, float);
-       int driverMPI(int, int, string, unsigned long int&);
-       int driverMPI(int, int, string, unsigned long int&, string);
+       int driverMPI(int, int, string, unsigned long long&);
+       int driverMPI(int, int, string, unsigned long long&, string);
         #endif
         
         //int convertMatrix(string);
diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp

index 173227f372f4d4cbf7445e5f3fea440220e70b4a..e96b7fd3762fa69c5930066729e867ff69705b3d 100644 (file)
--- a/filterseqscommand.cpp
+++ b/filterseqscommand.cpp
@@ -341,7 +341,7 @@ int FilterSeqsCommand::filterSequences() {
  #ifdef USE_MPI 
                                 int pid, numSeqsPerProcessor, num; 
                                 int tag = 2001;
-                               vector<unsigned long int>MPIPos;
+                               vector<unsigned long long>MPIPos;
                                                 
                                 MPI_Status status; 
                                 MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
@@ -420,7 +420,7 @@ int FilterSeqsCommand::filterSequences() {
                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
                                 
  #else
-                       vector<unsigned long int> positions = m->divideFile(fastafileNames[s], processors);
+                       vector<unsigned long long> positions = m->divideFile(fastafileNames[s], processors);
                                 
                         for (int i = 0; i < (positions.size()-1); i++) {
                                 lines.push_back(new linePair(positions[i], positions[(i+1)]));
@@ -462,7 +462,7 @@ int FilterSeqsCommand::filterSequences() {
  }
  #ifdef USE_MPI
  /**************************************************************************************/
-int FilterSeqsCommand::driverMPIRun(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long int>& MPIPos) {        
+int FilterSeqsCommand::driverMPIRun(int start, int num, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long long>& MPIPos) {       
         try {
                 string outputString = "";
                 int count = 0;
@@ -568,7 +568,7 @@ int FilterSeqsCommand::driverRunFilter(string F, string outputFilename, string i
                         }
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = in.tellg();
+                               unsigned long long pos = in.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (in.eof()) { break; }
@@ -676,7 +676,7 @@ string FilterSeqsCommand::createFilter() {
  #ifdef USE_MPI 
                                 int pid, numSeqsPerProcessor, num; 
                                 int tag = 2001;
-                               vector<unsigned long int> MPIPos;
+                               vector<unsigned long long> MPIPos;
                                 
                                 MPI_Status status; 
                                 MPI_File inMPI; 
@@ -736,7 +736,7 @@ string FilterSeqsCommand::createFilter() {
                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
                                 
  #else
-               vector<unsigned long int> positions = m->divideFile(fastafileNames[s], processors);
+               vector<unsigned long long> positions = m->divideFile(fastafileNames[s], processors);
                                 
                 for (int i = 0; i < (positions.size()-1); i++) {
                         lines.push_back(new linePair(positions[i], positions[(i+1)]));
@@ -877,7 +877,7 @@ int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair*
                         }
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = in.tellg();
+                               unsigned long long pos = in.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (in.eof()) { break; }
@@ -899,7 +899,7 @@ int FilterSeqsCommand::driverCreateFilter(Filters& F, string filename, linePair*
  }
  #ifdef USE_MPI
  /**************************************************************************************/
-int FilterSeqsCommand::MPICreateFilter(int start, int num, Filters& F, MPI_File& inMPI, vector<unsigned long int>& MPIPos) {   
+int FilterSeqsCommand::MPICreateFilter(int start, int num, Filters& F, MPI_File& inMPI, vector<unsigned long long>& MPIPos) {  
         try {
                 
                 MPI_Status status; 
diff --git a/filterseqscommand.h b/filterseqscommand.h

index e445023df07f7c0c6e3bcc6c4d0e95c574e6174d..3bf36c040231c5f801281f9d553c60bed01101c8 100644 (file)
--- a/filterseqscommand.h
+++ b/filterseqscommand.h
@@ -33,9 +33,9 @@ public:
         \r
  private:\r
         struct linePair {\r
-               unsigned long int start;\r
-               unsigned long int end;\r
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}\r
+               unsigned long long start;\r
+               unsigned long long end;\r
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}\r
         };\r
  \r
         vector<linePair*> lines;\r
@@ -59,8 +59,8 @@ private:
         int driverRunFilter(string, string, string, linePair*);\r
         int driverCreateFilter(Filters& F, string filename, linePair* line);\r
         #ifdef USE_MPI\r
-       int driverMPIRun(int, int, MPI_File&, MPI_File&, vector<unsigned long int>&);\r
-       int MPICreateFilter(int, int, Filters&, MPI_File&, vector<unsigned long int>&); \r
+       int driverMPIRun(int, int, MPI_File&, MPI_File&, vector<unsigned long long>&);\r
+       int MPICreateFilter(int, int, Filters&, MPI_File&, vector<unsigned long long>&);        \r
         #endif\r
         \r
  };\r
diff --git a/flowdata.cpp b/flowdata.cpp

index d1769bbed46e34c12a1a8ad4a1f69bebaeca071d..6d6c89bc88fad0f6a474f5d466bc6c7d1d082e47 100644 (file)
--- a/flowdata.cpp
+++ b/flowdata.cpp
@@ -42,14 +42,11 @@ FlowData::FlowData(int numFlows, float signal, float noise, int maxHomoP, string
  bool FlowData::getNext(ifstream& flowFile){
         
         try {
-               
-               string lengthString;
-               string flowString;
-               
-               flowFile >> seqName >> endFlow;         
-               for(int i=0;i<numFlows;i++)     {       flowFile >> flowData[i];        }
-               
-               updateEndFlow();
+               flowFile >> seqName >> endFlow; 
+               //cout << "in Flowdata " + seqName << endl;
+               for(int i=0;i<numFlows;i++)     {       flowFile >> flowData[i];        }
+               //cout << "in Flowdata read " << seqName + " done" << endl;
+               updateEndFlow(); 
                 translateFlow();
                 
                 m->gobble(flowFile);
diff --git a/homovacommand.cpp b/homovacommand.cpp

index 22fd1bff1b0af5ac23532423f7b42471e1b6b3b3..18690048c712c69bad1b1e9aa44c4e6b64a8cee9 100644 (file)
--- a/homovacommand.cpp
+++ b/homovacommand.cpp
@@ -178,10 +178,16 @@ int HomovaCommand::execute(){
                 //link designMap to rows/columns in distance matrix
                 map<string, vector<int> > origGroupSampleMap;
                 for(int i=0;i<sampleNames.size();i++){
-                       origGroupSampleMap[designMap->getGroup(sampleNames[i])].push_back(i);
+                       string group = designMap->getGroup(sampleNames[i]);
+                       
+                       if (group == "not found") {
+                               m->mothurOut("[ERROR]: " + sampleNames[i] + " is not in your design file, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
+                       }else { origGroupSampleMap[group].push_back(i); }
                 }
                 int numGroups = origGroupSampleMap.size();
                 
+               if (m->control_pressed) { delete designMap; return 0; }
+               
                 //create a new filename
                 ofstream HOMOVAFile;
                 string HOMOVAFileName = outputDir + m->getRootName(m->getSimpleName(phylipFileName))  + "homova";                               
diff --git a/makefile b/makefile

index 0f7d0981991a9b12db6513bff253a47ea7e25009..3f6431210735a622f8c0e416a088581d331ed714 100644 (file)
--- a/makefile
+++ b/makefile
@@ -14,12 +14,12 @@ USEMPI ?= no
  USEREADLINE ?= yes
  CYGWIN_BUILD ?= no
  USECOMPRESSION ?= no
-MOTHUR_FILES="\"Enter_your_default_path_here\""
+MOTHUR_FILES="\"/Users/Sarahswork/desktop/release\""
  RELEASE_DATE = "\"7/25/2011\""
  VERSION = "\"1.21.0\""
  
  # Optimize to level 3:
-CXXFLAGS += -O3
+CXXFLAGS += -03 -g
  
  ifeq  ($(strip $(64BIT_VERSION)),yes)
         #if you are using centos uncomment the following lines
diff --git a/mothurout.cpp b/mothurout.cpp

index cdde507ce4464ef9da5d8f5041d56ae73818941f..c80bff27e4ab69bf26d0fddaae534fe625f6df0b 100644 (file)
--- a/mothurout.cpp
+++ b/mothurout.cpp
@@ -1050,19 +1050,27 @@ string MothurOut::sortFile(string distFile, string outputDir){
         }       
  }
  /**************************************************************************************************/
-vector<unsigned long int> MothurOut::setFilePosFasta(string filename, int& num) {
+vector<unsigned long long> MothurOut::setFilePosFasta(string filename, int& num) {
         try {
-                       vector<unsigned long int> positions;
+                       vector<unsigned long long> positions;
                         ifstream inFASTA;
-                       openInputFile(filename, inFASTA);
+                       //openInputFile(filename, inFASTA);
+                       inFASTA.open(filename.c_str(), ios::binary);
                                                 
                         string input;
+                       unsigned long long count = 0;
                         while(!inFASTA.eof()){
-                               input = getline(inFASTA); 
-                               if (input.length() != 0) {
-                                       if(input[0] == '>'){    unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
+                               //input = getline(inFASTA); 
+                               //cout << input << '\t' << inFASTA.tellg() << endl;
+                               //if (input.length() != 0) {
+                               //      if(input[0] == '>'){    unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);  cout << (pos - input.length() - 1) << endl; }
+                               //}
+                               //gobble(inFASTA); //has to be here since windows line endings are 2 characters and mess up the positions
+                               char c = inFASTA.get(); count++;
+                               if (c == '>') {
+                                       positions.push_back(count-1);
+                                       //cout << count << endl;
                                 }
-                               gobble(inFASTA); //has to be here since windows line endings are 2 characters and mess up the positions
                         }
                         inFASTA.close();
                 
@@ -1080,7 +1088,7 @@ vector<unsigned long int> MothurOut::setFilePosFasta(string filename, int& num)
                                 fclose (pFile);
                         }*/
                         
-                       unsigned long int size = positions[(positions.size()-1)];
+                       unsigned long long size = positions[(positions.size()-1)];
                         ifstream in;
                         openInputFile(filename, in);
                         
@@ -1093,6 +1101,7 @@ vector<unsigned long int> MothurOut::setFilePosFasta(string filename, int& num)
                         in.close();
                 
                         positions.push_back(size);
+                       positions[0] = 0;
                 
                         return positions;
         }
@@ -1102,31 +1111,51 @@ vector<unsigned long int> MothurOut::setFilePosFasta(string filename, int& num)
         }
  }
  /**************************************************************************************************/
-vector<unsigned long int> MothurOut::setFilePosEachLine(string filename, int& num) {
+vector<unsigned long long> MothurOut::setFilePosEachLine(string filename, int& num) {
         try {
                         filename = getFullPathName(filename);
                         
-                       vector<unsigned long int> positions;
+                       vector<unsigned long long> positions;
                         ifstream in;
-                       openInputFile(filename, in);
-                               
+                       //openInputFile(filename, in);
+                       in.open(filename.c_str(), ios::binary);
+               
                         string input;
+                       unsigned long long count = 0;
+                       positions.push_back(0);
+               
                         while(!in.eof()){
-                               unsigned long int lastpos = in.tellg();
-                               input = getline(in); 
-                               if (input.length() != 0) {
-                                       unsigned long int pos = in.tellg(); 
-                                       if (pos != -1) { positions.push_back(pos - input.length() - 1); }
-                                       else {  positions.push_back(lastpos);  }
+                               //unsigned long long lastpos = in.tellg();
+                               //input = getline(in); 
+                               //if (input.length() != 0) {
+                                       //unsigned long long pos = in.tellg(); 
+                                       //if (pos != -1) { positions.push_back(pos - input.length() - 1);       }
+                                       //else {  positions.push_back(lastpos);  }
+                               //}
+                               //gobble(in); //has to be here since windows line endings are 2 characters and mess up the positions
+                               
+                               
+                               //getline counting reads
+                               char d = in.get(); count++;
+                               while ((d != '\n') && (d != '\r') && (d != '\f') && (d != in.eof()))    {
+                                       //get next character
+                                       d = in.get(); 
+                                       count++;
+                               }
+                               
+                               if (!in.eof()) {
+                                       d=in.get(); count++;
+                                       while(isspace(d) && (d != in.eof()))            { d=in.get(); count++;}
                                 }
-                               gobble(in); //has to be here since windows line endings are 2 characters and mess up the positions
+                               positions.push_back(count-1);
+                               cout << count-1 << endl;
                         }
                         in.close();
                 
-                       num = positions.size();
+                       num = positions.size()-1;
                 
                         FILE * pFile;
-                       unsigned long int size;
+                       unsigned long long size;
                         
                         //get num bytes in file
                         pFile = fopen (filename.c_str(),"rb");
@@ -1137,7 +1166,7 @@ vector<unsigned long int> MothurOut::setFilePosEachLine(string filename, int& nu
                                 fclose (pFile);
                         }
                 
-                       positions.push_back(size);
+                       positions[(positions.size()-1)] = size;
                 
                         return positions;
         }
@@ -1148,14 +1177,14 @@ vector<unsigned long int> MothurOut::setFilePosEachLine(string filename, int& nu
  }
  /**************************************************************************************************/
  
-vector<unsigned long int> MothurOut::divideFile(string filename, int& proc) {
+vector<unsigned long long> MothurOut::divideFile(string filename, int& proc) {
         try{
         
-               vector<unsigned long int> filePos;
+               vector<unsigned long long> filePos;
                 filePos.push_back(0);
                 
                 FILE * pFile;
-               unsigned long int size;
+               unsigned long long size;
                 
                 filename = getFullPathName(filename);
                 
@@ -1169,7 +1198,7 @@ vector<unsigned long int> MothurOut::divideFile(string filename, int& proc) {
                 }
         
                 //estimate file breaks
-               unsigned long int chunkSize = 0;
+               unsigned long long chunkSize = 0;
                 chunkSize = size / proc;
  
                 //file to small to divide by processors
@@ -1177,21 +1206,21 @@ vector<unsigned long int> MothurOut::divideFile(string filename, int& proc) {
         
                 //for each process seekg to closest file break and search for next '>' char. make that the filebreak
                 for (int i = 0; i < proc; i++) {
-                       unsigned long int spot = (i+1) * chunkSize;
+                       unsigned long long spot = (i+1) * chunkSize;
                         
                         ifstream in;
                         openInputFile(filename, in);
                         in.seekg(spot);
                         
                         //look for next '>'
-                       unsigned long int newSpot = spot;
+                       unsigned long long newSpot = spot;
                         while (!in.eof()) {
                            char c = in.get();
                            if (c == '>') {   in.putback(c); newSpot = in.tellg(); break;  }
                         }
                 
                         //there was not another sequence before the end of the file
-                       unsigned long int sanityPos = in.tellg();
+                       unsigned long long sanityPos = in.tellg();
  
                         if (sanityPos == -1) {  break;  }
                         else {  filePos.push_back(newSpot);  }
@@ -1220,7 +1249,7 @@ vector<unsigned long int> MothurOut::divideFile(string filename, int& proc) {
  int MothurOut::divideFile(string filename, int& proc, vector<string>& files) {
         try{
                 
-               vector<unsigned long int> filePos = divideFile(filename, proc);
+               vector<unsigned long long> filePos = divideFile(filename, proc);
                 
                 for (int i = 0; i < (filePos.size()-1); i++) {
                         
@@ -1228,7 +1257,7 @@ int MothurOut::divideFile(string filename, int& proc, vector<string>& files) {
                         ifstream in;
                         openInputFile(filename, in);
                         in.seekg(filePos[i]);
-                       unsigned long int size = filePos[(i+1)] - filePos[i];
+                       unsigned long long size = filePos[(i+1)] - filePos[i];
                         char* chunk = new char[size];
                         in.read(chunk, size);
                         in.close();
diff --git a/mothurout.h b/mothurout.h

index 6f04f892dbb588700c884d27071fb613457fb35b..013aa541444227deab43ec80eda7948dfd402bb6 100644 (file)
--- a/mothurout.h
+++ b/mothurout.h
@@ -57,10 +57,10 @@ class MothurOut {
                 
                 //functions from mothur.h
                 //file operations
-               vector<unsigned long int> divideFile(string, int&);
+               vector<unsigned long long> divideFile(string, int&);
                 int divideFile(string, int&, vector<string>&);
-               vector<unsigned long int> setFilePosEachLine(string, int&);
-               vector<unsigned long int> setFilePosFasta(string, int&);
+               vector<unsigned long long> setFilePosEachLine(string, int&);
+               vector<unsigned long long> setFilePosFasta(string, int&);
                 string sortFile(string, string);
                 void appendFiles(string, string);
                 int renameFile(string, string); //oldname, newname
diff --git a/pairwiseseqscommand.cpp b/pairwiseseqscommand.cpp

index ebbf686e20ce080f2889cb91d9e5063c63592579..2245ac9a80a1ccd94b7f13273fb80f4ab6f004b7 100644 (file)
--- a/pairwiseseqscommand.cpp
+++ b/pairwiseseqscommand.cpp
@@ -385,7 +385,7 @@ int PairwiseSeqsCommand::execute(){
                                 
                                         //do your part
                                         string outputMyPart;
-                                       unsigned long int mySize;
+                                       unsigned long long mySize;
                                         
                                         if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
                                         else { driverMPI(start, end, outputFile, mySize, output); }
@@ -403,7 +403,7 @@ int PairwiseSeqsCommand::execute(){
  
                                         //wait on chidren
                                         for(int b = 1; b < processors; b++) { 
-                                               unsigned long int fileSize;
+                                               unsigned long long fileSize;
                                                 
                                                 if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&outMPI);  m->mothurRemove(outputFile);  delete distCalculator;  return 0; }
                                                 
@@ -431,7 +431,7 @@ int PairwiseSeqsCommand::execute(){
                                         MPI_File_close(&outMPI);
                                 }else { //you are a child process
                                         //do your part
-                                       unsigned long int size;
+                                       unsigned long long size;
                                         if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
                                         else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
                                         
@@ -782,7 +782,7 @@ int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, MPI_File& outMPI,
  }
  /**************************************************************************************************/
  /////// need to fix to work with calcs and sequencedb
-int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size){
+int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size){
         try {
                 MPI_Status status;
                 
@@ -858,7 +858,7 @@ int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsi
  }
  /**************************************************************************************************/
  /////// need to fix to work with calcs and sequencedb
-int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long int& size, string square){
+int PairwiseSeqsCommand::driverMPI(int startLine, int endLine, string file, unsigned long long& size, string square){
         try {
                 MPI_Status status;
                 
diff --git a/pairwiseseqscommand.h b/pairwiseseqscommand.h

index afb8e19dc476050139de07964c7697626fef3fd7..a3a91f7bcd1f4ad2bbd52024d36153399ebc1414 100644 (file)
--- a/pairwiseseqscommand.h
+++ b/pairwiseseqscommand.h
@@ -54,8 +54,8 @@ private:
         
         #ifdef USE_MPI 
         int driverMPI(int, int, MPI_File&, float);
-       int driverMPI(int, int, string, unsigned long int&);
-       int driverMPI(int, int, string, unsigned long int&, string);
+       int driverMPI(int, int, string, unsigned long long&);
+       int driverMPI(int, int, string, unsigned long long&, string);
         #endif
         
         string fastaFileName, align, calc, outputDir, output;
diff --git a/phylotree.cpp b/phylotree.cpp

index 891cca424719d7013a8e2984c1a0032612c464db..6ca36361dc55fe6c204017486c98ebeea77e7a9c 100644 (file)
--- a/phylotree.cpp
+++ b/phylotree.cpp
@@ -131,7 +131,7 @@ PhyloTree::PhyloTree(string tfile){
                 
                 #ifdef USE_MPI
                         int pid, num, processors;
-                       vector<unsigned long int> positions;
+                       vector<unsigned long long> positions;
                         
                         MPI_Status status; 
                         MPI_File inMPI;
diff --git a/preclustercommand.cpp b/preclustercommand.cpp

index 3f6e0c975750d7e6bd69845a32e11e7e20b2b68a..67b2f31bebe6185fef4137d9c5e839a35b23f0b8 100644 (file)
--- a/preclustercommand.cpp
+++ b/preclustercommand.cpp
@@ -8,6 +8,8 @@
   */
  
  #include "preclustercommand.h"
+#include "sequenceparser.h"
+#include "deconvolutecommand.h"
  
  //**********************************************************************************************************************
  inline bool comparePriority(seqPNode first, seqPNode second) {  return (first.numIdentical > second.numIdentical); }
@@ -17,7 +19,9 @@ vector<string> PreClusterCommand::setParameters(){
         try {
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
+               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
                 CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs);
+               CommandParameter pbygroup("bygroup", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pbygroup);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                 
@@ -38,7 +42,9 @@ string PreClusterCommand::getHelpString(){
                 helpString += "The pre.cluster command outputs a new fasta and name file.\n";
                 helpString += "The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n";
                 helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
+               helpString += "The group parameter allows you to provide a group file. \n";
                 helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
+               helpString += "The bygroup parameter allows you to indicate you whether you want to cluster sequences only within each group, default=T, when a groupfile is given. \n";
                 helpString += "The pre.cluster command should be in the following format: \n";
                 helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
                 helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n";
@@ -114,6 +120,14 @@ PreClusterCommand::PreClusterCommand(string option) {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                               
+                               it = parameters.find("group");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["group"] = inputDir + it->second;            }
+                               }
                         }
  
                         //check for required parameters
@@ -137,10 +151,25 @@ PreClusterCommand::PreClusterCommand(string option) {
                         namefile = validParameter.validFile(parameters, "name", true);
                         if (namefile == "not found") { namefile =  "";  }
                         else if (namefile == "not open") { abort = true; }      
-                       else {  readNameFile();  m->setNameFile(namefile); }
+                       else {  m->setNameFile(namefile); }
+                       
+                       groupfile = validParameter.validFile(parameters, "group", true);
+                       if (groupfile == "not found") { groupfile =  "";  }
+                       else if (groupfile == "not open") { abort = true; groupfile =  ""; }    
+                       else {   m->setGroupFile(groupfile); }
                         
-                       string temp     = validParameter.validFile(parameters, "diffs", false);                         if(temp == "not found"){        temp = "1"; }
+                       string temp     = validParameter.validFile(parameters, "diffs", false);         if(temp == "not found"){        temp = "1"; }
                         convert(temp, diffs); 
+                       
+                       temp = validParameter.validFile(parameters, "bygroup", false);                  
+                       if (temp == "not found") { 
+                               if (groupfile != "") { temp = "T"; }
+                               else { temp = "F"; }
+                       }
+                       bygroup = m->isTrue(temp); 
+                       
+                       if (bygroup && (groupfile == "")) { m->mothurOut("You cannot set bygroup=T, unless you provide a groupfile."); m->mothurOutEndLine(); abort=true; }
+
                 }
                                 
         }
@@ -158,22 +187,133 @@ int PreClusterCommand::execute(){
                 
                 int start = time(NULL);
                 
-               //reads fasta file and return number of seqs
-               int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
+               string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
+               string newFastaFile = fileroot + "precluster" + m->getExtension(fastafile);
+               string newNamesFile = fileroot + "precluster.names";
                 
-               if (m->control_pressed) { return 0; }
-       
-               if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0;  }
-               if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0;  }
+               if (bygroup) {
+                       //clear out old files
+                       ofstream outFasta; m->openOutputFile(newFastaFile, outFasta); outFasta.close();
+                       ofstream outNames; m->openOutputFile(newNamesFile, outNames);  outNames.close();
+                       
+                       //parse fasta and name file by group
+                       SequenceParser* parser;
+                       if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile);      }
+                       else                            { parser = new SequenceParser(groupfile, fastafile);                    }
+                       
+                       vector<string> groups = parser->getNamesOfGroups();
+                       
+                       //precluster each group
+                       for (int i = 0; i < groups.size(); i++) {
+                               
+                               start = time(NULL);
+                               
+                               if (m->control_pressed) {  delete parser; m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
+                               
+                               m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
+                               
+                               map<string, string> thisNameMap;
+                               if (namefile != "") { thisNameMap = parser->getNameMap(groups[i]); }
+                               vector<Sequence> thisSeqs = parser->getSeqs(groups[i]);
+                               
+                               //fill alignSeqs with this groups info.
+                               int numSeqs = loadSeqs(thisNameMap, thisSeqs);
+                               
+                               if (m->control_pressed) {  delete parser; m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
+                               
+                               if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0;  }
+                               
+                               int count = process();
+                               
+                               if (m->control_pressed) {  delete parser; m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
+
+                               m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
+                               m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
+                               printData(newFastaFile, newNamesFile);
+                               
+                               m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
+                               
+                       }
+                       
+                       //run unique.seqs for deconvolute results
+                       string inputString = "fasta=" + newFastaFile;
+                       if (namefile != "") { inputString += ", name=" + newNamesFile; }
+                       m->mothurOutEndLine(); 
+                       m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+                       m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
+                       
+                       Command* uniqueCommand = new DeconvoluteCommand(inputString);
+                       uniqueCommand->execute();
+                       
+                       map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
+                       
+                       delete uniqueCommand;
+                       
+                       m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+                       
+                       newNamesFile = filenames["name"][0];
+                       newFastaFile = filenames["fasta"][0];
+                       
+               }else {
+                       if (namefile != "") { readNameFile(); }
+               
+                       //reads fasta file and return number of seqs
+                       int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
                 
-               //clear sizes since you only needed this info to build the alignSeqs seqPNode structs
-//             sizes.clear();
+                       if (m->control_pressed) { return 0; }
         
+                       if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0;  }
+                       if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0;  }
+                       
+                       int count = process();
+                       
+                       if (m->control_pressed) { return 0; }   
+                       
+                       m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
+                       m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
+                       printData(newFastaFile, newNamesFile);
+                       
+                       m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
+               }
+               
+               if (m->control_pressed) { m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
+               
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               m->mothurOut(newFastaFile); m->mothurOutEndLine();      outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
+               m->mothurOut(newNamesFile); m->mothurOutEndLine();      outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile);
+               m->mothurOutEndLine();
+               
+               //set fasta file as new current fastafile
+               string current = "";
+               itTypes = outputTypes.find("fasta");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
+               }
+               
+               itTypes = outputTypes.find("name");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
+               }
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PreClusterCommand", "execute");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int PreClusterCommand::process(){
+       try {
+               
                 //sort seqs by number of identical seqs
                 sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
-       
+               
                 int count = 0;
-
+               int numSeqs = alignSeqs.size();
+               
                 //think about running through twice...
                 for (int i = 0; i < numSeqs; i++) {
                         
@@ -195,70 +335,32 @@ int PreClusterCommand::execute(){
                                                         //merge
                                                         alignSeqs[i].names += ',' + alignSeqs[j].names;
                                                         alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
-
+                                                       
                                                         alignSeqs[j].active = 0;
                                                         alignSeqs[j].numIdentical = 0;
                                                         count++;
                                                 }
                                         }//end if j active
                                 }//end if i != j
-                       
-                       //remove from active list 
+                               
+                               //remove from active list 
                                 alignSeqs[i].active = 0;
                                 
                         }//end if active i
                         if(i % 100 == 0)        { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
                 }
                 
-               if(numSeqs % 100 != 0)  { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine();   }
-       
-               
-               string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
-               
-               string newFastaFile = fileroot + "precluster" + m->getExtension(fastafile);
-               string newNamesFile = fileroot + "precluster.names";
-               
-               if (m->control_pressed) { return 0; }
-               
-               m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
-               m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
-               printData(newFastaFile, newNamesFile);
-               
-               m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
-               
-               if (m->control_pressed) { m->mothurRemove(newFastaFile); m->mothurRemove(newNamesFile); return 0; }
-               
-               m->mothurOutEndLine();
-               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
-               m->mothurOut(newFastaFile); m->mothurOutEndLine();      outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
-               m->mothurOut(newNamesFile); m->mothurOutEndLine();      outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile);
-               m->mothurOutEndLine();
-               
-               //set fasta file as new current fastafile
-               string current = "";
-               itTypes = outputTypes.find("fasta");
-               if (itTypes != outputTypes.end()) {
-                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
-               }
-               
-               itTypes = outputTypes.find("name");
-               if (itTypes != outputTypes.end()) {
-                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
-               }
+               if(numSeqs % 100 != 0)  { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine();   }       
                 
-               return 0;
+               return count;
                 
         }
         catch(exception& e) {
-               m->errorOut(e, "PreClusterCommand", "execute");
+               m->errorOut(e, "PreClusterCommand", "process");
                 exit(1);
         }
  }
-
  /**************************************************************************************************/
-
-//this requires the names and fasta file to be in the same order
-
  int PreClusterCommand::readFASTA(){
         try {
                 //ifstream inNames;
@@ -313,7 +415,55 @@ int PreClusterCommand::readFASTA(){
                 exit(1);
         }
  }
-
+/**************************************************************************************************/
+int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs){
+       try {
+               length = 0;
+               alignSeqs.clear();
+               map<string, string>::iterator it;
+               bool error = false;
+                       
+               for (int i = 0; i < thisSeqs.size(); i++) {
+                       
+                       if (m->control_pressed) { return 0; }
+                                               
+                       if (namefile != "") {
+                               it = thisName.find(thisSeqs[i].getName());
+                               
+                               //should never be true since parser checks for this
+                               if (it == thisName.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); error = true; }
+                               else{
+                                       //get number of reps
+                                       int numReps = 0;
+                                       for(int j=0;j<(it->second).length();j++){
+                                               if((it->second)[j] == ','){     numReps++;      }
+                                       }
+                                       
+                                       seqPNode tempNode(numReps, thisSeqs[i], it->second);
+                                       alignSeqs.push_back(tempNode);
+                                       if (thisSeqs[i].getAligned().length() > length) {  length = thisSeqs[i].getAligned().length();  }
+                               }       
+                       }else { //no names file, you are identical to yourself 
+                               seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName());
+                               alignSeqs.push_back(tempNode);
+                               if (thisSeqs[i].getAligned().length() > length) {  length = thisSeqs[i].getAligned().length();  }
+                       }
+               }
+               
+               //sanity check
+               if (error) { m->control_pressed = true; }
+               
+               thisSeqs.clear();
+               
+               return alignSeqs.size();
+       }
+       
+       catch(exception& e) {
+               m->errorOut(e, "PreClusterCommand", "loadSeqs");
+               exit(1);
+       }
+}
+                               
  /**************************************************************************************************/
  
  int PreClusterCommand::calcMisMatches(string seq1, string seq2){
@@ -341,10 +491,14 @@ void PreClusterCommand::printData(string newfasta, string newname){
                 ofstream outFasta;
                 ofstream outNames;
                 
-               m->openOutputFile(newfasta, outFasta);
-               m->openOutputFile(newname, outNames);
+               if (bygroup) {
+                       m->openOutputFileAppend(newfasta, outFasta);
+                       m->openOutputFileAppend(newname, outNames);
+               }else {
+                       m->openOutputFile(newfasta, outFasta);
+                       m->openOutputFile(newname, outNames);
+               }
                                 
-               
                 for (int i = 0; i < alignSeqs.size(); i++) {
                         if (alignSeqs[i].numIdentical != 0) {
                                 alignSeqs[i].seq.printSequence(outFasta); 
diff --git a/preclustercommand.h b/preclustercommand.h

index 3a12413d6f055d0c4f12e0291a442ed4dcc8fb63..38bcd37cf1ffc33a8ee3572c6f24053e1372c1fc 100644 (file)
--- a/preclustercommand.h
+++ b/preclustercommand.h
@@ -47,8 +47,8 @@ public:
         
  private:
         int diffs, length;
-       bool abort;
-       string fastafile, namefile, outputDir;
+       bool abort, bygroup;
+       string fastafile, namefile, outputDir, groupfile;
         vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
         map<string, string> names; //represents the names file first column maps to second column
         map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
@@ -62,6 +62,8 @@ private:
         //int readNamesFASTA();
         int calcMisMatches(string, string);
         void printData(string, string); //fasta filename, names file name
+       int process();
+       int loadSeqs(map<string, string>&, vector<Sequence>&);
  };
  
  /************************************************************/
diff --git a/rarefactcommand.cpp b/rarefactcommand.cpp

index 8d883aa64ae03c4e0619b0669f3b29fafd752435..bc27d6a3fe00b8dafaba29ce05844c6652459a77 100644 (file)
--- a/rarefactcommand.cpp
+++ b/rarefactcommand.cpp
@@ -38,6 +38,7 @@ vector<string> RareFactCommand::setParameters(){
                 CommandParameter pcalc("calc", "Multiple", "sobs-chao-nseqs-coverage-ace-jack-shannon-shannoneven-npshannon-heip-smithwilson-simpson-simpsoneven-invsimpson-bootstrap", "sobs", "", "", "",true,false); parameters.push_back(pcalc);
                 CommandParameter pabund("abund", "Number", "", "10", "", "", "",false,false); parameters.push_back(pabund);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
+               CommandParameter pgroupmode("groupmode", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pgroupmode);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                 
@@ -63,6 +64,7 @@ string RareFactCommand::getHelpString(){
                 helpString += "Example rarefaction.single(label=unique-.01-.03, iters=10000, freq=10, calc=sobs-rchao-race-rjack-rbootstrap-rshannon-rnpshannon-rsimpson).\n";
                 helpString += "The default values for iters is 1000, freq is 100, and calc is rarefaction which calculates the rarefaction curve for the observed richness.\n";
                 validCalculator.printCalc("rarefaction");
+               helpString += "If you are running rarefaction.single with a shared file and would like your results collated in one file, set groupmode=t. (Default=true).\n";
                 helpString += "The label parameter is used to analyze specific labels in your input.\n";
                 helpString += "Note: No spaces between parameter labels (i.e. freq), '=' and parameters (i.e.yourFreq).\n";
                 return helpString;
@@ -262,6 +264,9 @@ RareFactCommand::RareFactCommand(string option)  {
                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
                         m->setProcessors(temp);
                         convert(temp, processors);
+                       
+                       temp = validParameter.validFile(parameters, "groupmode", false);                if (temp == "not found") { temp = "T"; }
+                       groupMode = m->isTrue(temp);
                 }
                 
         }
@@ -440,6 +445,11 @@ int RareFactCommand::execute(){
                 }
                 
                 
+               if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]); } return 0; }
+
+               //create summary file containing all the groups data for each label - this function just combines the info from the files already created.
+               if ((sharedfile != "") && (groupMode)) {   outputNames = createGroupFile(outputNames);  }
+
                 if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       m->mothurRemove(outputNames[i]); } return 0; }
  
                 m->mothurOutEndLine();
@@ -455,6 +465,114 @@ int RareFactCommand::execute(){
         }
  }
  //**********************************************************************************************************************
+vector<string> RareFactCommand::createGroupFile(vector<string>& outputNames) {
+       try {
+               
+               vector<string> newFileNames;
+               
+               //find different types of files
+               map<string, vector<string> > typesFiles;
+               for (int i = 0; i < outputNames.size(); i++) {
+                       string extension = m->getExtension(outputNames[i]);
+                       
+                       ifstream in;
+                       m->openInputFile(outputNames[i], in);
+                       
+                       string labels = m->getline(in);
+                       string newLine = labels.substr(0, labels.find_first_of('\t'));
+                       
+                       newLine += "\tGroup" + labels.substr(labels.find_first_of('\t'));
+                       
+                       typesFiles[extension].push_back(outputNames[i]);
+                       
+                       string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
+                       
+                       //print headers
+                       ofstream out;
+                       m->openOutputFile(combineFileName, out);
+                       out << newLine << endl;
+                       out.close();
+                       
+               }
+               
+               //for each type create a combo file
+               for (map<string, vector<string> >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) {
+                       
+                       ofstream out;
+                       string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first;
+                       m->openOutputFileAppend(combineFileName, out);
+                       newFileNames.push_back(combineFileName);
+                       
+                       vector<string> thisTypesFiles = it->second;
+               
+                       //open each type summary file
+                       map<string, vector<string> > files; //maps file name to lines in file
+                       int maxLines = 0;
+                       for (int i=0; i<thisTypesFiles.size(); i++) {
+                                                               
+                               ifstream temp;
+                               m->openInputFile(thisTypesFiles[i], temp);
+                               
+                               //read through first line - labels
+                               m->getline(temp);       m->gobble(temp);
+                               
+                               vector<string> thisFilesLines;
+                               string group = m->getRootName(thisTypesFiles[i]);
+                               group = group.substr(0, group.length()-1);
+                               group = group.substr(group.find_last_of('.')+1);
+                               
+                               thisFilesLines.push_back(group);
+                               while (!temp.eof()){
+                               
+                                       string thisLine = m->getline(temp);
+                                                                               
+                                       thisFilesLines.push_back(thisLine);
+                                       
+                                       m->gobble(temp);
+                               }
+                               
+                               files[thisTypesFiles[i]] = thisFilesLines;
+                               
+                               //save longest file for below
+                               if (maxLines < thisFilesLines.size()) { maxLines = thisFilesLines.size(); }
+                               
+                               temp.close();
+                               m->mothurRemove(thisTypesFiles[i]);
+                       }
+                       
+                       
+                       //for each label
+                       for (int k = 1; k < maxLines; k++) {
+                               
+                               //grab data for each group
+                               for (int i=0; i<thisTypesFiles.size(); i++) {
+                                       
+                                       string output = "NA\t";
+                                       if (k < files[thisTypesFiles[i]].size()) {
+                                               string line = files[thisTypesFiles[i]][k];
+                                               output = line.substr(0, line.find_first_of('\t'));
+                                               output += '\t' + files[thisTypesFiles[i]][0] + '\t' + line.substr(line.find_first_of('\t'));
+                                       }else{
+                                               output += '\t' + files[thisTypesFiles[i]][0] + '\t';
+                                       }
+                                       out << output << endl;
+                               }
+                       }       
+                       
+                       out.close();
+                       
+               }
+               
+               //return combine file name
+               return newFileNames;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RareFactCommand", "createGroupFile");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
  vector<string> RareFactCommand::parseSharedFile(string filename) {
         try {
                 vector<string> filenames;
diff --git a/rarefactcommand.h b/rarefactcommand.h

index b4bce3e172d141ab5e1fdcea1c59bd35b25dadab..36288927ac2d5bb575378329c9c0a6b66f1db5c4 100644 (file)
--- a/rarefactcommand.h
+++ b/rarefactcommand.h
@@ -42,7 +42,7 @@ private:
         int nIters, abund, processors;
         float freq;
         
-       bool abort, allLines;
+       bool abort, allLines, groupMode;
         set<string> labels; //holds labels to be used
         string label, calc, sharedfile, listfile, rabundfile, sabundfile, format, inputfile;
         vector<string>  Estimators;
@@ -51,6 +51,7 @@ private:
         string outputDir;
         
         vector<string> parseSharedFile(string);
+       vector<string> createGroupFile(vector<string>&);
  };
  
  #endif
diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp

index f751525b6b1fa6211ab6017063ba6c7090f6e484..d5f2a6c20126dc2a6d8d9d15e0e5d7a15c2ef8bc 100644 (file)
--- a/screenseqscommand.cpp
+++ b/screenseqscommand.cpp
@@ -261,7 +261,7 @@ int ScreenSeqsCommand::execute(){
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
                 //if the user want to optimize we need to know the 90% mark
-               vector<unsigned long int> positions;
+               vector<unsigned long long> positions;
                 if (optimize.size() != 0) {  //get summary is paralellized so we need to divideFile, no need to do this step twice so I moved it here
                         //use the namefile to optimize correctly
                         if (namefile != "") { nameMap = m->readNames(namefile); }
@@ -284,7 +284,7 @@ int ScreenSeqsCommand::execute(){
  #ifdef USE_MPI 
                         int pid, numSeqsPerProcessor; 
                         int tag = 2001;
-                       vector<unsigned long int> MPIPos;
+                       vector<unsigned long long> MPIPos;
                         
                         MPI_Status status; 
                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
@@ -629,7 +629,7 @@ int ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
         }
  }
  //***************************************************************************************************************
-int ScreenSeqsCommand::getSummary(vector<unsigned long int>& positions){
+int ScreenSeqsCommand::getSummary(vector<unsigned long long>& positions){
         try {
                 
                 vector<int> startPosition;
@@ -638,7 +638,7 @@ int ScreenSeqsCommand::getSummary(vector<unsigned long int>& positions){
                 vector<int> ambigBases;
                 vector<int> longHomoPolymer;
                 
-               vector<unsigned long int> positions = m->divideFile(fastafile, processors);
+               vector<unsigned long long> positions = m->divideFile(fastafile, processors);
                                 
                 for (int i = 0; i < (positions.size()-1); i++) {
                         lines.push_back(new linePair(positions[i], positions[(i+1)]));
@@ -724,7 +724,7 @@ int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<in
                         }
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = in.tellg();
+                               unsigned long long pos = in.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (in.eof()) { break; }
@@ -1050,7 +1050,7 @@ int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAcc
                         }
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = inFASTA.tellg();
+                               unsigned long long pos = inFASTA.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (inFASTA.eof()) { break; }
@@ -1076,7 +1076,7 @@ int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAcc
  }
  //**********************************************************************************************************************
  #ifdef USE_MPI
-int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long int>& MPIPos, set<string>& badSeqNames){
+int ScreenSeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& goodFile, MPI_File& badAccnosFile, vector<unsigned long long>& MPIPos, set<string>& badSeqNames){
         try {
                 string outputString = "";
                 MPI_Status statusGood; 
diff --git a/screenseqscommand.h b/screenseqscommand.h

index 3642295b54af9d46fd246ad642065952af7295ab..87c8beef870b82a95f79bfddcdacba8ab2a1d47e 100644 (file)
--- a/screenseqscommand.h
+++ b/screenseqscommand.h
@@ -33,9 +33,9 @@ public:
  private:
  
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
  
         vector<int> processIDS;   //processid
@@ -50,7 +50,7 @@ private:
         int createProcesses(string, string, string, set<string>&);
         
         #ifdef USE_MPI
-       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&, set<string>&);
+       int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long long>&, set<string>&);
         #endif
  
         bool abort;
@@ -61,7 +61,7 @@ private:
         map<string, int> nameMap;
         int readNames();
         
-       int getSummary(vector<unsigned long int>&);
+       int getSummary(vector<unsigned long long>&);
         int createProcessesCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string);
         int driverCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string, linePair*);       
  };
diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp

index 5c80047681c15640c5926d771536a42ea60a8c9d..d3f62826dce1d75e03e70c8614de2465ded64302 100644 (file)
--- a/seqerrorcommand.cpp
+++ b/seqerrorcommand.cpp
@@ -273,9 +273,9 @@ int SeqErrorCommand::execute(){
  
                 if(namesFileName != ""){        weights = getWeights(); }
                 
-               vector<unsigned long int> fastaFilePos;
-               vector<unsigned long int> qFilePos;
-               vector<unsigned long int> reportFilePos;
+               vector<unsigned long long> fastaFilePos;
+               vector<unsigned long long> qFilePos;
+               vector<unsigned long long> reportFilePos;
                 
                 setLines(queryFileName, qualFileName, reportFileName, fastaFilePos, qFilePos, reportFilePos);
                 
@@ -731,7 +731,7 @@ int SeqErrorCommand::driver(string filename, string qFileName, string rFileName,
                         index++;
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = queryFile.tellg();
+                               unsigned long long pos = queryFile.tellg();
                                 if ((pos == -1) || (pos >= line.end)) { break; }
                         #else
                                 if (queryFile.eof()) { break; }
@@ -1204,7 +1204,7 @@ void SeqErrorCommand::printQualityFR(vector<vector<int> > qualForwardMap, vector
  }
  /**************************************************************************************************/
  
-int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos, vector<unsigned long int>& rfileFilePos) {
+int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos, vector<unsigned long long>& rfileFilePos) {
         try {
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                 //set file positions for fasta file
@@ -1246,7 +1246,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam
                                         map<string, int>::iterator it = firstSeqNames.find(sname);
                                         
                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
-                                               unsigned long int pos = inQual.tellg(); 
+                                               unsigned long long pos = inQual.tellg(); 
                                                 qfileFilePos.push_back(pos - input.length() - 1);       
                                                 firstSeqNames.erase(it);
                                         }
@@ -1267,7 +1267,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam
                 
                 //get last file position of qfile
                 FILE * pFile;
-               unsigned long int size;
+               unsigned long long size;
                 
                 //get num bytes in file
                 pFile = fopen (qfilename.c_str(),"rb");
@@ -1305,7 +1305,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam
                                 map<string, int>::iterator it = firstSeqNamesReport.find(sname);
                         
                                 if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk
-                                       unsigned long int pos = inR.tellg(); 
+                                       unsigned long long pos = inR.tellg(); 
                                         rfileFilePos.push_back(pos - input.length() - 1);       
                                         firstSeqNamesReport.erase(it);
                                 }
@@ -1326,7 +1326,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam
                 
                 //get last file position of qfile
                 FILE * rFile;
-               unsigned long int sizeR;
+               unsigned long long sizeR;
                 
                 //get num bytes in file
                 rFile = fopen (rfilename.c_str(),"rb");
@@ -1346,7 +1346,7 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam
                 fastaFilePos.push_back(0); qfileFilePos.push_back(0);
                 //get last file position of fastafile
                 FILE * pFile;
-               unsigned long int size;
+               unsigned long long size;
                 
                 //get num bytes in file
                 pFile = fopen (filename.c_str(),"rb");
diff --git a/seqerrorcommand.h b/seqerrorcommand.h

index 590c554b25ec761a42ecc9593d3b6376b725f15e..cc904ec7f246aed91970b41cf52e39b7d32b896b 100644 (file)
--- a/seqerrorcommand.h
+++ b/seqerrorcommand.h
@@ -62,9 +62,9 @@ private:
         ReferenceDB* rdb;
         
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
         
         vector<int> processIDS;   //processid
@@ -82,7 +82,7 @@ private:
         void printErrorQuality(map<char, vector<int> >);
         void printQualityFR(vector<vector<int> >, vector<vector<int> >);
         
-       int setLines(string, string, string, vector<unsigned long int>&, vector<unsigned long int>&, vector<unsigned long int>&);
+       int setLines(string, string, string, vector<unsigned long long>&, vector<unsigned long long>&, vector<unsigned long long>&);
         int driver(string, string, string, string, string, string, linePair, linePair, linePair);
         int createProcesses(string, string, string, string, string, string);
  
diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp

index ee8835386e1937bd8bc6d51ab2b0779f05f6a679..be75e4fac4ab60697b0dd84de6eeb275d231a5b3 100644 (file)
--- a/seqsummarycommand.cpp
+++ b/seqsummarycommand.cpp
@@ -170,7 +170,7 @@ int SeqSummaryCommand::execute(){
                                 int tag = 2001;
                                 int startTag = 1; int endTag = 2; int lengthTag = 3; int baseTag = 4; int lhomoTag = 5;
                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
-                               vector<unsigned long int> MPIPos;
+                               vector<unsigned long long> MPIPos;
                                 
                                 MPI_Status status; 
                                 MPI_Status statusOut;
@@ -281,7 +281,7 @@ int SeqSummaryCommand::execute(){
                                 
                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
  #else
-                       vector<unsigned long int> positions; 
+                       vector<unsigned long long> positions; 
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                                 positions = m->divideFile(fastafile, processors);
                                 for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(new linePair(positions[i], positions[(i+1)]));  }
@@ -332,14 +332,14 @@ int SeqSummaryCommand::execute(){
                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
                 
                 m->mothurOutEndLine();
-               m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer"); m->mothurOutEndLine();
-               m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); m->mothurOutEndLine();
-               m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25])); m->mothurOutEndLine();
-               m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25])); m->mothurOutEndLine();
-               m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50])); m->mothurOutEndLine();
-               m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75])); m->mothurOutEndLine();
-               m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5])); m->mothurOutEndLine();
-               m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100])); m->mothurOutEndLine();
+               m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer\tNumSeqs"); m->mothurOutEndLine();
+               m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0]) + "\t" + toString(1)); m->mothurOutEndLine();
+               m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25]) + "\t" + toString(ptile0_25+1)); m->mothurOutEndLine();
+               m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25]) + "\t" + toString(ptile25+1)); m->mothurOutEndLine();
+               m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50]) + "\t" + toString(ptile50+1)); m->mothurOutEndLine();
+               m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75]) + "\t" + toString(ptile75+1)); m->mothurOutEndLine();
+               m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5]) + "\t" + toString(ptile97_5+1)); m->mothurOutEndLine();
+               m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine();
                 if (namefile == "") {  m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); }
                 else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); }
                 
@@ -415,7 +415,7 @@ int SeqSummaryCommand::driverCreateSummary(vector<int>& startPosition, vector<in
                         }
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = in.tellg();
+                               unsigned long long pos = in.tellg();
                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
                         #else
                                 if (in.eof()) { break; }
@@ -438,7 +438,7 @@ int SeqSummaryCommand::driverCreateSummary(vector<int>& startPosition, vector<in
  }
  #ifdef USE_MPI
  /**************************************************************************************/
-int SeqSummaryCommand::MPICreateSummary(int start, int num, vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long int>& MPIPos) {       
+int SeqSummaryCommand::MPICreateSummary(int start, int num, vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long long>& MPIPos) {      
         try {
                 
                 int pid;
diff --git a/seqsummarycommand.h b/seqsummarycommand.h

index a7ccb573490135c10c6fdb7258bb46228047d98b..32947cac88b546e386e7e9960715364e384db3b3 100644 (file)
--- a/seqsummarycommand.h
+++ b/seqsummarycommand.h
@@ -39,9 +39,9 @@ private:
         map<string, int> nameMap;
         
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
  
         vector<linePair*> lines;
@@ -51,7 +51,7 @@ private:
         int driverCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string, string, linePair*);       
  
         #ifdef USE_MPI
-       int MPICreateSummary(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, MPI_File&, MPI_File&, vector<unsigned long int>&); 
+       int MPICreateSummary(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, MPI_File&, MPI_File&, vector<unsigned long long>&);        
         #endif
  
  
@@ -69,8 +69,8 @@ typedef struct seqSumData {
         vector<int>* longHomoPolymer; 
         string filename; 
         string sumFile; 
-       unsigned long int start;
-       unsigned long int end;
+       unsigned long long start;
+       unsigned long long end;
         int count;
         MothurOut* m;
         string namefile;
@@ -78,7 +78,7 @@ typedef struct seqSumData {
         
         
         seqSumData(){}
-       seqSumData(vector<int>* s, vector<int>* e, vector<int>* l, vector<int>* a, vector<int>* h, string f, string sf, MothurOut* mout, unsigned long int st, unsigned long int en, string na, map<string, int> nam) {
+       seqSumData(vector<int>* s, vector<int>* e, vector<int>* l, vector<int>* a, vector<int>* h, string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, string na, map<string, int> nam) {
                 startPosition = s;
                 endPosition = e;
                 seqLength = l;
diff --git a/sequenceparser.cpp b/sequenceparser.cpp

new file mode 100644 (file)

index 0000000..44012d8
--- /dev/null
+++ b/sequenceparser.cpp
@@ -0,0 +1,273 @@
+/*
+ *  sequenceParser.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 9/9/11.
+ *  Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "sequenceParser.h"
+
+
+/************************************************************/
+SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) {
+       try {
+               
+               m = MothurOut::getInstance();
+               int error;
+               
+               //read group file
+               groupMap = new GroupMap(groupFile);
+               error = groupMap->readMap();
+               
+               if (error == 1) { m->control_pressed = true; }
+               
+               //initialize maps
+               vector<string> namesOfGroups = groupMap->getNamesOfGroups();
+               for (int i = 0; i < namesOfGroups.size(); i++) {
+                       vector<Sequence> temp;
+                       map<string, string> tempMap;
+                       seqs[namesOfGroups[i]] = temp;
+                       nameMapPerGroup[namesOfGroups[i]] = tempMap;
+               }
+               
+               //read fasta file making sure each sequence is in the group file
+               ifstream in;
+               m->openInputFile(fastaFile, in);
+               
+               map<string, string> seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file
+               while (!in.eof()) {
+                       
+                       if (m->control_pressed) { break; }
+                       
+                       Sequence seq(in); m->gobble(in);
+                       
+                       if (seq.getName() != "") {
+                               
+                                string group = groupMap->getGroup(seq.getName());
+                                if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
+                                else { 
+                                        seqs[group].push_back(seq);    
+                                        seqName[seq.getName()] = seq.getAligned();
+                                }
+                       }
+               }
+               in.close();
+                                
+               if (error == 1) { m->control_pressed = true; }
+                                
+               //read name file
+               ifstream inName;
+               m->openInputFile(nameFile, inName);
+               
+               string first, second;
+               int countName = 0;
+               while(!inName.eof()) {
+                       
+                       if (m->control_pressed) { break; }
+                       
+                       inName >> first; m->gobble(inName);
+                       inName >> second; m->gobble(inName);
+                       
+                       vector<string> names;
+                       m->splitAtChar(second, names, ',');
+                       
+                       //get aligned string for these seqs from the fasta file
+                       string alignedString = "";
+                       map<string, string>::iterator itAligned = seqName.find(names[0]);
+                       if (itAligned == seqName.end()) {
+                               error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine();
+                       }else {
+                               alignedString = itAligned->second;
+                       }
+                       
+                       //separate by group - parse one line in name file
+                       map<string, string> splitMap; //group -> name1,name2,...
+                       map<string, string>::iterator it;
+                       for (int i = 0; i < names.size(); i++) {
+                               
+                               string group = groupMap->getGroup(names[i]);
+                               if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
+                               else {  
+                                       
+                                       it = splitMap.find(group);
+                                       if (it != splitMap.end()) { //adding seqs to this group
+                                               (it->second) += "," + names[i];
+                                               countName++;
+                                       }else { //first sighting of this group
+                                               splitMap[group] = names[i];
+                                               countName++;
+                                               
+                                               //is this seq in the fasta file?
+                                               if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match
+                                                       Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file.
+                                                       seqs[group].push_back(tempSeq);
+                                               }
+                                       }
+                               }
+                       }
+                       
+                       
+                       //fill nameMapPerGroup - holds all lines in namefile separated by group
+                       for (it = splitMap.begin(); it != splitMap.end(); it++) {
+                               //grab first name
+                               string firstName = "";
+                               for(int i = 0; i < (it->second).length(); i++) {
+                                       if (((it->second)[i]) != ',') {
+                                               firstName += ((it->second)[i]);
+                                       }else { break; }
+                               }
+                               
+                               //group1 -> seq1 -> seq1,seq2,seq3
+                               nameMapPerGroup[it->first][firstName] = it->second;
+                       }
+               }
+               
+               inName.close();
+               
+               if (error == 1) { m->control_pressed = true; }
+               
+               if (countName != (groupMap->getNumSeqs())) {
+                       m->mothurOutEndLine();
+                       m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
+                       m->mothurOutEndLine();
+                       m->control_pressed = true;
+               }
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceParser", "SequenceParser");
+               exit(1);
+       }
+}
+/************************************************************/
+SequenceParser::SequenceParser(string groupFile, string fastaFile) {
+       try {
+               
+               m = MothurOut::getInstance();
+               int error;
+               
+               //read group file
+               groupMap = new GroupMap(groupFile);
+               error = groupMap->readMap();
+               
+               if (error == 1) { m->control_pressed = true; }
+               
+               //initialize maps
+               vector<string> namesOfGroups = groupMap->getNamesOfGroups();
+               for (int i = 0; i < namesOfGroups.size(); i++) {
+                       vector<Sequence> temp;
+                       seqs[namesOfGroups[i]] = temp;
+               }
+               
+               //read fasta file making sure each sequence is in the group file
+               ifstream in;
+               m->openInputFile(fastaFile, in);
+               int count = 0;
+               
+               while (!in.eof()) {
+                       
+                       if (m->control_pressed) { break; }
+                       
+                       Sequence seq(in); m->gobble(in);
+                       
+                       if (seq.getName() != "") {
+                               
+                               string group = groupMap->getGroup(seq.getName());
+                               if (group == "not found") {  error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine();  }
+                               else {  seqs[group].push_back(seq);     count++; }
+                       }
+               }
+               in.close();
+               
+               if (error == 1) { m->control_pressed = true; }
+               
+               if (count != (groupMap->getNumSeqs())) {
+                       m->mothurOutEndLine();
+                       m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct.");
+                       if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); }
+                       m->mothurOutEndLine();
+                       m->control_pressed = true;
+               }
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceParser", "SequenceParser");
+               exit(1);
+       }
+}
+/************************************************************/
+SequenceParser::~SequenceParser(){ delete groupMap; }
+/************************************************************/
+int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); }
+/************************************************************/
+vector<string> SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); }
+/************************************************************/
+bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); }
+/************************************************************/
+string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); }
+/************************************************************/
+int SequenceParser::getNumSeqs(string g){ 
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               int num = 0;
+               
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine();
+               }else {
+                       num = (it->second).size();
+               }
+               
+               return num; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceParser", "getNumSeqs");
+               exit(1);
+       }
+}
+/************************************************************/
+vector<Sequence> SequenceParser::getSeqs(string g){ 
+       try {
+               map<string, vector<Sequence> >::iterator it;
+               vector<Sequence> seqForThisGroup;
+               
+               it = seqs.find(g);
+               if(it == seqs.end()) {
+                       m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       seqForThisGroup = it->second;
+               }
+               
+               return seqForThisGroup; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceParser", "getSeqs");
+               exit(1);
+       }
+}
+/************************************************************/
+map<string, string> SequenceParser::getNameMap(string g){ 
+       try {
+               map<string, map<string, string> >::iterator it;
+               map<string, string> nameMapForThisGroup;
+               
+               it = nameMapPerGroup.find(g);
+               if(it == nameMapPerGroup.end()) {
+                       m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine();
+               }else {
+                       nameMapForThisGroup = it->second;
+               }
+               
+               return nameMapForThisGroup; 
+       }
+       catch(exception& e) {
+               m->errorOut(e, "SequenceParser", "getNameMap");
+               exit(1);
+       }
+}
+/************************************************************/
+
+
+
diff --git a/sequenceparser.h b/sequenceparser.h

new file mode 100644 (file)

index 0000000..fa838f0
--- /dev/null
+++ b/sequenceparser.h
@@ -0,0 +1,56 @@
+#ifndef SEQUENCEPARSER_H
+#define SEQUENCEPARSER_H
+
+/*
+ *  sequenceParser.h
+ *  Mothur
+ *
+ *  Created by westcott on 9/9/11.
+ *  Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "mothur.h"
+#include "mothurout.h"
+#include "sequence.hpp"
+#include "groupmap.h"
+
+/* This class reads a fasta and group file with a namesfile as optional and parses the data by group. 
+ 
+       Note: The sum of all the groups unique sequences will be larger than the original number of unique sequences. 
+       This is because when we parse the name file we make a unique for each group instead of 1 unique for all
+       groups. 
+ 
+ */
+
+class SequenceParser {
+       
+       public:
+       
+               SequenceParser(string, string);                 //group, fasta - file mismatches will set m->control_pressed = true
+               SequenceParser(string, string, string); //group, fasta, name  - file mismatches will set m->control_pressed = true
+               ~SequenceParser();
+               
+               //general operations
+               int getNumGroups();
+               vector<string> getNamesOfGroups();      
+               bool isValidGroup(string);  //return true if string is a valid group
+               string getGroup(string);        //returns group of a specific sequence
+               
+               int getNumSeqs(string);         //returns the number of unique sequences in a specific group
+               vector<Sequence> getSeqs(string); //returns unique sequences in a specific group
+               map<string, string> getNameMap(string); //returns seqName -> namesOfRedundantSeqs separated by commas for a specific group - the name file format, but each line is parsed by group.
+               
+       private:
+       
+               GroupMap* groupMap;
+               MothurOut* m;
+       
+               int numSeqs;
+               map<string, vector<Sequence> > seqs; //a vector for each group
+               map<string, map<string, string> > nameMapPerGroup; //nameMap for each group
+};
+
+#endif
+
diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp

index 341dd18fd442923df7751b85801714b00cf7a0d0..4965cfd75e06c9f8ba8157056922514581e3e10e 100644 (file)
--- a/sffinfocommand.cpp
+++ b/sffinfocommand.cpp
@@ -466,7 +466,7 @@ int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){
                         //read offset
                         char buffer2 [8];
                         in.read(buffer2, 8);
-                       header.indexOffset =  be_int8(*(unsigned long int *)(&buffer2));
+                       header.indexOffset =  be_int8(*(unsigned long long *)(&buffer2));
                         
                         //read index length
                         char buffer3 [4];
@@ -513,8 +513,8 @@ int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){
                         delete[] tempBuffer2;
                                 
                         /* Pad to 8 chars */
-                       unsigned long int spotInFile = in.tellg();
-                       unsigned long int spot = (spotInFile + 7)& ~7;  // ~ inverts
+                       unsigned long long spotInFile = in.tellg();
+                       unsigned long long spot = (spotInFile + 7)& ~7;  // ~ inverts
                         in.seekg(spot);
                         
                 }else{
@@ -581,8 +581,8 @@ int SffInfoCommand::readHeader(ifstream& in, Header& header){
                         decodeName(header.timestamp, header.region, header.xy, header.name);
                         
                         /* Pad to 8 chars */
-                       unsigned long int spotInFile = in.tellg();
-                       unsigned long int spot = (spotInFile + 7)& ~7;
+                       unsigned long long spotInFile = in.tellg();
+                       unsigned long long spot = (spotInFile + 7)& ~7;
                         in.seekg(spot);
                         
                 }else{
@@ -634,8 +634,8 @@ int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, i
                         }
         
                         /* Pad to 8 chars */
-                       unsigned long int spotInFile = in.tellg();
-                       unsigned long int spot = (spotInFile + 7)& ~7;
+                       unsigned long long spotInFile = in.tellg();
+                       unsigned long long spot = (spotInFile + 7)& ~7;
                         in.seekg(spot);
                         
                 }else{
diff --git a/sffinfocommand.h b/sffinfocommand.h

index f18a3f5568167c031adabd5607cf2c07c2f651f9..903a589cd777b89b7e508e48d1173d550210f864 100644 (file)
--- a/sffinfocommand.h
+++ b/sffinfocommand.h
@@ -17,7 +17,7 @@
  struct CommonHeader {
         unsigned int magicNumber;
         string version;
-       unsigned long int indexOffset;
+       unsigned long long indexOffset;
         unsigned int indexLength;
         unsigned int numReads;
         unsigned short headerLength;
diff --git a/sharedrabundvector.cpp b/sharedrabundvector.cpp

index 56b7d094b1444a2afada95807a36b52e235c70e8..51e277a659ce33789482ea837db6c7a6d0075b25 100644 (file)
--- a/sharedrabundvector.cpp
+++ b/sharedrabundvector.cpp
@@ -395,7 +395,7 @@ int SharedRAbundVector::getGroupIndex()  { return index; }
  void SharedRAbundVector::setGroupIndex(int vIndex)     { index = vIndex; }
  /***********************************************************************/
  int SharedRAbundVector::getNumBins(){
-       return numBins;
+               return numBins;
  }
  
  /***********************************************************************/
@@ -423,6 +423,7 @@ vector<SharedRAbundVector*> SharedRAbundVector::getSharedRAbundVectors(){
                 vector<string> Groups = m->getGroups();
                 vector<string> allGroups = m->getAllGroups();
                 util->setGroups(Groups, allGroups);
+               m->setGroups(Groups);
                 
                 bool remove = false;
                 for (int i = 0; i < lookup.size(); i++) {
diff --git a/shhhercommand.cpp b/shhhercommand.cpp

index 5017f88bdfef592fd3996302ac0fbe9058088a49..a40d7748cfd8bd915924f16c204f529feb2d0f74 100644 (file)
--- a/shhhercommand.cpp
+++ b/shhhercommand.cpp
@@ -18,14 +18,6 @@
  #include "sparsematrix.hpp"
  #include <cfloat>
  
-//**********************************************************************************************************************
-
-#define NUMBINS 1000
-#define HOMOPS 10
-#define MIN_COUNT 0.1
-#define MIN_WEIGHT 0.1
-#define MIN_TAU 0.0001
-#define MIN_ITER 10
  //**********************************************************************************************************************
  vector<string> ShhherCommand::setParameters(){ 
         try {
@@ -310,8 +302,8 @@ int ShhherCommand::execute(){
                         processors = ncpus;
                         
                         m->mothurOut("\nGetting preliminary data...\n");
-                       getSingleLookUp();
-                       getJointLookUp();
+                       getSingleLookUp();      if (m->control_pressed) { return 0; }
+                       getJointLookUp();       if (m->control_pressed) { return 0; }
                         
                         vector<string> flowFileVector;
                         if(flowFilesFileName != "not found"){
@@ -320,7 +312,7 @@ int ShhherCommand::execute(){
                                 ifstream flowFilesFile;
                                 m->openInputFile(flowFilesFileName, flowFilesFile);
                                 while(flowFilesFile){
-                                       flowFilesFile >> fName;
+                                       fName = m->getline(flowFilesFile);
                                         flowFileVector.push_back(fName);
                                         m->gobble(flowFilesFile);
                                 }
@@ -335,17 +327,24 @@ int ShhherCommand::execute(){
                         }
                         
                         for(int i=0;i<numFiles;i++){
+                               
+                               if (m->control_pressed) { break; }
+                               
                                 double begClock = clock();
-                               unsigned long int begTime = time(NULL);
+                               unsigned long long begTime = time(NULL);
  
                                 flowFileName = flowFileVector[i];
                                 
                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
                                 m->mothurOut("Reading flowgrams...\n");
                                 getFlowData();
+                               
+                               if (m->control_pressed) { break; }
  
                                 m->mothurOut("Identifying unique flowgrams...\n");
                                 getUniques();
+                               
+                               if (m->control_pressed) { break; }
  
                                 m->mothurOut("Calculating distances between flowgrams...\n");
                                 char fileName[1024];
@@ -368,6 +367,8 @@ int ShhherCommand::execute(){
                                                         
                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
                                 
+                               if (m->control_pressed) { break; }
+                               
                                 int done;
                                 for(int i=1;i<ncpus;i++){
                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
@@ -378,17 +379,25 @@ int ShhherCommand::execute(){
  
                                 string namesFileName = createNamesFile();
                                 
+                               if (m->control_pressed) { break; }
+                               
                                 m->mothurOut("\nClustering flowgrams...\n");
                                 string listFileName = cluster(distFileName, namesFileName);
-
+                               
+                               if (m->control_pressed) { break; }
+                               
                                 getOTUData(listFileName);
  
                                 m->mothurRemove(distFileName);
                                 m->mothurRemove(namesFileName);
                                 m->mothurRemove(listFileName);
                                 
+                               if (m->control_pressed) { break; }
+                               
                                 initPyroCluster();
  
+                               if (m->control_pressed) { break; }
+                               
                                 for(int i=1;i<ncpus;i++){
                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
@@ -396,6 +405,7 @@ int ShhherCommand::execute(){
                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
                                 }
                                 
+                               if (m->control_pressed) { break; }
                                 
                                 double maxDelta = 0;
                                 int iter = 0;
@@ -406,10 +416,12 @@ int ShhherCommand::execute(){
                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
                                 
                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
-
+                                       
                                         double cycClock = clock();
-                                       unsigned long int cycTime = time(NULL);
+                                       unsigned long long cycTime = time(NULL);
                                         fill();
+                                       
+                                       if (m->control_pressed) { break; }
  
                                         int total = singleTau.size();
                                         for(int i=1;i<ncpus;i++){
@@ -442,9 +454,9 @@ int ShhherCommand::execute(){
                                                 }
                                         }
                                                                         
-                                       maxDelta = getNewWeights();
-                                       double nLL = getLikelihood();
-                                       checkCentroids();
+                                       maxDelta = getNewWeights(); if (m->control_pressed) { break; }
+                                       double nLL = getLikelihood(); if (m->control_pressed) { break; }
+                                       checkCentroids(); if (m->control_pressed) { break; }
                                         
                                         for(int i=1;i<ncpus;i++){
                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
@@ -521,19 +533,26 @@ int ShhherCommand::execute(){
                                         
                                 }       
                                 
+                               if (m->control_pressed) { break; }
+                               
                                 m->mothurOut("\nFinalizing...\n");
                                 fill();
+                               
+                               if (m->control_pressed) { break; }
+                               
                                 setOTUs();
                                 
                                 vector<int> otuCounts(numOTUs, 0);
                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
                                 calcCentroidsDriver(0, numOTUs);
                                 
-                               writeQualities(otuCounts);
-                               writeSequences(otuCounts);
-                               writeNames(otuCounts);
-                               writeClusters(otuCounts);
-                               writeGroups();
+                               if (m->control_pressed) { break; }
+                               
+                               writeQualities(otuCounts);      if (m->control_pressed) { break; }
+                               writeSequences(otuCounts);      if (m->control_pressed) { break; }
+                               writeNames(otuCounts);          if (m->control_pressed) { break; }
+                               writeClusters(otuCounts);       if (m->control_pressed) { break; }
+                               writeGroups();                          if (m->control_pressed) { break; }
                                 
                                                                  
                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
@@ -549,6 +568,9 @@ int ShhherCommand::execute(){
                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
  
                         for(int i=0;i<numFiles;i++){
+                               
+                               if (m->control_pressed) { break; }
+                               
                                 //Now into the pyrodist part
                                 bool live = 1;
  
@@ -578,7 +600,9 @@ int ShhherCommand::execute(){
                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
                                 
                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
-
+                               
+                               if (m->control_pressed) { break; }
+                               
                                 int done = 1;
                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
  
@@ -608,6 +632,8 @@ int ShhherCommand::execute(){
  
                                 while(live){
                                         
+                                       if (m->control_pressed) { break; }
+                                       
                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
                                         singleTau.assign(total, 0.0000);
                                         seqNumber.assign(total, 0);
@@ -643,7 +669,10 @@ int ShhherCommand::execute(){
                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
                                 }
                         }
-               }               
+               }
+               
+               if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+               
                 MPI_Barrier(MPI_COMM_WORLD);
  
                 
@@ -680,6 +709,9 @@ string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
                 double begClock = clock();
                 
                 for(int i=startSeq;i<stopSeq;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         for(int j=0;j<i;j++){
                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
                                 
@@ -695,10 +727,12 @@ string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
                         }
                 }
                 
-               m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
-               
                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
+               
+               if (m->control_pressed) { return fDistFileName; }
+               
+               m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
  
                 ofstream distFile(fDistFileName.c_str());
                 distFile << outStream.str();            
@@ -719,9 +753,10 @@ int ShhherCommand::execute(){
         try {
                 if (abort == true) { return 0; }
                 
-               getSingleLookUp();
-               getJointLookUp();
+               getSingleLookUp();      if (m->control_pressed) { return 0; }
+               getJointLookUp();       if (m->control_pressed) { return 0; }
                                 
+               
                 vector<string> flowFileVector;
                 if(flowFilesFileName != "not found"){
                         string fName;
@@ -729,7 +764,7 @@ int ShhherCommand::execute(){
                         ifstream flowFilesFile;
                         m->openInputFile(flowFilesFileName, flowFilesFile);
                         while(flowFilesFile){
-                               flowFilesFile >> fName;
+                               fName = m->getline(flowFilesFile);
                                 flowFileVector.push_back(fName);
                                 m->gobble(flowFilesFile);
                         }
@@ -741,76 +776,111 @@ int ShhherCommand::execute(){
                 
                 
                 for(int i=0;i<numFiles;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         flowFileName = flowFileVector[i];
  
                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
                         m->mothurOut("Reading flowgrams...\n");
                         getFlowData();
                         
+                       if (m->control_pressed) { break; }
+                       
                         m->mothurOut("Identifying unique flowgrams...\n");
                         getUniques();
                         
+                       if (m->control_pressed) { break; }
                         
                         m->mothurOut("Calculating distances between flowgrams...\n");
                         string distFileName = createDistFile(processors);
                         string namesFileName = createNamesFile();
-                               
+                       
+                       if (m->control_pressed) { break; }
+                       
                         m->mothurOut("\nClustering flowgrams...\n");
                         string listFileName = cluster(distFileName, namesFileName);
+                       
+                       if (m->control_pressed) { break; }
+                       
                         getOTUData(listFileName);
                         
+                       if (m->control_pressed) { break; }
+                       
                         m->mothurRemove(distFileName);
                         m->mothurRemove(namesFileName);
                         m->mothurRemove(listFileName);
                         
                         initPyroCluster();
                         
+                       if (m->control_pressed) { break; }
+                       
                         double maxDelta = 0;
                         int iter = 0;
                         
                         double begClock = clock();
-                       unsigned long int begTime = time(NULL);
+                       unsigned long long begTime = time(NULL);
  
                         m->mothurOut("\nDenoising flowgrams...\n");
                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
                         
                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
                                 
+                               if (m->control_pressed) { break; }
+                               
                                 double cycClock = clock();
-                               unsigned long int cycTime = time(NULL);
+                               unsigned long long cycTime = time(NULL);
                                 fill();
                                 
+                               if (m->control_pressed) { break; }
+
                                 calcCentroids();
                                 
-                               maxDelta = getNewWeights();
-                               double nLL = getLikelihood();
+                               if (m->control_pressed) { break; }
+
+                               maxDelta = getNewWeights();  if (m->control_pressed) { break; }
+                               double nLL = getLikelihood(); if (m->control_pressed) { break; }
                                 checkCentroids();
                                 
+                               if (m->control_pressed) { break; }
+                               
                                 calcNewDistances();
-
+                               
+                               if (m->control_pressed) { break; }
+                               
                                 iter++;
                                 
                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
  
                         }       
                         
+                       if (m->control_pressed) { break; }
+                       
                         m->mothurOut("\nFinalizing...\n");
                         fill();
+                       
+                       if (m->control_pressed) { break; }
+                       
                         setOTUs();
                         
+                       if (m->control_pressed) { break; }
+                       
                         vector<int> otuCounts(numOTUs, 0);
                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
                         
-                       calcCentroidsDriver(0, numOTUs);
-                       writeQualities(otuCounts);
-                       writeSequences(otuCounts);
-                       writeNames(otuCounts);
-                       writeClusters(otuCounts);
-                       writeGroups();
+                       calcCentroidsDriver(0, numOTUs);        if (m->control_pressed) { break; }
+                       writeQualities(otuCounts);                      if (m->control_pressed) { break; }
+                       writeSequences(otuCounts);                      if (m->control_pressed) { break; }
+                       writeNames(otuCounts);                          if (m->control_pressed) { break; }
+                       writeClusters(otuCounts);                       if (m->control_pressed) { break; }
+                       writeGroups();                                          if (m->control_pressed) { break; }
                         
                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
                 }
                 
+               if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+               
                 if(compositeFASTAFileName != ""){
                         outputNames.push_back(compositeFASTAFileName);
                         outputNames.push_back(compositeNamesFileName);
@@ -850,6 +920,9 @@ void ShhherCommand::getFlowData(){
                 flowFile >> numFlowCells;
                 int index = 0;//pcluster
                 while(!flowFile.eof()){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         flowFile >> seqName >> currentNumFlowCells;
                         lengths.push_back(currentNumFlowCells);
  
@@ -869,6 +942,9 @@ void ShhherCommand::getFlowData(){
                 numSeqs = seqNameVector.size();         
                 
                 for(int i=0;i<numSeqs;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         int iNumFlowCells = i * numFlowCells;
                         for(int j=lengths[i];j<numFlowCells;j++){
                                 flowDataIntI[iNumFlowCells + j] = 0;
@@ -894,6 +970,9 @@ void ShhherCommand::getSingleLookUp(){
                 m->openInputFile(lookupFileName, lookUpFile);
                 
                 for(int i=0;i<HOMOPS;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         float logFracFreq;
                         lookUpFile >> logFracFreq;
                         
@@ -919,6 +998,9 @@ void ShhherCommand::getJointLookUp(){
                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
                 
                 for(int i=0;i<NUMBINS;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         for(int j=0;j<NUMBINS;j++){             
                                 
                                 double minSum = 100000000;
@@ -947,6 +1029,9 @@ double ShhherCommand::getProbIntensity(int intIntensity){
  
                 
                 for(int i=0;i<HOMOPS;i++){//loop signal strength
+                       
+                       if (m->control_pressed) { break; }
+                       
                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
                 }
@@ -975,6 +1060,9 @@ void ShhherCommand::getUniques(){
                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
                 
                 for(int i=0;i<numSeqs;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         int index = 0;
                         
                         vector<short> current(numFlowCells);
@@ -1025,7 +1113,7 @@ void ShhherCommand::getUniques(){
                 uniqueLengths.resize(numUniques);       
                 
                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
-               for(int i=0;i<flowDataPrI.size();i++)   {       flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);             }
+               for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
         }
         catch(exception& e) {
                 m->errorOut(e, "ShhherCommand", "getUniques");
@@ -1046,6 +1134,9 @@ float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
                 float dist = 0;
                 
                 for(int i=0;i<minLength;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
                         
@@ -1078,6 +1169,9 @@ void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int st
                 double begClock = clock();
  
                 for(int i=startSeq;i<stopSeq;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         for(int j=0;j<i;j++){
                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
  
@@ -1094,13 +1188,17 @@ void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int st
                                 m->mothurOutEndLine();
                         }
                 }
-               m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
-               m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
-               m->mothurOutEndLine();
                 
                 ofstream distFile(distFileName.c_str());
                 distFile << outStream.str();            
                 distFile.close();
+               
+               if (m->control_pressed) {}
+               else {
+                       m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
+                       m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
+                       m->mothurOutEndLine();
+               }
         }
         catch(exception& e) {
                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
@@ -1112,31 +1210,37 @@ void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int st
  
  string ShhherCommand::createDistFile(int processors){
         try{
+//////////////////////// until I figure out the shared memory issue //////////////////////             
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+               processors=1;
+#endif
+//////////////////////// until I figure out the shared memory issue //////////////////////             
+               
                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
                                 
-               unsigned long int begTime = time(NULL);
+               unsigned long long begTime = time(NULL);
                 double begClock = clock();
-
-               vector<int> start;
-               vector<int> end;
                 
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               if (numSeqs < processors){      processors = 1; }
+               
                 if(processors == 1)     {       flowDistParentFork(fDistFileName, 0, numUniques);               }
+               
                 else{ //you have multiple processors
                         
-                       if (numSeqs < processors){      processors = 1; }
-                       
                         vector<int> start(processors, 0);
                         vector<int> end(processors, 0);
                         
+                       int process = 1;
+                       vector<int> processIDs;
+                       
                         for (int i = 0; i < processors; i++) {
                                 start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
                                 end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
                         }
                         
-                       int process = 1;
-                       vector<int> processIDs;
-                       
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               
                         //loop through and create all the processes you want
                         while (process != processors) {
                                 int pid = fork();
@@ -1163,6 +1267,43 @@ string ShhherCommand::createDistFile(int processors){
                                 int temp = processIDs[i];
                                 wait(&temp);
                         }
+#else
+                       //////////////////////////////////////////////////////////////////////////////////////////////////////
+                       //Windows version shared memory, so be careful when passing variables through the flowDistParentForkData struct. 
+                       //Above fork() will clone, so memory is separate, but that's not the case with windows, 
+                       //////////////////////////////////////////////////////////////////////////////////////////////////////
+                       
+                       vector<flowDistParentForkData*> pDataArray; 
+                       DWORD   dwThreadIdArray[processors-1];
+                       HANDLE  hThreadArray[processors-1]; 
+                       
+                       //Create processor worker threads.
+                       for(int i = 0; i < processors-1; i++){
+                               // Allocate memory for thread data.
+                               string extension = extension = toString(i) + ".temp"; 
+                               
+                               flowDistParentForkData* tempdist = new flowDistParentForkData((fDistFileName + extension), mapUniqueToSeq, mapSeqToUnique, lengths, flowDataIntI, flowDataPrI, jointLookUp, m, start[i+1], end[i+1], numFlowCells, cutoff, i);
+                               pDataArray.push_back(tempdist);
+                               processIDs.push_back(i);
+                               
+                               //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+                               //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+                               hThreadArray[i] = CreateThread(NULL, 0, MyflowDistParentForkThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
+                       }
+                       
+                       //parent does its part
+                       flowDistParentFork(fDistFileName, start[0], end[0]);
+                       
+                       //Wait until all threads have terminated.
+                       WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+                       
+                       //Close all thread handles and free memory allocations.
+                       for(int i=0; i < pDataArray.size(); i++){
+                               CloseHandle(hThreadArray[i]);
+                               delete pDataArray[i];
+                       }
+                       
+#endif
                         
                         //append and remove temp files
                         for (int i=0;i<processIDs.size();i++) { 
@@ -1172,15 +1313,9 @@ string ShhherCommand::createDistFile(int processors){
                         
                 }
                 
-#else
-               flowDistParentFork(fDistFileName, 0, numUniques);
-#endif
-
                 m->mothurOutEndLine();
-               
                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
                 
-
                 return fDistFileName;
         }
         catch(exception& e) {
@@ -1206,6 +1341,9 @@ string ShhherCommand::createNamesFile(){
                 m->openOutputFile(nameFileName, nameFile);
                 
                 for(int i=0;i<numUniques;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
  //                     nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
                 }
@@ -1244,6 +1382,9 @@ string ShhherCommand::cluster(string distFileName, string namesFileName){
                 
                 double clusterCutoff = cutoff;
                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         cluster->update(clusterCutoff);
                 }
                 
@@ -1288,6 +1429,8 @@ void ShhherCommand::getOTUData(string listFileName){
                 string singleOTU = "";
                 
                 for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) { break; }
  
                         listFile >> singleOTU;
                         
@@ -1358,6 +1501,9 @@ void ShhherCommand::getOTUData(string listFileName){
  
  void ShhherCommand::initPyroCluster(){                          
         try{
+               
+               if (numOTUs < processors) { processors = 1; }
+               
                 dist.assign(numSeqs * numOTUs, 0);
                 change.assign(numOTUs, 1);
                 centroids.assign(numOTUs, -1);
@@ -1387,6 +1533,9 @@ void ShhherCommand::fill(){
         try {
                 int index = 0;
                 for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         cumNumSeqs[i] = index;
                         for(int j=0;j<nSeqsPerOTU[i];j++){
                                 seqNumber[index] = aaP[i][j];
@@ -1408,7 +1557,7 @@ void ShhherCommand::calcCentroids(){
         try{
                 
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
+               
                 if(processors == 1)     {
                         calcCentroidsDriver(0, numOTUs);                
                 }
@@ -1465,9 +1614,10 @@ void ShhherCommand::calcCentroidsDriver(int start, int finish){
         
         try{
                 
-       
                 for(int i=start;i<finish;i++){
                         
+                       if (m->control_pressed) { break; }
+                       
                         double count = 0;
                         int position = 0;
                         int minFlowGram = 100000000;
@@ -1545,7 +1695,7 @@ double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
                 int flowBValue = flow * numFlowCells;
                 
                 double dist = 0;
-               
+
                 for(int i=0;i<length;i++){
                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
                         flowAValue++;
@@ -1569,6 +1719,8 @@ double ShhherCommand::getNewWeights(){
                 
                 for(int i=0;i<numOTUs;i++){
                         
+                       if (m->control_pressed) { break; }
+                       
                         double difference = weight[i];
                         weight[i] = 0;
                         
@@ -1606,6 +1758,9 @@ double ShhherCommand::getLikelihood(){
                 
                 string hold;
                 for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         for(int j=0;j<nSeqsPerOTU[i];j++){
                                 int index = cumNumSeqs[i] + j;
                                 int nI = seqIndex[index];
@@ -1644,6 +1799,9 @@ void ShhherCommand::checkCentroids(){
                 }
                 
                 for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         if(unique[i] == 1){
                                 for(int j=i+1;j<numOTUs;j++){
                                         if(unique[j] == 1){
@@ -1672,7 +1830,7 @@ void ShhherCommand::calcNewDistances(){
         try{
                 
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-               
+
                 if(processors == 1)     {
                         calcNewDistancesParent(0, numSeqs);             
                 }
@@ -1706,7 +1864,7 @@ void ShhherCommand::calcNewDistances(){
                                         exit(0);
                                 }
                         }
-                       
+                               
                         //parent does its part
                         calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
                         int total = seqIndex.size();
@@ -1761,9 +1919,10 @@ void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<i
                 seqIndex.clear();
                 singleTau.clear();
                 
-               
-               
                 for(int i=startSeq;i<stopSeq;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         double offset = 1e8;
                         int indexOffset = i * numOTUs;
                         
@@ -1818,6 +1977,9 @@ void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>
                 child_singleTau.resize(0);
                 
                 for(int i=startSeq;i<stopSeq;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         double offset = 1e8;
                         int indexOffset = i * numOTUs;
                         
@@ -1869,22 +2031,26 @@ void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
                 vector<double> newTau(numOTUs,0);
                 vector<double> norms(numSeqs, 0);
                 nSeqsPerOTU.assign(numOTUs, 0);
-               
+
                 for(int i=startSeq;i<stopSeq;i++){
-                       int indexOffset = i * numOTUs;
                         
+                       if (m->control_pressed) { break; }
+                       
+                       int indexOffset = i * numOTUs;
+
                         double offset = 1e8;
                         
                         for(int j=0;j<numOTUs;j++){
+
                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
                                 }
-                               
+       
                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
                                         offset = dist[indexOffset + j];
                                 }
                         }
-                       
+
                         for(int j=0;j<numOTUs;j++){
                                 if(weight[j] > MIN_WEIGHT){
                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
@@ -1894,11 +2060,11 @@ void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
                                         newTau[j] = 0.0;
                                 }
                         }
-                       
+
                         for(int j=0;j<numOTUs;j++){
                                 newTau[j] /= norms[i];
                         }
-                       
+       
                         for(int j=0;j<numOTUs;j++){
                                 if(newTau[j] > MIN_TAU){
                                         
@@ -1917,7 +2083,9 @@ void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
                                         nSeqsPerOTU[j]++;
                                 }
                         }
+
                 }
+
         }
         catch(exception& e) {
                 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
@@ -1933,6 +2101,9 @@ void ShhherCommand::setOTUs(){
                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
                 
                 for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         for(int j=0;j<nSeqsPerOTU[i];j++){
                                 int index = cumNumSeqs[i] + j;
                                 double tauValue = singleTau[seqNumber[index]];
@@ -1994,6 +2165,9 @@ void ShhherCommand::writeQualities(vector<int> otuCounts){
                 
                 
                 for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         int index = 0;
                         int base = 0;
                         
@@ -2090,6 +2264,9 @@ void ShhherCommand::writeSequences(vector<int> otuCounts){
                 vector<string> names(numOTUs, "");
                 
                 for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         int index = centroids[i];
                         
                         if(otuCounts[i] > 0){
@@ -2131,6 +2308,9 @@ void ShhherCommand::writeNames(vector<int> otuCounts){
                 m->openOutputFile(nameFileName, nameFile);
                 
                 for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
                         if(otuCounts[i] > 0){
                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
                                 
@@ -2165,6 +2345,7 @@ void ShhherCommand::writeGroups(){
                 m->openOutputFile(groupFileName, groupFile);
                 
                 for(int i=0;i<numSeqs;i++){
+                       if (m->control_pressed) { break; }
                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
                 }
                 groupFile.close();
@@ -2188,6 +2369,10 @@ void ShhherCommand::writeClusters(vector<int> otuCounts){
                 string bases = flowOrder;
                 
                 for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) {
+                               break;
+                       }
                         //output the translated version of the centroid sequence for the otu
                         if(otuCounts[i] > 0){
                                 int index = centroids[i];
diff --git a/shhhercommand.h b/shhhercommand.h

index bdc850362b4b08e6c2e14655360e455353e74793..00bd41abef486338da1d33463b0698bd895266a1 100644 (file)
--- a/shhhercommand.h
+++ b/shhhercommand.h
@@ -13,6 +13,15 @@
  #include "mothur.h"
  #include "command.hpp"
  
+//**********************************************************************************************************************
+
+#define NUMBINS 1000
+#define HOMOPS 10
+#define MIN_COUNT 0.1
+#define MIN_WEIGHT 0.1
+#define MIN_TAU 0.0001
+#define MIN_ITER 10
+//**********************************************************************************************************************
  
  class ShhherCommand : public Command {
         
@@ -116,6 +125,125 @@ private:
         
  };
  
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+typedef struct flowDistParentForkData {
+       string distFileName; 
+       vector<int> mapUniqueToSeq;
+       vector<int> mapSeqToUnique;
+       vector<int> lengths;
+       vector<short> flowDataIntI;
+       vector<double> flowDataPrI;
+       vector<double> jointLookUp;
+       MothurOut* m;
+       int threadID, startSeq, stopSeq, numFlowCells;
+       float cutoff;
+       
+       flowDistParentForkData(){}
+       flowDistParentForkData(string d, vector<int> mapU, vector<int> mapS, vector<int> l, vector<short> flowD, vector<double> flowDa, vector<double> j, MothurOut* mout, int st, int sp, int n, float cut, int tid) {
+               distFileName = d;
+               mapUniqueToSeq = mapU;
+               mapSeqToUnique = mapS;
+               lengths = l;
+               flowDataIntI = flowD;
+               flowDataPrI = flowDa;
+               jointLookUp = j;
+               m = mout;
+               startSeq = st;
+               stopSeq = sp;
+               numFlowCells = n;
+               cutoff= cut;
+               threadID = tid;
+       }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyflowDistParentForkThreadFunction(LPVOID lpParam){ 
+       flowDistParentForkData* pDataArray;
+       pDataArray = (flowDistParentForkData*)lpParam;
+       
+       try {
+               ostringstream outStream;
+               outStream.setf(ios::fixed, ios::floatfield);
+               outStream.setf(ios::dec, ios::basefield);
+               outStream.setf(ios::showpoint);
+               outStream.precision(6);
+               
+               int begTime = time(NULL);
+               double begClock = clock();
+               string tempOut = "start and end = " + toString(pDataArray->startSeq) +'\t' + toString(pDataArray->stopSeq) + "-";
+               cout << tempOut << endl;
+       
+               for(int i=pDataArray->startSeq;i<pDataArray->stopSeq;i++){
+                       
+                       if (pDataArray->m->control_pressed) { break; }
+                       cout << "thread i = " << i << endl;
+                       for(int j=0;j<i;j++){
+                               
+                               cout << "thread j = " << j << endl;
+                               float flowDistance = 0.0;
+                               ////////////////// calcPairwiseDist ///////////////////
+                               //needed because this is a static global function that can't see the classes internal functions
+                               
+                               int minLength = pDataArray->lengths[pDataArray->mapSeqToUnique[pDataArray->mapUniqueToSeq[i]]];
+                               if(pDataArray->lengths[pDataArray->mapUniqueToSeq[j]] < minLength){     minLength = pDataArray->lengths[pDataArray->mapSeqToUnique[pDataArray->mapUniqueToSeq[j]]];     }
+                               
+                               int ANumFlowCells = pDataArray->mapUniqueToSeq[i] * pDataArray->numFlowCells;
+                               int BNumFlowCells = pDataArray->mapUniqueToSeq[j] * pDataArray->numFlowCells;
+                               
+                               for(int k=0;k<minLength;k++){
+                                       
+                                       if (pDataArray->m->control_pressed) { break; }
+                                       
+                                       int flowAIntI = pDataArray->flowDataIntI[ANumFlowCells + k];
+                                       float flowAPrI = pDataArray->flowDataPrI[ANumFlowCells + k];
+                                       
+                                       int flowBIntI = pDataArray->flowDataIntI[BNumFlowCells + k];
+                                       float flowBPrI = pDataArray->flowDataPrI[BNumFlowCells + k];
+                                       flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
+                               }
+                               
+                               flowDistance /= (float) minLength;
+                               //cout << flowDistance << endl;
+                               ////////////////// end of calcPairwiseDist ///////////////////
+                                                               
+                               if(flowDistance < 1e-6){
+                                       outStream << pDataArray->mapUniqueToSeq[i] << '\t' << pDataArray->mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
+                               }
+                               else if(flowDistance <= pDataArray->cutoff){
+                                       outStream << pDataArray->mapUniqueToSeq[i] << '\t' << pDataArray->mapUniqueToSeq[j] << '\t' << flowDistance << endl;
+                               }
+                       }
+                       if(i % 100 == 0){
+                               pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
+                               pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
+                               pDataArray->m->mothurOutEndLine();
+                       }
+               }
+               
+               ofstream distFile(pDataArray->distFileName.c_str());
+               distFile << outStream.str();            
+               distFile.close();
+               
+               if (pDataArray->m->control_pressed) {}
+               else {
+                       pDataArray->m->mothurOut(toString(pDataArray->stopSeq-1) + "\t" + toString(time(NULL) - begTime));
+                       pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
+                       pDataArray->m->mothurOutEndLine();
+               }               
+               
+       }
+       catch(exception& e) {
+               pDataArray->m->errorOut(e, "ShhherCommand", "MyflowDistParentForkThreadFunction");
+               exit(1);
+       }
+} 
+#endif
+
  
  #endif
  
diff --git a/simpson.cpp b/simpson.cpp

index bf3365ddeadd942a53733e85cbb1eeb30cdc0bb7..edc35bd6c04f42c3d687dce377df6b6442e36a18 100644 (file)
--- a/simpson.cpp
+++ b/simpson.cpp
@@ -28,13 +28,13 @@ EstOutput Simpson::getValues(SAbundVector* rank){
                 if(sobs != 0){
                         double simnum=0.0000;
                 
-                       for(unsigned long int i=1;i<=maxRank;i++){
+                       for(unsigned long long i=1;i<=maxRank;i++){
                                 simnum += (double)(rank->get(i)*i*(i-1));
                         }
                         
                         simpson = simnum / (sampled*(sampled-1));
                 
-                       for(unsigned long int i=1;i<=maxRank;i++){
+                       for(unsigned long long i=1;i<=maxRank;i++){
                                 double piI = (double) i / (double)sampled;
                                 firstTerm += rank->get(i) * pow(piI, 3);
                                 secondTerm += rank->get(i) * pow(piI, 2);
diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp

index 651ee028426f2713954557c4cdaf44af29112104..28cfc3493e5b2ac6ac790445a4acaa397405b2ad 100644 (file)
--- a/subsamplecommand.cpp
+++ b/subsamplecommand.cpp
@@ -9,6 +9,7 @@
  
  #include "subsamplecommand.h"
  #include "sharedutilities.h"
+#include "deconvolutecommand.h"
  
  //**********************************************************************************************************************
  vector<string> SubSampleCommand::setParameters(){      
@@ -423,60 +424,45 @@ int SubSampleCommand::getSubSampleFasta() {
                 
                 set<string> subset; //dont want repeat sequence names added
                 if (persample) {
-                       for (int i = 0; i < Groups.size(); i++) {
-                               
-                               //randomly select a subset of those names from this group to include in the subsample
-                               for (int j = 0; j < size; j++) {
-                                       
-                                       if (m->control_pressed) { return 0; }
+                       //initialize counts
+                       map<string, int> groupCounts;
+                       for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
+                       
+                       for (int j = 0; j < names.size(); j++) {
                                         
-                                       //get random sequence to add, making sure we have not already added it
-                                       bool done = false;
-                                       int myrand;
-                                       while (!done) {
-                                               myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
-                                               
-                                               if (subset.count(names[myrand]) == 0)  { 
-                                                       
-                                                       string group = groupMap->getGroup(names[myrand]);
-                                                       if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
-                                                               
-                                                       if (group == Groups[i]) {       subset.insert(names[myrand]); break;    }
-                                               }
-                                       }
-                               }
+                               if (m->control_pressed) { return 0; }
+                                                                                               
+                               string group = groupMap->getGroup(names[j]);
+                               if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+                               else{
+                                       if (groupCounts[group] < size) {        subset.insert(names[j]);        }
+                               }                               
                         }
                 }else {
                         
                         //randomly select a subset of those names to include in the subsample
-                       for (int j = 0; j < size; j++) {
+                       //since names was randomly shuffled just grab the next one
+                       for (int j = 0; j < names.size(); j++) {
                                 
                                 if (m->control_pressed) { return 0; }
                                 
-                               //get random sequence to add, making sure we have not already added it
-                               bool done = false;
-                               int myrand;
-                               while (!done) {
-                                       myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
+                               if (groupfile != "") { //if there is a groupfile given fill in group info
+                                       string group = groupMap->getGroup(names[j]);
+                                       if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
                                         
-                                       if (subset.count(names[myrand]) == 0)  { 
-                                               
-                                               if (groupfile != "") { //if there is a groupfile given fill in group info
-                                                       string group = groupMap->getGroup(names[myrand]);
-                                                       if (group == "not found") { m->mothurOut("[ERROR]: " + names[myrand] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
-                                                       
-                                                       if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
-                                                               if (m->inUsersGroups(group, Groups)) {
-                                                                       subset.insert(names[myrand]); break;
-                                                               }
-                                                       }else{
-                                                               subset.insert(names[myrand]); break;
-                                                       }
-                                               }else{ //save everyone, group
-                                                       subset.insert(names[myrand]); break;
-                                               }                                       
+                                       if (pickedGroups) { //if hte user picked groups, we only want to keep the names of sequences from those groups
+                                               if (m->inUsersGroups(group, Groups)) {
+                                                       subset.insert(names[j]); 
+                                               }
+                                       }else{
+                                               subset.insert(names[j]); 
                                         }
-                               }
+                               }else{ //save everyone, group
+                                       subset.insert(names[j]); 
+                               }                                       
+                       
+                               //do we have enough??
+                               if (subset.size() == size) { break; }
                         }       
                 }
                 
@@ -488,7 +474,6 @@ int SubSampleCommand::getSubSampleFasta() {
                 
                 ofstream out;
                 m->openOutputFile(outputFileName, out);
-               outputTypes["fasta"].push_back(outputFileName);  outputNames.push_back(outputFileName);
                 
                 //read through fasta file outputting only the names on the subsample list
                 ifstream in;
@@ -531,6 +516,32 @@ int SubSampleCommand::getSubSampleFasta() {
                         m->mothurOut("[ERROR]: The subset selected contained " + toString(subset.size()) + " sequences, but I only found " + toString(count) + " of those in the fastafile."); m->mothurOutEndLine();
                 }
                 
+               if (namefile != "") {
+                       m->mothurOut("Deconvoluting subsampled fasta file... "); m->mothurOutEndLine();
+                       
+                       //use unique.seqs to create new name and fastafile
+                       string inputString = "fasta=" + outputFileName;
+                       m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+                       m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
+                       
+                       Command* uniqueCommand = new DeconvoluteCommand(inputString);
+                       uniqueCommand->execute();
+                       
+                       map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
+                       
+                       delete uniqueCommand;
+                       
+                       outputTypes["name"].push_back(filenames["name"][0]);  outputNames.push_back(filenames["name"][0]);
+                       m->mothurRemove(outputFileName);
+                       outputFileName = filenames["fasta"][0];
+                       
+                       m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+                       
+                       m->mothurOut("Done."); m->mothurOutEndLine();
+               }
+               
+               outputTypes["fasta"].push_back(outputFileName);  outputNames.push_back(outputFileName);
+               
                 //if a groupfile is provided read through the group file only outputting the names on the subsample list
                 if (groupfile != "") {
                         
@@ -815,9 +826,10 @@ int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup) {
                                         if (m->control_pressed) { delete order; out.close(); return 0; }
                                         
                                         //get random number to sample from order between 0 and thisSize-1.
-                                       int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
+                                       //don't need this because of the random shuffle above
+                                       //int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
                                         
-                                       int bin = order->get(myrand);
+                                       int bin = order->get(j);
                                         
                                         int abund = thislookup[i]->getAbundance(bin);
                                         thislookup[i]->set(bin, (abund+1), thisgroup);
@@ -1011,37 +1023,26 @@ int SubSampleCommand::getSubSampleList() {
                 //randomly select a subset of those names to include in the subsample
                 set<string> subset; //dont want repeat sequence names added
                 if (persample) {
-                       for (int i = 0; i < Groups.size(); i++) {
+                       //initialize counts
+                       map<string, int> groupCounts;
+                       for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
+                       
+                       for (int j = 0; j < names.size(); j++) {
                                 
-                               for (int j = 0; j < size; j++) {
-                                       
-                                       if (m->control_pressed) { break; }
-                                       
-                                       //get random sequence to add, making sure we have not already added it
-                                       bool done = false;
-                                       int myrand;
-                                       while (!done) {
-                                               myrand = int((float)(names.size()) * (float)(rand()) / ((float)RAND_MAX+1.0));
-                                               
-                                               if (subset.count(names[myrand]) == 0) { //you are not already added
-                                                       if (groupMap->getGroup(names[myrand]) == Groups[i])  { subset.insert(names[myrand]); break;     }
-                                               }
-                                       }
-                               }
+                               if (m->control_pressed) { return 0; }
+                               
+                               string group = groupMap->getGroup(names[j]);
+                               if (group == "not found") { m->mothurOut("[ERROR]: " + names[j] + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+                               else{
+                                       if (groupCounts[group] < size) {        subset.insert(names[j]);        }
+                               }                               
                         }
                 }else{
                         for (int j = 0; j < size; j++) {
                                 
                                 if (m->control_pressed) { break; }
                                 
-                               //get random sequence to add, making sure we have not already added it
-                               bool done = false;
-                               int myrand;
-                               while (!done) {
-                                       myrand = int((float)(names.size()) * (float)(rand()) / ((float)RAND_MAX+1.0));
-                                       
-                                       if (subset.count(names[myrand]) == 0)  { subset.insert(names[myrand]); break;   }
-                               }
+                               subset.insert(names[j]); 
                         }       
                 }
                 
@@ -1322,10 +1323,7 @@ int SubSampleCommand::processRabund(RAbundVector*& rabund, ofstream& out) {
                                 
                                 if (m->control_pressed) { delete order; return 0; }
                                 
-                               //get random number to sample from order between 0 and thisSize-1.
-                               int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
-                               
-                               int bin = order->get(myrand);
+                               int bin = order->get(j);
                                 
                                 int abund = rabund->get(bin);
                                 rabund->set(bin, (abund+1));
@@ -1484,10 +1482,7 @@ int SubSampleCommand::processSabund(SAbundVector*& sabund, ofstream& out) {
         
                                 if (m->control_pressed) { delete order; return 0; }
                                 
-                               //get random number to sample from order between 0 and thisSize-1.
-                               int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
-                               
-                               int bin = order->get(myrand);
+                               int bin = order->get(j);
                                 
                                 int abund = rabund->get(bin);
                                 rabund->set(bin, (abund+1));
diff --git a/systemcommand.cpp b/systemcommand.cpp

index d0057e8b4e8c199f9113f54af6a46836ab6f58b4..c07deff1f19c0315b29e0f488572b3feacb7114f 100644 (file)
--- a/systemcommand.cpp
+++ b/systemcommand.cpp
@@ -91,23 +91,31 @@ int SystemCommand::execute(){
                 
                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
                 
-               command += " > ./commandScreen.output 2>&1";
-               system(command.c_str());
+               //if command contains a redirect don't add the redirect
+               bool usedRedirect = false;
+               if ((command.find('>')) == string::npos) {
+                       command += " > ./commandScreen.output 2>&1";
+                       usedRedirect = true;
+               }
                 
-               ifstream in;
-               string filename = "./commandScreen.output";
-               m->openInputFile(filename, in, "no error");
+               system(command.c_str());
                 
-               string output = "";
-               while(char c = in.get()){
-                       if(in.eof())            {       break;                  }
-                       else                            {       output += c;    }
+               if (usedRedirect) {
+                       ifstream in;
+                       string filename = "./commandScreen.output";
+                       m->openInputFile(filename, in, "no error");
+                       
+                       string output = "";
+                       while(char c = in.get()){
+                               if(in.eof())            {       break;                  }
+                               else                            {       output += c;    }
+                       }
+                       in.close();
+                       
+                       m->mothurOut(output); m->mothurOutEndLine();
+                       m->mothurRemove(filename);
                 }
-               in.close();
                 
-               m->mothurOut(output); m->mothurOutEndLine();
-               m->mothurRemove(filename);
-                               
                 return 0;               
         }
  
diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp

index d17e000e9ce3a2d36f6a432f8647e99b7f58549f..6d697359bb2c26c4957faa9d981fb7399ac54313 100644 (file)
--- a/trimflowscommand.cpp
+++ b/trimflowscommand.cpp
@@ -10,6 +10,7 @@
  #include "trimflowscommand.h"
  #include "needlemanoverlap.hpp"
  
+
  //**********************************************************************************************************************
  vector<string> TrimFlowsCommand::setParameters(){      
         try {
@@ -224,25 +225,46 @@ int TrimFlowsCommand::execute(){
                         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
                 }
                 
-               vector<unsigned long int> flowFilePos = getFlowFileBreaks();
+               vector<unsigned long long> flowFilePos;
+       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               flowFilePos = getFlowFileBreaks();
                 for (int i = 0; i < (flowFilePos.size()-1); i++) {
                         lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)]));
                 }       
-
+       #else
+               ifstream in; m->openInputFile(flowFileName, in); in >> numFlows; in.close();
+       ///////////////////////////////////////// until I fix multiple processors for windows //////////////////        
+               processors = 1;
+       ///////////////////////////////////////// until I fix multiple processors for windows //////////////////                
+               if (processors == 1) {
+                       lines.push_back(new linePair(0, 1000));
+               }else {
+                       int numFlowLines;
+                       flowFilePos = m->setFilePosEachLine(flowFileName, numFlowLines);
+                       flowFilePos.erase(flowFilePos.begin() + 1); numFlowLines--;
+                       
+                       //figure out how many sequences you have to process
+                       int numSeqsPerProcessor = numFlowLines / processors;
+                       cout << numSeqsPerProcessor << '\t' << numFlowLines << endl;
+                       for (int i = 0; i < processors; i++) {
+                               int startIndex =  i * numSeqsPerProcessor;
+                               if(i == (processors - 1)){      numSeqsPerProcessor = numFlowLines - i * numSeqsPerProcessor;   }
+                               lines.push_back(new linePair(flowFilePos[startIndex], numSeqsPerProcessor));
+                               cout << flowFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
+                       }
+               }
+       #endif
+               
                 vector<vector<string> > barcodePrimerComboFileNames;
                 if(oligoFileName != ""){
                         getOligos(barcodePrimerComboFileNames); 
                 }
                 
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                 if(processors == 1){
                         driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
                 }else{
                         createProcessesCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames); 
                 }       
-#else
-               driverCreateTrim(flowFileName, trimFlowFileName, scrapFlowFileName, fastaFileName, barcodePrimerComboFileNames, lines[0]);
-#endif
                 
                 if (m->control_pressed) {  return 0; }                  
                 
@@ -258,7 +280,7 @@ int TrimFlowsCommand::execute(){
                                 for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
                                         
                                         FILE * pFile;
-                                       unsigned long int size;
+                                       unsigned long long size;
                                         
                                         //get num bytes in file
                                         pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
@@ -314,10 +336,9 @@ int TrimFlowsCommand::execute(){
  
  //***************************************************************************************************************
  
-int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames, linePair* line){
+int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > thisBarcodePrimerComboFileNames, linePair* line){
         
         try {
-               
                 ofstream trimFlowFile;
                 m->openOutputFile(trimFlowFileName, trimFlowFile);
                 trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
@@ -325,16 +346,24 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN
                 ofstream scrapFlowFile;
                 m->openOutputFile(scrapFlowFileName, scrapFlowFile);
                 scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
-
-               if(line->start == 4){
+               
+               ofstream fastaFile;
+               if(fasta){      m->openOutputFile(fastaFileName, fastaFile);    }
+               
+               ifstream flowFile;
+               m->openInputFile(flowFileName, flowFile);
+               
+               flowFile.seekg(line->start);
+               
+               if(line->start == 0){
+                       flowFile >> numFlows; m->gobble(flowFile);
                         scrapFlowFile << maxFlows << endl;
                         trimFlowFile << maxFlows << endl;
                         if(allFiles){
-                               for(int i=0;i<barcodePrimerComboFileNames.size();i++){
-                                       for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
-                                               //                              barcodePrimerComboFileNames[i][j] += toString(getpid()) + ".temp";
+                               for(int i=0;i<thisBarcodePrimerComboFileNames.size();i++){
+                                       for(int j=0;j<thisBarcodePrimerComboFileNames[0].size();j++){
                                                 ofstream temp;
-                                               m->openOutputFile(barcodePrimerComboFileNames[i][j], temp);
+                                               m->openOutputFile(thisBarcodePrimerComboFileNames[i][j], temp);
                                                 temp << maxFlows << endl;
                                                 temp.close();
                                         }
@@ -343,28 +372,28 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN
                 }
                 
                 FlowData flowData(numFlows, signal, noise, maxHomoP, flowOrder);
-               
-               ofstream fastaFile;
-               if(fasta){      m->openOutputFile(fastaFileName, fastaFile);    }
-               
-               ifstream flowFile;
-               m->openInputFile(flowFileName, flowFile);
-               
-               flowFile.seekg(line->start);
-
+               //cout << " driver flowdata address " <<  &flowData  << &flowFile << endl;      
                 int count = 0;
                 bool moreSeqs = 1;
-                       
+               
+               TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
+               
                 while(moreSeqs) {
+                       //cout << "driver " << count << endl;
+                       
+       
+                       if (m->control_pressed) { break; }
                         
                         int success = 1;
                         int currentSeqDiffs = 0;
                         string trashCode = "";
                         
-                       flowData.getNext(flowFile);
+                       flowData.getNext(flowFile); 
+                       //cout << "driver good bit " << flowFile.good() << endl;        
                         flowData.capFlows(maxFlows);    
                         
                         Sequence currSeq = flowData.getSequence();
+                       
                         if(!flowData.hasMinFlows(minFlows)){    //screen to see if sequence is of a minimum number of flows
                                 success = 0;
                                 trashCode += 'l';
@@ -374,13 +403,13 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN
                         int barcodeIndex = 0;
                         
                         if(barcodes.size() != 0){
-                               success = stripBarcode(currSeq, barcodeIndex);
+                               success = trimOligos.stripBarcode(currSeq, barcodeIndex);
                                 if(success > bdiffs)            {       trashCode += 'b';       }
                                 else{ currentSeqDiffs += success;  }
                         }
                         
                         if(numFPrimers != 0){
-                               success = stripForward(currSeq, primerIndex);
+                               success = trimOligos.stripForward(currSeq, primerIndex);
                                 if(success > pdiffs)            {       trashCode += 'f';       }
                                 else{ currentSeqDiffs += success;  }
                         }
@@ -388,10 +417,10 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN
                         if (currentSeqDiffs > tdiffs)   {       trashCode += 't';   }
                         
                         if(numRPrimers != 0){
-                               success = stripReverse(currSeq);
+                               success = trimOligos.stripReverse(currSeq);
                                 if(!success)                            {       trashCode += 'r';       }
                         }
-
+                       
                         if(trashCode.length() == 0){
                                                         
                                 flowData.printFlows(trimFlowFile);
@@ -400,7 +429,7 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN
                                 
                                 if(allFiles){
                                         ofstream output;
-                                       m->openOutputFileAppend(barcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
+                                       m->openOutputFileAppend(thisBarcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
                                         output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
                                         
                                         flowData.printFlows(output);
@@ -412,12 +441,12 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN
                         }
                                 
                         count++;
-                                               
+                       //cout << "driver" << '\t' << currSeq.getName() << endl;                        
                         //report progress
                         if((count) % 10000 == 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
  
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                       unsigned long int pos = flowFile.tellg();
+                       unsigned long long pos = flowFile.tellg();
  
                         if ((pos == -1) || (pos >= line->end)) { break; }
  #else
@@ -575,352 +604,16 @@ void TrimFlowsCommand::getOligos(vector<vector<string> >& outFlowFileNames){
                 exit(1);
         }
  }
-
-//***************************************************************************************************************
-
-int TrimFlowsCommand::stripBarcode(Sequence& seq, int& group){
-       try {
-               
-               string rawSequence = seq.getUnaligned();
-               int success = bdiffs + 1;       //guilty until proven innocent
-               
-               //can you find the barcode
-               for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
-                       string oligo = it->first;
-                       if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
-                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
-                               break;  
-                       }
-                       
-                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
-                               group = it->second;
-                               seq.setUnaligned(rawSequence.substr(oligo.length()));
-                               
-                               success = 0;
-                               break;
-                       }
-               }
-               
-               //if you found the barcode or if you don't want to allow for diffs
-               if ((bdiffs == 0) || (success == 0)) { return success;  }
-               
-               else { //try aligning and see if you can find it
-                       
-                       int maxLength = 0;
-                       
-                       Alignment* alignment;
-                       if (barcodes.size() > 0) {
-                               map<string,int>::iterator it=barcodes.begin();
-                               
-                               for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
-                                       if(it->first.length() > maxLength){
-                                               maxLength = it->first.length();
-                                       }
-                               }
-                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
-                               
-                       }else{ alignment = NULL; } 
-                       
-                       //can you find the barcode
-                       int minDiff = 1e6;
-                       int minCount = 1;
-                       int minGroup = -1;
-                       int minPos = 0;
-                       
-                       for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
-                               string oligo = it->first;
-                               //                              int length = oligo.length();
-                               
-                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
-                                       success = bdiffs + 10;
-                                       break;
-                               }
-                               
-                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
-                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
-                               oligo = alignment->getSeqAAln();
-                               string temp = alignment->getSeqBAln();
-                               
-                               int alnLength = oligo.length();
-                               
-                               for(int i=oligo.length()-1;i>=0;i--){
-                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
-                               }
-                               oligo = oligo.substr(0,alnLength);
-                               temp = temp.substr(0,alnLength);
-                               
-                               int numDiff = countDiffs(oligo, temp);
-                               
-                               if(numDiff < minDiff){
-                                       minDiff = numDiff;
-                                       minCount = 1;
-                                       minGroup = it->second;
-                                       minPos = 0;
-                                       for(int i=0;i<alnLength;i++){
-                                               if(temp[i] != '-'){
-                                                       minPos++;
-                                               }
-                                       }
-                               }
-                               else if(numDiff == minDiff){
-                                       minCount++;
-                               }
-                               
-                       }
-                       
-                       if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
-                       else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
-                       else{                                                                                                   //use the best match
-                               group = minGroup;
-                               seq.setUnaligned(rawSequence.substr(minPos));
-                               success = minDiff;
-                       }
-                       
-                       if (alignment != NULL) {  delete alignment;  }
-                       
-               }
-               
-               return success;
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimFlowsCommand", "stripBarcode");
-               exit(1);
-       }
-       
-}
-
-//***************************************************************************************************************
-
-int TrimFlowsCommand::stripForward(Sequence& seq, int& group){
-       try {
-               
-               string rawSequence = seq.getUnaligned();
-               int success = pdiffs + 1;       //guilty until proven innocent
-
-               //can you find the primer
-               for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
-                       string oligo = it->first;
-                       if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
-                               success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
-                               break;  
-                       }
-                       
-                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
-                               group = it->second;
-                               seq.setUnaligned(rawSequence.substr(oligo.length()));
-                               success = 0;
-                               break;
-                       }
-               }
-               
-               //if you found the barcode or if you don't want to allow for diffs
-               if ((pdiffs == 0) || (success == 0)) {  return success;  }
-               
-               else { //try aligning and see if you can find it
-                       
-                       int maxLength = 0;
-                       
-                       Alignment* alignment;
-                       if (primers.size() > 0) {
-                               map<string,int>::iterator it=primers.begin();
-                               
-                               for(it;it!=primers.end();it++){
-                                       if(it->first.length() > maxLength){
-                                               maxLength = it->first.length();
-                                       }
-                               }
-                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
-                               
-                       }else{ alignment = NULL; } 
-                       
-                       //can you find the barcode
-                       int minDiff = 1e6;
-                       int minCount = 1;
-                       int minGroup = -1;
-                       int minPos = 0;
-                       
-                       for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
-                               string oligo = it->first;
-                               //                              int length = oligo.length();
-                               
-                               if(rawSequence.length() < maxLength){   
-                                       success = pdiffs + 100;
-                                       break;
-                               }
-                               
-                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
-                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
-                               oligo = alignment->getSeqAAln();
-                               string temp = alignment->getSeqBAln();
-                               
-                               int alnLength = oligo.length();
-                               
-                               for(int i=oligo.length()-1;i>=0;i--){
-                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
-                               }
-                               oligo = oligo.substr(0,alnLength);
-                               temp = temp.substr(0,alnLength);
-                               
-                               int numDiff = countDiffs(oligo, temp);
-                               
-                               if(numDiff < minDiff){
-                                       minDiff = numDiff;
-                                       minCount = 1;
-                                       minGroup = it->second;
-                                       minPos = 0;
-                                       for(int i=0;i<alnLength;i++){
-                                               if(temp[i] != '-'){
-                                                       minPos++;
-                                               }
-                                       }
-                               }
-                               else if(numDiff == minDiff){
-                                       minCount++;
-                               }
-                               
-                       }
-                       
-                       if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
-                       else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
-                       else{                                                                                                   //use the best match
-                               group = minGroup;
-                               seq.setUnaligned(rawSequence.substr(minPos));
-                               success = minDiff;
-                       }
-                       
-                       if (alignment != NULL) {  delete alignment;  }
-                       
-               }
-               
-               return success;
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimFlowsCommand", "stripForward");
-               exit(1);
-       }
-}
-
-//***************************************************************************************************************
-
-bool TrimFlowsCommand::stripReverse(Sequence& seq){
-       try {
-
-               string rawSequence = seq.getUnaligned();
-               bool success = 0;       //guilty until proven innocent
-               
-               for(int i=0;i<numRPrimers;i++){
-                       string oligo = revPrimer[i];
-                       
-                       if(rawSequence.length() < oligo.length()){
-                               success = 0;
-                               break;
-                       }
-                       
-                       if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
-                               seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
-                               success = 1;
-                               break;
-                       }
-               }       
-
-               return success;
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimFlowsCommand", "stripReverse");
-               exit(1);
-       }
-}
-
-
-//***************************************************************************************************************
-
-bool TrimFlowsCommand::compareDNASeq(string oligo, string seq){
-       try {
-               bool success = 1;
-               int length = oligo.length();
-               
-               for(int i=0;i<length;i++){
-                       
-                       if(oligo[i] != seq[i]){
-                               if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
-                               else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
-                               else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
-                               else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
-                               else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
-                               else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
-                               
-                               if(success == 0)        {       break;   }
-                       }
-                       else{
-                               success = 1;
-                       }
-               }
-               
-               return success;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimFlowsCommand", "compareDNASeq");
-               exit(1);
-       }
-       
-}
-
-//***************************************************************************************************************
-
-int TrimFlowsCommand::countDiffs(string oligo, string seq){
-       try {
-               
-               int length = oligo.length();
-               int countDiffs = 0;
-               
-               for(int i=0;i<length;i++){
-                       
-                       if(oligo[i] != seq[i]){
-                               if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
-                               else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
-                               else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
-                               else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
-                               else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
-                               else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
-                       }
-                       
-               }
-               
-               return countDiffs;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimFlowsCommand", "countDiffs");
-               exit(1);
-       }
-       
-}
-
  /**************************************************************************************************/
-
-vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
+vector<unsigned long long> TrimFlowsCommand::getFlowFileBreaks() {
  
         try{
                         
-               vector<unsigned long int> filePos;
+               vector<unsigned long long> filePos;
                 filePos.push_back(0);
                                         
                 FILE * pFile;
-               unsigned long int size;
+               unsigned long long size;
                 
                 //get num bytes in file
                 pFile = fopen (flowFileName.c_str(),"rb");
@@ -932,7 +625,7 @@ vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
                 }
                                 
                 //estimate file breaks
-               unsigned long int chunkSize = 0;
+               unsigned long long chunkSize = 0;
                 chunkSize = size / processors;
  
                 //file too small to divide by processors
@@ -940,7 +633,7 @@ vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
                 
                 //for each process seekg to closest file break and search for next '>' char. make that the filebreak
                 for (int i = 0; i < processors; i++) {
-                       unsigned long int spot = (i+1) * chunkSize;
+                       unsigned long long spot = (i+1) * chunkSize;
                         
                         ifstream in;
                         m->openInputFile(flowFileName, in);
@@ -949,7 +642,7 @@ vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
                         string dummy = m->getline(in);
                         
                         //there was not another sequence before the end of the file
-                       unsigned long int sanityPos = in.tellg();
+                       unsigned long long sanityPos = in.tellg();
                         
  //                     if (sanityPos == -1) {  break;  }
  //                     else {  filePos.push_back(newSpot);  }
@@ -971,8 +664,8 @@ vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
                 m->openInputFile(flowFileName, in);
                 in >> numFlows;
                 m->gobble(in);
-               unsigned long int spot = in.tellg();
-               filePos[0] = spot;
+               //unsigned long long spot = in.tellg();
+               //filePos[0] = spot;
                 in.close();
                 
                 processors = (filePos.size() - 1);
@@ -990,10 +683,11 @@ vector<unsigned long int> TrimFlowsCommand::getFlowFileBreaks() {
  int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trimFlowFileName, string scrapFlowFileName, string fastaFileName, vector<vector<string> > barcodePrimerComboFileNames){
  
         try {
+               processIDS.clear();
+               int exitCommand = 1;
+               
  #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                 int process = 1;
-               int exitCommand = 1;
-               processIDS.clear();
                 
                 //loop through and create all the processes you want
                 while (process != processors) {
@@ -1050,7 +744,85 @@ int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trim
                         int temp = processIDS[i];
                         wait(&temp);
                 }
+#else
+               //////////////////////////////////////////////////////////////////////////////////////////////////////
+               //Windows version shared memory, so be careful when passing variables through the trimFlowData struct. 
+               //Above fork() will clone, so memory is separate, but that's not the case with windows, 
+               //////////////////////////////////////////////////////////////////////////////////////////////////////
+               
+               vector<trimFlowData*> pDataArray; 
+               DWORD   dwThreadIdArray[processors-1];
+               HANDLE  hThreadArray[processors-1]; 
+               
+               //Create processor worker threads.
+               for( int i=0; i<processors-1; i++ ){
+                       // Allocate memory for thread data.
+                       string extension = "";
+                       if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
+                       
+                       vector<vector<string> > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames;
+                       if(allFiles){
+                               for(int i=0;i<tempBarcodePrimerComboFileNames.size();i++){
+                                       for(int j=0;j<tempBarcodePrimerComboFileNames[0].size();j++){
+                                               tempBarcodePrimerComboFileNames[i][j] += extension;
+                                               ofstream temp;
+                                               m->openOutputFile(tempBarcodePrimerComboFileNames[i][j], temp);
+                                               temp.close();
+                                               
+                                       }
+                               }
+                       }
+                       
+                       trimFlowData* tempflow = new trimFlowData(flowFileName, (trimFlowFileName + extension), (scrapFlowFileName + extension), fastaFileName, flowOrder, tempBarcodePrimerComboFileNames, barcodes, primers, revPrimer, fasta, allFiles, lines[i]->start, lines[i]->end, m, signal, noise, numFlows, maxFlows, minFlows, maxHomoP, tdiffs, bdiffs, pdiffs, i);
+                       pDataArray.push_back(tempflow);
+                       
+                       //MyTrimFlowThreadFunction is in header. It must be global or static to work with the threads.
+                       //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+                       hThreadArray[i] = CreateThread(NULL, 0, MyTrimFlowThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
+               }
+               
+               //using the main process as a worker saves time and memory
+               ofstream temp;
+               m->openOutputFile(trimFlowFileName, temp);
+               temp.close();
+               
+               m->openOutputFile(scrapFlowFileName, temp);
+               temp.close();
+               
+               if(fasta){
+                       m->openOutputFile(fastaFileName, temp);
+                       temp.close();
+               }
                 
+               vector<vector<string> > tempBarcodePrimerComboFileNames = barcodePrimerComboFileNames;
+               if(allFiles){
+                       for(int i=0;i<tempBarcodePrimerComboFileNames.size();i++){
+                               for(int j=0;j<tempBarcodePrimerComboFileNames[0].size();j++){
+                                       tempBarcodePrimerComboFileNames[i][j] += toString(processors-1) + ".temp";
+                                       ofstream temp;
+                                       m->openOutputFile(tempBarcodePrimerComboFileNames[i][j], temp);
+                                       temp.close();
+                                       
+                               }
+                       }
+               }
+               
+               //do my part - do last piece because windows is looking for eof
+               int num = driverCreateTrim(flowFileName, (trimFlowFileName  + toString(processors-1) + ".temp"), (scrapFlowFileName  + toString(processors-1) + ".temp"), (fastaFileName + toString(processors-1) + ".temp"), tempBarcodePrimerComboFileNames, lines[processors-1]);
+               processIDS.push_back((processors-1)); 
+               
+               //Wait until all threads have terminated.
+               WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+               
+               //Close all thread handles and free memory allocations.
+               for(int i=0; i < pDataArray.size(); i++){
+                       num += pDataArray[i]->count;
+                       CloseHandle(hThreadArray[i]);
+                       delete pDataArray[i];
+               }
+               
+               
+#endif 
                 //append files
                 m->mothurOutEndLine();
                 for(int i=0;i<processIDS.size();i++){
@@ -1081,7 +853,7 @@ int TrimFlowsCommand::createProcessesCreateTrim(string flowFileName, string trim
                 }
                 
                 return exitCommand;
-#endif         
+       
         }
         catch(exception& e) {
                 m->errorOut(e, "TrimFlowsCommand", "createProcessesCreateTrim");
diff --git a/trimflowscommand.h b/trimflowscommand.h

index 5affc3fbdf7fedc366f7c8294c762e28681d7625..ab8ca91a42b35a9a283a3f9c6f2ee4606a01f7a4 100644 (file)
--- a/trimflowscommand.h
+++ b/trimflowscommand.h
@@ -15,6 +15,7 @@
  #include "sequence.hpp"
  #include "flowdata.h"
  #include "groupmap.h"
+#include "trimoligos.h"
  
  class TrimFlowsCommand : public Command {
  public:
@@ -37,15 +38,15 @@ private:
         bool abort;
  
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
         int comboStarts;
         vector<int> processIDS;   //processid
         vector<linePair*> lines;
  
-       vector<unsigned long int> getFlowFileBreaks();
+       vector<unsigned long long> getFlowFileBreaks();
         int createProcessesCreateTrim(string, string, string, string, vector<vector<string> >); 
         int driverCreateTrim(string, string, string, string, vector<vector<string> >, linePair*);
  
@@ -53,11 +54,6 @@ private:
         set<string> filesToRemove;
         
         void getOligos(vector<vector<string> >&);               //a rewrite of what is in trimseqscommand.h
-       int stripBarcode(Sequence&, int&);                              //largely redundant with trimseqscommand.h
-       int stripForward(Sequence&, int&);                              //largely redundant with trimseqscommand.h
-       bool stripReverse(Sequence&);                                   //largely redundant with trimseqscommand.h
-       bool compareDNASeq(string, string);                             //largely redundant with trimseqscommand.h
-       int countDiffs(string, string);                                 //largely redundant with trimseqscommand.h
         
         bool allFiles;
         int processors;
@@ -83,5 +79,183 @@ private:
         
  };
  
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+typedef struct trimFlowData {
+       string flowFileName; 
+       string trimFlowFileName; 
+       string scrapFlowFileName;
+       string fastaFileName;
+       string flowOrder;
+       vector<vector<string> > barcodePrimerComboFileNames;
+       map<string, int> barcodes;
+       map<string, int> primers;
+       vector<string> revPrimer;
+       bool fasta, allFiles;
+       unsigned long long start;
+       unsigned long long end;
+       MothurOut* m;
+       float signal, noise;
+       int numFlows, maxFlows, minFlows, maxHomoP, tdiffs, bdiffs, pdiffs, threadID, count;
+       
+       trimFlowData(){}
+       trimFlowData(string ff, string tf, string sf, string f, string fo, vector<vector<string> > bfn, map<string, int> bar, map<string, int> pri, vector<string> rev, bool fa, bool al, unsigned long long st, unsigned long long en, MothurOut* mout, float sig, float n, int numF, int maxF, int minF, int maxH, int td, int bd, int pd, int tid) {
+               flowFileName = ff;
+               trimFlowFileName = tf;
+               scrapFlowFileName = sf;
+               fastaFileName = f;
+               flowOrder = fo;
+               barcodePrimerComboFileNames = bfn;
+               barcodes = bar;
+               primers = pri;
+               revPrimer = rev;
+               fasta = fa;
+               allFiles = al;
+               start = st;
+               end = en;
+               m = mout;
+               signal = sig;
+               noise = n;
+               numFlows = numF;
+               maxFlows = maxF;
+               minFlows = minF;
+               maxHomoP = maxH;
+               tdiffs = td;
+               bdiffs = bd;
+               pdiffs = pd;
+               threadID = tid;
+       }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyTrimFlowThreadFunction(LPVOID lpParam){ 
+       trimFlowData* pDataArray;
+       pDataArray = (trimFlowData*)lpParam;
+       
+       try {
+               ofstream trimFlowFile;
+               pDataArray->m->openOutputFile(pDataArray->trimFlowFileName, trimFlowFile);
+               trimFlowFile.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
+               
+               ofstream scrapFlowFile;
+               pDataArray->m->openOutputFile(pDataArray->scrapFlowFileName, scrapFlowFile);
+               scrapFlowFile.setf(ios::fixed, ios::floatfield); scrapFlowFile.setf(ios::showpoint);
+               
+               ofstream fastaFile;
+               if(pDataArray->fasta){  pDataArray->m->openOutputFile(pDataArray->fastaFileName, fastaFile);    }
+               
+               ifstream flowFile;
+               pDataArray->m->openInputFile(pDataArray->flowFileName, flowFile);
+               
+               flowFile.seekg(pDataArray->start);
+               
+               if(pDataArray->start == 0){
+                       flowFile >> pDataArray->numFlows; pDataArray->m->gobble(flowFile);
+                       scrapFlowFile << pDataArray->maxFlows << endl;
+                       trimFlowFile << pDataArray->maxFlows << endl;
+                       if(pDataArray->allFiles){
+                               for(int i=0;i<pDataArray->barcodePrimerComboFileNames.size();i++){
+                                       for(int j=0;j<pDataArray->barcodePrimerComboFileNames[0].size();j++){
+                                               ofstream temp;
+                                               pDataArray->m->openOutputFile(pDataArray->barcodePrimerComboFileNames[i][j], temp);
+                                               temp << pDataArray->maxFlows << endl;
+                                               temp.close();
+                                       }
+                               }                       
+                       }
+               }
+               
+               FlowData flowData(pDataArray->numFlows, pDataArray->signal, pDataArray->noise, pDataArray->maxHomoP, pDataArray->flowOrder);
+               cout << " thread flowdata address " <<  &flowData  << '\t' << &flowFile << endl;
+               TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer);
+               
+               pDataArray->count = pDataArray->end;
+               cout << pDataArray->threadID << '\t' << pDataArray->count << endl;
+               int count = 0;
+               for(int i = 0; i < pDataArray->end; i++){ //end is the number of sequences to process
+                       
+                       if (pDataArray->m->control_pressed) {  break; }
+                       cout << pDataArray->threadID << '\t' << count << endl;
+                       int success = 1;
+                       int currentSeqDiffs = 0;
+                       string trashCode = "";
+                               
+                       flowData.getNext(flowFile);
+                       cout << "thread good bit " << flowFile.good() << endl;
+                       flowData.capFlows(pDataArray->maxFlows);        
+                       
+                       Sequence currSeq = flowData.getSequence();
+                       if(!flowData.hasMinFlows(pDataArray->minFlows)){        //screen to see if sequence is of a minimum number of flows
+                               success = 0;
+                               trashCode += 'l';
+                       }
+                       
+                       int primerIndex = 0;
+                       int barcodeIndex = 0;
+                       
+                       if(pDataArray->barcodes.size() != 0){
+                               success = trimOligos.stripBarcode(currSeq, barcodeIndex);
+                               if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
+                               else{ currentSeqDiffs += success;  }
+                       }
+                       
+                       if(pDataArray->primers.size() != 0){
+                               success = trimOligos.stripForward(currSeq, primerIndex);
+                               if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
+                               else{ currentSeqDiffs += success;  }
+                       }
+                       
+                       if (currentSeqDiffs > pDataArray->tdiffs)       {       trashCode += 't';   }
+                       
+                       if(pDataArray->revPrimer.size() != 0){
+                               success = trimOligos.stripReverse(currSeq);
+                               if(!success)                            {       trashCode += 'r';       }
+                       }
+                       
+                       if(trashCode.length() == 0){
+                               
+                               flowData.printFlows(trimFlowFile);
+                               
+                               if(pDataArray->fasta)   {       currSeq.printSequence(fastaFile);       }
+                               
+                               if(pDataArray->allFiles){
+                                       ofstream output;
+                                       pDataArray->m->openOutputFileAppend(pDataArray->barcodePrimerComboFileNames[barcodeIndex][primerIndex], output);
+                                       output.setf(ios::fixed, ios::floatfield); trimFlowFile.setf(ios::showpoint);
+                                       
+                                       flowData.printFlows(output);
+                                       output.close();
+                               }                               
+                       }
+                       else{
+                               flowData.printFlows(scrapFlowFile, trashCode);
+                       }
+                       
+                       count++;
+                               cout << pDataArray->threadID << '\t' << currSeq.getName() << endl;              
+                       //report progress
+                       if((count) % 10000 == 0){       pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine();           }
+                       
+               }
+               //report progress
+               if((count) % 10000 != 0){       pDataArray->m->mothurOut(toString(count)); pDataArray->m->mothurOutEndLine();           }
+               
+               trimFlowFile.close();
+               scrapFlowFile.close();
+               flowFile.close();
+               if(pDataArray->fasta){  fastaFile.close();      }
+               
+       }
+       catch(exception& e) {
+               pDataArray->m->errorOut(e, "TrimFlowsCommand", "MyTrimFlowsThreadFunction");
+               exit(1);
+       }
+} 
+#endif
+
  
  #endif
diff --git a/trimoligos.cpp b/trimoligos.cpp

new file mode 100644 (file)

index 0000000..f0b5a80
--- /dev/null
+++ b/trimoligos.cpp
@@ -0,0 +1,627 @@
+/*
+ *  trimoligos.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 9/1/11.
+ *  Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "trimoligos.h"
+#include "alignment.hpp"
+#include "needlemanoverlap.hpp"
+
+
+/********************************************************************/
+//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
+TrimOligos::TrimOligos(int p, int b, map<string, int> pr, map<string, int> br, vector<string> r){
+       try {
+               m = MothurOut::getInstance();
+               
+               pdiffs = p;
+               bdiffs = b;
+               
+               barcodes = br;
+               primers = pr;
+               revPrimer = r;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "TrimOligos");
+               exit(1);
+       }
+}
+/********************************************************************/
+TrimOligos::~TrimOligos() {}
+//*******************************************************************/
+int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
+       try {
+               
+               string rawSequence = seq.getUnaligned();
+               int success = bdiffs + 1;       //guilty until proven innocent
+               
+               //can you find the barcode
+               for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+                       string oligo = it->first;
+                       if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
+                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
+                               break;  
+                       }
+                       
+                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+                               group = it->second;
+                               seq.setUnaligned(rawSequence.substr(oligo.length()));
+                               
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(oligo.length(), -1);
+                               }
+                               
+                               success = 0;
+                               break;
+                       }
+               }
+               
+               //if you found the barcode or if you don't want to allow for diffs
+               if ((bdiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (barcodes.size() > 0) {
+                               map<string,int>::iterator it=barcodes.begin();
+                               
+                               for(it;it!=barcodes.end();it++){
+                                       if(it->first.length() > maxLength){
+                                               maxLength = it->first.length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minGroup = -1;
+                       int minPos = 0;
+                       
+                       for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+                               string oligo = it->first;
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
+                                       success = bdiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                               
+                               int alnLength = oligo.length();
+                               
+                               for(int i=oligo.length()-1;i>=0;i--){
+                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
+                               }
+                               oligo = oligo.substr(0,alnLength);
+                               temp = temp.substr(0,alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minGroup = it->second;
+                                       minPos = 0;
+                                       for(int i=0;i<alnLength;i++){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{                                                                                                   //use the best match
+                               group = minGroup;
+                               seq.setUnaligned(rawSequence.substr(minPos));
+                               
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(minPos, -1);
+                               }
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+               
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripBarcode");
+               exit(1);
+       }
+       
+}
+//*******************************************************************/
+int TrimOligos::stripBarcode(Sequence& seq, int& group){
+       try {
+               
+               string rawSequence = seq.getUnaligned();
+               int success = bdiffs + 1;       //guilty until proven innocent
+               
+               //can you find the barcode
+               for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+                       string oligo = it->first;
+                       if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
+                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
+                               break;  
+                       }
+                       
+                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+                               group = it->second;
+                               seq.setUnaligned(rawSequence.substr(oligo.length()));
+                               
+                               success = 0;
+                               break;
+                       }
+               }
+               
+               //if you found the barcode or if you don't want to allow for diffs
+               if ((bdiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (barcodes.size() > 0) {
+                               map<string,int>::iterator it=barcodes.begin();
+                               
+                               for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+                                       if(it->first.length() > maxLength){
+                                               maxLength = it->first.length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minGroup = -1;
+                       int minPos = 0;
+                       
+                       for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
+                               string oligo = it->first;
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
+                                       success = bdiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                               
+                               int alnLength = oligo.length();
+                               
+                               for(int i=oligo.length()-1;i>=0;i--){
+                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
+                               }
+                               oligo = oligo.substr(0,alnLength);
+                               temp = temp.substr(0,alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minGroup = it->second;
+                                       minPos = 0;
+                                       for(int i=0;i<alnLength;i++){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{                                                                                                   //use the best match
+                               group = minGroup;
+                               seq.setUnaligned(rawSequence.substr(minPos));
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+               
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripBarcode");
+               exit(1);
+       }
+       
+}
+//********************************************************************/
+int TrimOligos::stripForward(Sequence& seq, int& group){
+       try {
+               
+               string rawSequence = seq.getUnaligned();
+               int success = pdiffs + 1;       //guilty until proven innocent
+               
+               //can you find the primer
+               for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
+                       string oligo = it->first;
+                       if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
+                               success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
+                               break;  
+                       }
+                       
+                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+                               group = it->second;
+                               seq.setUnaligned(rawSequence.substr(oligo.length()));
+                               success = 0;
+                               break;
+                       }
+               }
+               
+               //if you found the barcode or if you don't want to allow for diffs
+               if ((pdiffs == 0) || (success == 0)) {  return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (primers.size() > 0) {
+                               map<string,int>::iterator it=primers.begin();
+                               
+                               for(it;it!=primers.end();it++){
+                                       if(it->first.length() > maxLength){
+                                               maxLength = it->first.length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minGroup = -1;
+                       int minPos = 0;
+                       
+                       for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
+                               string oligo = it->first;
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   
+                                       success = pdiffs + 100;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                               
+                               int alnLength = oligo.length();
+                               
+                               for(int i=oligo.length()-1;i>=0;i--){
+                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
+                               }
+                               oligo = oligo.substr(0,alnLength);
+                               temp = temp.substr(0,alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minGroup = it->second;
+                                       minPos = 0;
+                                       for(int i=0;i<alnLength;i++){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
+                       else{                                                                                                   //use the best match
+                               group = minGroup;
+                               seq.setUnaligned(rawSequence.substr(minPos));
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+               
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripForward");
+               exit(1);
+       }
+}
+//*******************************************************************/
+int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){
+       try {
+               string rawSequence = seq.getUnaligned();
+               int success = pdiffs + 1;       //guilty until proven innocent
+               
+               //can you find the primer
+               for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
+                       string oligo = it->first;
+                       if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
+                               success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
+                               break;  
+                       }
+                       
+                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+                               group = it->second;
+                               seq.setUnaligned(rawSequence.substr(oligo.length()));
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(oligo.length(), -1);
+                               }
+                               success = 0;
+                               break;
+                       }
+               }
+               
+               //if you found the barcode or if you don't want to allow for diffs
+               if ((pdiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (primers.size() > 0) {
+                               map<string,int>::iterator it=primers.begin();
+                               
+                               for(it;it!=primers.end();it++){
+                                       if(it->first.length() > maxLength){
+                                               maxLength = it->first.length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minGroup = -1;
+                       int minPos = 0;
+                       
+                       for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
+                               string oligo = it->first;
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   
+                                       success = pdiffs + 100;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                               
+                               int alnLength = oligo.length();
+                               
+                               for(int i=oligo.length()-1;i>=0;i--){
+                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
+                               }
+                               oligo = oligo.substr(0,alnLength);
+                               temp = temp.substr(0,alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minGroup = it->second;
+                                       minPos = 0;
+                                       for(int i=0;i<alnLength;i++){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
+                       else{                                                                                                   //use the best match
+                               group = minGroup;
+                               seq.setUnaligned(rawSequence.substr(minPos));
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(minPos, -1);
+                               }
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+               
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripForward");
+               exit(1);
+       }
+}
+//******************************************************************/
+bool TrimOligos::stripReverse(Sequence& seq, QualityScores& qual){
+       try {
+               string rawSequence = seq.getUnaligned();
+               bool success = 0;       //guilty until proven innocent
+               
+               for(int i=0;i<revPrimer.size();i++){
+                       string oligo = revPrimer[i];
+                       
+                       if(rawSequence.length() < oligo.length()){
+                               success = 0;
+                               break;
+                       }
+                       
+                       if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
+                               seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(-1, rawSequence.length()-oligo.length());
+                               }
+                               success = 1;
+                               break;
+                       }
+               }       
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripReverse");
+               exit(1);
+       }
+}
+//******************************************************************/
+bool TrimOligos::stripReverse(Sequence& seq){
+       try {
+               
+               string rawSequence = seq.getUnaligned();
+               bool success = 0;       //guilty until proven innocent
+               
+               for(int i=0;i<revPrimer.size();i++){
+                       string oligo = revPrimer[i];
+                       
+                       if(rawSequence.length() < oligo.length()){
+                               success = 0;
+                               break;
+                       }
+                       
+                       if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
+                               seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
+                               success = 1;
+                               break;
+                       }
+               }       
+               
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripReverse");
+               exit(1);
+       }
+}
+
+//******************************************************************/
+bool TrimOligos::compareDNASeq(string oligo, string seq){
+       try {
+               bool success = 1;
+               int length = oligo.length();
+               
+               for(int i=0;i<length;i++){
+                       
+                       if(oligo[i] != seq[i]){
+                               if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
+                               else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
+                               else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
+                               else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
+                               else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
+                               else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
+                               else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
+                               else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
+                               else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
+                               else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
+                               else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
+                               else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
+                               
+                               if(success == 0)        {       break;   }
+                       }
+                       else{
+                               success = 1;
+                       }
+               }
+               
+               return success;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "compareDNASeq");
+               exit(1);
+       }
+       
+}
+//********************************************************************/
+int TrimOligos::countDiffs(string oligo, string seq){
+       try {
+               
+               int length = oligo.length();
+               int countDiffs = 0;
+               
+               for(int i=0;i<length;i++){
+                       
+                       if(oligo[i] != seq[i]){
+                               if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
+                               else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
+                               else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
+                               else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
+                               else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
+                               else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
+                               else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
+                               else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
+                               else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
+                               else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
+                               else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
+                               else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
+                       }
+                       
+               }
+               
+               return countDiffs;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "countDiffs");
+               exit(1);
+       }
+}
+/********************************************************************/
+
+
+
diff --git a/trimoligos.h b/trimoligos.h

new file mode 100644 (file)

index 0000000..8830dff
--- /dev/null
+++ b/trimoligos.h
@@ -0,0 +1,49 @@
+#ifndef TRIMOLIGOS_H
+#define TRIMOLIGOS_H
+
+/*
+ *  trimoligos.h
+ *  Mothur
+ *
+ *  Created by westcott on 9/1/11.
+ *  Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "mothurout.h"
+#include "sequence.hpp"
+#include "qualityscores.h"
+
+
+class TrimOligos {
+       
+       public:
+               TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
+               ~TrimOligos();
+       
+               int stripBarcode(Sequence&, int&);      
+               int stripBarcode(Sequence&, QualityScores&, int&);
+       
+               int stripForward(Sequence&, int&);
+               int stripForward(Sequence&, QualityScores&, int&);
+       
+               bool stripReverse(Sequence&);
+               bool stripReverse(Sequence&, QualityScores&);
+                               
+       
+       private:
+               int pdiffs, bdiffs;
+       
+               map<string, int> barcodes;
+               map<string, int> primers;
+               vector<string> revPrimer;
+       
+               MothurOut* m;
+       
+               bool compareDNASeq(string, string);                             
+               int countDiffs(string, string);                 
+};
+
+#endif
+
diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp

index a8c3fc697ae10c1c6a117ce68c550a928281ec44..1cbb91a2da487a7f0aa00a0689d393297ba592c0 100644 (file)
--- a/trimseqscommand.cpp
+++ b/trimseqscommand.cpp
@@ -9,6 +9,7 @@
  
  #include "trimseqscommand.h"
  #include "needlemanoverlap.hpp"
+#include "trimoligos.h"
  
  //**********************************************************************************************************************
  vector<string> TrimSeqsCommand::setParameters(){       
@@ -351,8 +352,8 @@ int TrimSeqsCommand::execute(){
                         }
                 }
                 
-               vector<unsigned long int> fastaFilePos;
-               vector<unsigned long int> qFilePos;
+               vector<unsigned long long> fastaFilePos;
+               vector<unsigned long long> qFilePos;
                 
                 setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
                 
@@ -539,6 +540,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                 
                 int count = 0;
                 bool moreSeqs = 1;
+               TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
         
                 while (moreSeqs) {
                                 
@@ -572,13 +574,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                 int primerIndex = 0;
                                 
                                 if(barcodes.size() != 0){
-                                       success = stripBarcode(currSeq, currQual, barcodeIndex);
+                                       success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
                                         if(success > bdiffs)            {       trashCode += 'b';       }
                                         else{ currentSeqsDiffs += success;  }
                                 }
                                 
                                 if(numFPrimers != 0){
-                                       success = stripForward(currSeq, currQual, primerIndex);
+                                       success = trimOligos.stripForward(currSeq, currQual, primerIndex);
                                         if(success > pdiffs)            {       trashCode += 'f';       }
                                         else{ currentSeqsDiffs += success;  }
                                 }
@@ -586,7 +588,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                 if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
                                 
                                 if(numRPrimers != 0){
-                                       success = stripReverse(currSeq, currQual);
+                                       success = trimOligos.stripReverse(currSeq, currQual);
                                         if(!success)                            {       trashCode += 'r';       }
                                 }
  
@@ -722,7 +724,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                         }
                         
                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = inFASTA.tellg();
+                               unsigned long long pos = inFASTA.tellg();
                                 if ((pos == -1) || (pos >= line->end)) { break; }
                         
                         #else
@@ -938,7 +940,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
  
  /**************************************************************************************************/
  
-int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
+int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos) {
         try {
                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                 //set file positions for fasta file
@@ -977,7 +979,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned
                                         map<string, int>::iterator it = firstSeqNames.find(sname);
                                         
                                         if(it != firstSeqNames.end()) { //this is the start of a new chunk
-                                               unsigned long int pos = inQual.tellg(); 
+                                               unsigned long long pos = inQual.tellg(); 
                                                 qfileFilePos.push_back(pos - input.length() - 1);       
                                                 firstSeqNames.erase(it);
                                         }
@@ -999,7 +1001,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned
  
                 //get last file position of qfile
                 FILE * pFile;
-               unsigned long int size;
+               unsigned long long size;
                 
                 //get num bytes in file
                 pFile = fopen (qfilename.c_str(),"rb");
@@ -1019,7 +1021,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned
                         fastaFilePos.push_back(0); qfileFilePos.push_back(0);
                         //get last file position of fastafile
                         FILE * pFile;
-                       unsigned long int size;
+                       unsigned long long size;
                         
                         //get num bytes in file
                         pFile = fopen (filename.c_str(),"rb");
@@ -1240,279 +1242,6 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                 exit(1);
         }
  }
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
-       try {
-               
-               string rawSequence = seq.getUnaligned();
-               int success = bdiffs + 1;       //guilty until proven innocent
-               
-               //can you find the barcode
-               for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
-                       string oligo = it->first;
-                       if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
-                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
-                               break;  
-                       }
-                       
-                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
-                               group = it->second;
-                               seq.setUnaligned(rawSequence.substr(oligo.length()));
-                               
-                               if(qual.getName() != ""){
-                                       qual.trimQScores(oligo.length(), -1);
-                               }
-                               
-                               success = 0;
-                               break;
-                       }
-               }
-               
-               //if you found the barcode or if you don't want to allow for diffs
-               if ((bdiffs == 0) || (success == 0)) { return success;  }
-               
-               else { //try aligning and see if you can find it
-
-                       int maxLength = 0;
-
-                       Alignment* alignment;
-                       if (barcodes.size() > 0) {
-                               map<string,int>::iterator it=barcodes.begin();
-
-                               for(it;it!=barcodes.end();it++){
-                                       if(it->first.length() > maxLength){
-                                               maxLength = it->first.length();
-                                       }
-                               }
-                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
-
-                       }else{ alignment = NULL; } 
-                       
-                       //can you find the barcode
-                       int minDiff = 1e6;
-                       int minCount = 1;
-                       int minGroup = -1;
-                       int minPos = 0;
-                       
-                       for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
-                               string oligo = it->first;
-//                             int length = oligo.length();
-                               
-                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
-                                       success = bdiffs + 10;
-                                       break;
-                               }
-                               
-                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
-                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
-                               oligo = alignment->getSeqAAln();
-                               string temp = alignment->getSeqBAln();
-               
-                               int alnLength = oligo.length();
-                               
-                               for(int i=oligo.length()-1;i>=0;i--){
-                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
-                               }
-                               oligo = oligo.substr(0,alnLength);
-                               temp = temp.substr(0,alnLength);
-                               
-                               int numDiff = countDiffs(oligo, temp);
-                               
-                               if(numDiff < minDiff){
-                                       minDiff = numDiff;
-                                       minCount = 1;
-                                       minGroup = it->second;
-                                       minPos = 0;
-                                       for(int i=0;i<alnLength;i++){
-                                               if(temp[i] != '-'){
-                                                       minPos++;
-                                               }
-                                       }
-                               }
-                               else if(numDiff == minDiff){
-                                       minCount++;
-                               }
-
-                       }
-
-                       if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
-                       else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
-                       else{                                                                                                   //use the best match
-                               group = minGroup;
-                               seq.setUnaligned(rawSequence.substr(minPos));
-                               
-                               if(qual.getName() != ""){
-                                       qual.trimQScores(minPos, -1);
-                               }
-                               success = minDiff;
-                       }
-                       
-                       if (alignment != NULL) {  delete alignment;  }
-                       
-               }
-       
-               return success;
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
-               exit(1);
-       }
-
-}
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
-       try {
-               string rawSequence = seq.getUnaligned();
-               int success = pdiffs + 1;       //guilty until proven innocent
-               
-               //can you find the primer
-               for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
-                       string oligo = it->first;
-                       if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
-                               success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
-                               break;  
-                       }
-                       
-                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
-                               group = it->second;
-                               seq.setUnaligned(rawSequence.substr(oligo.length()));
-                               if(qual.getName() != ""){
-                                       qual.trimQScores(oligo.length(), -1);
-                               }
-                               success = 0;
-                               break;
-                       }
-               }
-
-               //if you found the barcode or if you don't want to allow for diffs
-               if ((pdiffs == 0) || (success == 0)) { return success;  }
-               
-               else { //try aligning and see if you can find it
-
-                       int maxLength = 0;
-
-                       Alignment* alignment;
-                       if (primers.size() > 0) {
-                               map<string,int>::iterator it=primers.begin();
-
-                               for(it;it!=primers.end();it++){
-                                       if(it->first.length() > maxLength){
-                                               maxLength = it->first.length();
-                                       }
-                               }
-                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
-
-                       }else{ alignment = NULL; } 
-                       
-                       //can you find the barcode
-                       int minDiff = 1e6;
-                       int minCount = 1;
-                       int minGroup = -1;
-                       int minPos = 0;
-                       
-                       for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
-                               string oligo = it->first;
-//                             int length = oligo.length();
-                               
-                               if(rawSequence.length() < maxLength){   
-                                       success = pdiffs + 100;
-                                       break;
-                               }
-                               
-                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
-                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
-                               oligo = alignment->getSeqAAln();
-                               string temp = alignment->getSeqBAln();
-               
-                               int alnLength = oligo.length();
-                               
-                               for(int i=oligo.length()-1;i>=0;i--){
-                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
-                               }
-                               oligo = oligo.substr(0,alnLength);
-                               temp = temp.substr(0,alnLength);
-                               
-                               int numDiff = countDiffs(oligo, temp);
-                               
-                               if(numDiff < minDiff){
-                                       minDiff = numDiff;
-                                       minCount = 1;
-                                       minGroup = it->second;
-                                       minPos = 0;
-                                       for(int i=0;i<alnLength;i++){
-                                               if(temp[i] != '-'){
-                                                       minPos++;
-                                               }
-                                       }
-                               }
-                               else if(numDiff == minDiff){
-                                       minCount++;
-                               }
-
-                       }
-
-                       if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
-                       else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
-                       else{                                                                                                   //use the best match
-                               group = minGroup;
-                               seq.setUnaligned(rawSequence.substr(minPos));
-                               if(qual.getName() != ""){
-                                       qual.trimQScores(minPos, -1);
-                               }
-                               success = minDiff;
-                       }
-                       
-                       if (alignment != NULL) {  delete alignment;  }
-                       
-               }
-               
-               return success;
-
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimSeqsCommand", "stripForward");
-               exit(1);
-       }
-}
-
-//***************************************************************************************************************
-
-bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
-       try {
-               string rawSequence = seq.getUnaligned();
-               bool success = 0;       //guilty until proven innocent
-               
-               for(int i=0;i<numRPrimers;i++){
-                       string oligo = revPrimer[i];
-                       
-                       if(rawSequence.length() < oligo.length()){
-                               success = 0;
-                               break;
-                       }
-                       
-                       if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
-                               seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
-                               if(qual.getName() != ""){
-                                       qual.trimQScores(-1, rawSequence.length()-oligo.length());
-                               }
-                               success = 1;
-                               break;
-                       }
-               }       
-               return success;
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimSeqsCommand", "stripReverse");
-               exit(1);
-       }
-}
-
  //***************************************************************************************************************
  
  bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
@@ -1618,80 +1347,4 @@ bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
         }
         
  }
-
-//***************************************************************************************************************
-
-bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
-       try {
-               bool success = 1;
-               int length = oligo.length();
-               
-               for(int i=0;i<length;i++){
-                       
-                       if(oligo[i] != seq[i]){
-                               if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
-                               else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
-                               else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
-                               else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
-                               else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
-                               else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
-                               
-                               if(success == 0)        {       break;   }
-                       }
-                       else{
-                               success = 1;
-                       }
-               }
-               
-               return success;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
-               exit(1);
-       }
-
-}
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::countDiffs(string oligo, string seq){
-       try {
-
-               int length = oligo.length();
-               int countDiffs = 0;
-               
-               for(int i=0;i<length;i++){
-                                                               
-                       if(oligo[i] != seq[i]){
-                               if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
-                               else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
-                               else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
-                               else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
-                               else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
-                               else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
-                       }
-                       
-               }
-               
-               return countDiffs;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimSeqsCommand", "countDiffs");
-               exit(1);
-       }
-
-}
-
  //***************************************************************************************************************
diff --git a/trimseqscommand.h b/trimseqscommand.h

index 13d1a417684bbf3b96d8a9d05c0bde2a2e22c544..137cb735719fba42f0d103af842de80c9930cc3e 100644 (file)
--- a/trimseqscommand.h
+++ b/trimseqscommand.h
@@ -37,24 +37,17 @@ private:
         GroupMap* groupMap;
         
         struct linePair {
-               unsigned long int start;
-               unsigned long int end;
-               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
+               unsigned long long start;
+               unsigned long long end;
+               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         };
         
         bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
-       int stripBarcode(Sequence&, QualityScores&, int&);
-       int stripForward(Sequence&, QualityScores&, int&);
-       bool stripReverse(Sequence&, QualityScores&);
-       
         bool keepFirstTrim(Sequence&, QualityScores&);
         bool removeLastTrim(Sequence&, QualityScores&);
-
         bool cullLength(Sequence&);
         bool cullHomoP(Sequence&);
         bool cullAmbigs(Sequence&);
-       bool compareDNASeq(string, string);
-       int countDiffs(string, string);
  
         bool abort, createGroup;
         string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
@@ -81,7 +74,7 @@ private:
         
         int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair*, linePair*);  
         int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
-       int setLines(string, string, vector<unsigned long int>&, vector<unsigned long int>&);
+       int setLines(string, string, vector<unsigned long long>&, vector<unsigned long long>&);
  };
  
  #endif
author	westcott <westcott>
	Tue, 13 Sep 2011 10:08:39 +0000 (10:08 +0000)
committer	westcott <westcott>
	Tue, 13 Sep 2011 10:08:39 +0000 (10:08 +0000)
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
aligncommand.cpp		patch \| blob \| history
aligncommand.h		patch \| blob \| history
alignmentdb.cpp		patch \| blob \| history
amovacommand.cpp		patch \| blob \| history
anosimcommand.cpp		patch \| blob \| history
bayesian.cpp		patch \| blob \| history
bellerophon.cpp		patch \| blob \| history
bellerophon.h		patch \| blob \| history
chimera.cpp		patch \| blob \| history
chimera.h		patch \| blob \| history
chimeraccodecommand.cpp		patch \| blob \| history
chimeraccodecommand.h		patch \| blob \| history
chimeracheckcommand.cpp		patch \| blob \| history
chimeracheckcommand.h		patch \| blob \| history
chimerapintailcommand.cpp		patch \| blob \| history
chimerapintailcommand.h		patch \| blob \| history
chimeraslayercommand.cpp		patch \| blob \| history
chimeraslayercommand.h		patch \| blob \| history
classify.cpp		patch \| blob \| history
classifyseqscommand.cpp		patch \| blob \| history
classifyseqscommand.h		patch \| blob \| history
clustercommand.cpp		patch \| blob \| history
clusterfragmentscommand.cpp		patch \| blob \| history
deconvolutecommand.cpp		patch \| blob \| history
distancecommand.cpp		patch \| blob \| history
distancecommand.h		patch \| blob \| history
filterseqscommand.cpp		patch \| blob \| history
filterseqscommand.h		patch \| blob \| history
flowdata.cpp		patch \| blob \| history
homovacommand.cpp		patch \| blob \| history
makefile		patch \| blob \| history
mothurout.cpp		patch \| blob \| history
mothurout.h		patch \| blob \| history
pairwiseseqscommand.cpp		patch \| blob \| history
pairwiseseqscommand.h		patch \| blob \| history
phylotree.cpp		patch \| blob \| history
preclustercommand.cpp		patch \| blob \| history
preclustercommand.h		patch \| blob \| history
rarefactcommand.cpp		patch \| blob \| history
rarefactcommand.h		patch \| blob \| history
screenseqscommand.cpp		patch \| blob \| history
screenseqscommand.h		patch \| blob \| history
seqerrorcommand.cpp		patch \| blob \| history
seqerrorcommand.h		patch \| blob \| history
seqsummarycommand.cpp		patch \| blob \| history
seqsummarycommand.h		patch \| blob \| history
sequenceparser.cpp	[new file with mode: 0644]	patch \| blob
sequenceparser.h	[new file with mode: 0644]	patch \| blob
sffinfocommand.cpp		patch \| blob \| history
sffinfocommand.h		patch \| blob \| history
sharedrabundvector.cpp		patch \| blob \| history
shhhercommand.cpp		patch \| blob \| history
shhhercommand.h		patch \| blob \| history
simpson.cpp		patch \| blob \| history
subsamplecommand.cpp		patch \| blob \| history
systemcommand.cpp		patch \| blob \| history
trimflowscommand.cpp		patch \| blob \| history
trimflowscommand.h		patch \| blob \| history
trimoligos.cpp	[new file with mode: 0644]	patch \| blob
trimoligos.h	[new file with mode: 0644]	patch \| blob
trimseqscommand.cpp		patch \| blob \| history
trimseqscommand.h		patch \| blob \| history