]> git.donarmstrong.com Git - mothur.git/commitdiff
added debug output classify.seqs. worked on make.contigs command. added large paramet...
authorSarah Westcott <mothur.westcott@gmail.com>
Tue, 22 May 2012 20:12:50 +0000 (16:12 -0400)
committerSarah Westcott <mothur.westcott@gmail.com>
Tue, 22 May 2012 20:12:50 +0000 (16:12 -0400)
classify.cpp
makecontigscommand.cpp
makefile
phylotree.cpp
sequence.cpp
shhhercommand.cpp
shhhercommand.h

index 2d01183479a504c798a18071df045f723643f850..7726b3e00cca9de19971b70656f9581e43e97c8a 100644 (file)
@@ -249,7 +249,8 @@ int Classify::readTaxonomy(string file) {
                
                m->mothurOutEndLine();
                m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
-
+        if (m->debug) { m->mothurOut("[DEBUG]: Taxonomies read in...\n"); }
+        
 #ifdef USE_MPI 
                int pid, num, processors;
                vector<unsigned long long> positions;
@@ -298,6 +299,7 @@ int Classify::readTaxonomy(string file) {
                        istringstream iss (tempBuf,istringstream::in);
                        iss >> name; m->gobble(iss);
             iss >> taxInfo;
+            if (m->debug) { m->mothurOut("[DEBUG]: name = " + name + " tax = " + taxInfo + "\n"); }
                        taxonomy[name] = taxInfo;
                        phyloTree->addSeqToTree(name, taxInfo);
                }
@@ -312,7 +314,9 @@ int Classify::readTaxonomy(string file) {
                while (!inTax.eof()) {
                        inTax >> name; m->gobble(inTax);
             inTax >> taxInfo;
-               
+            
+            if (m->debug) {  m->mothurOut("[DEBUG]: name = '" + name + "' tax = '" + taxInfo + "'\n");  }
+
                        taxonomy[name] = taxInfo;
                        
                        phyloTree->addSeqToTree(name, taxInfo);
@@ -321,6 +325,8 @@ int Classify::readTaxonomy(string file) {
                }
                inTax.close();
 #endif 
+        
+        
        
                phyloTree->assignHeirarchyIDs(0);
                
index f672840eda46116a14a917eaf82a1aa9f43e4e56..523edd5e8ba1fd00f1bf5050a380b43e1549ee4f 100644 (file)
@@ -392,10 +392,32 @@ int MakeContigsCommand::driver(vector<string> files, string outputFasta, string
             int numMismatches = 0;
             string seq1 = fSeq.getAligned();
             string seq2 = rSeq.getAligned();
-           
             vector<int> scores1 = fQual.getQualityScores();
             vector<int> scores2 = rQual.getQualityScores();
-            for (int i = 0; i < length; i++) {
+
+           // if (num < 5) {  cout << fSeq.getStartPos() << '\t' << fSeq.getEndPos() << '\t' << rSeq.getStartPos() << '\t' << rSeq.getEndPos() << endl; }
+            int overlapStart = fSeq.getStartPos();
+            int seq2Start = rSeq.getStartPos();
+            //bigger of the 2 starting positions is the location of the overlapping start
+            if (overlapStart < seq2Start) { //seq2 starts later so take from 0 to seq2Start from seq1
+                overlapStart = seq2Start; 
+                for (int i = 0; i < overlapStart; i++) {
+                    contig += seq1[i];
+                    contigScores.push_back(scores1[ABaseMap[i]]);
+                }
+            }else { //seq1 starts later so take from 0 to overlapStart from seq2
+                for (int i = 0; i < overlapStart; i++) {
+                    contig += seq2[i];
+                    contigScores.push_back(scores2[BBaseMap[i]]);
+                }
+            }
+            
+            int seq1End = fSeq.getEndPos();
+            int seq2End = rSeq.getEndPos();
+            int overlapEnd = seq1End;
+            if (seq2End < overlapEnd) { overlapEnd = seq2End; }  //smallest end position is where overlapping ends
+            
+            for (int i = overlapStart; i < overlapEnd; i++) {
                 if (seq1[i] == seq2[i]) { //match, add base and choose highest score
                     contig += seq1[i];
                     contigScores.push_back(scores1[ABaseMap[i]]);
@@ -423,6 +445,19 @@ int MakeContigsCommand::driver(vector<string> files, string outputFasta, string
                 }
             }
             
+            if (seq1End < seq2End) { //seq1 ends before seq2 so take from overlap to length from seq2
+                for (int i = overlapEnd; i < length; i++) {
+                    contig += seq2[i];
+                    contigScores.push_back(scores2[BBaseMap[i]]);
+                }
+            }else { //seq2 ends before seq1 so take from overlap to length from seq1
+                for (int i = overlapEnd; i < length; i++) {
+                    contig += seq1[i];
+                    contigScores.push_back(scores1[ABaseMap[i]]);
+                }
+
+            }
+            //if (num < 5) { cout << overlapStart << '\t' << overlapEnd << endl << seq1 << endl << seq2 << endl<< contig << endl; }
             //output
             outFasta << ">" << fSeq.getName() << endl << contig << endl;
             outQual << ">" << fSeq.getName() << endl;
index e06c334636410c7d5b31512fd3d1440c26711dad..6f8a42343be48654dc77ef2e021caf601a02ff2e 100644 (file)
--- a/makefile
+++ b/makefile
@@ -15,8 +15,8 @@ USEREADLINE ?= yes
 CYGWIN_BUILD ?= no
 USECOMPRESSION ?= no
 MOTHUR_FILES="\"Enter_your_default_path_here\""
-RELEASE_DATE = "\"4/30/2012\""
-VERSION = "\"1.25.0\""
+RELEASE_DATE = "\"5/14/2012\""
+VERSION = "\"1.25.1\""
 FORTAN_COMPILER = gfortran
 FORTRAN_FLAGS = 
 
index a9ef6cb0d85f44bde915606eb452db993ec40d8d..4ed3d8c1c02e1c5a1ee93b9dd6143309f5606f5c 100644 (file)
@@ -619,13 +619,15 @@ bool PhyloTree::ErrorCheck(vector<string> templateFileNames){
                map<string, int>::iterator itFind;
                map<string, int> taxonomyFileNames = name2Taxonomy;
                
+        if (m->debug) { m->mothurOut("[DEBUG]: in error check. Numseqs in template = " + toString(templateFileNames.size()) + ". Numseqs in taxonomy = " + toString(taxonomyFileNames.size()) + ".\n"); }
+        
                for (int i = 0; i < templateFileNames.size(); i++) {
                        itFind = taxonomyFileNames.find(templateFileNames[i]);
                        
                        if (itFind != taxonomyFileNames.end()) { //found it so erase it
                                taxonomyFileNames.erase(itFind);
                        }else {
-                               m->mothurOut(templateFileNames[i] + " is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine();
+                               m->mothurOut("'" +templateFileNames[i] + "' is in your template file and is not in your taxonomy file. Please correct."); m->mothurOutEndLine();
                                okay = false;
                        }
                        
index d877f4b93f33d8123ff6a1a9f7f839dc680fcf61..a1e787bab8cd24ebb352593055e7d2fc10b406de 100644 (file)
@@ -603,7 +603,7 @@ int Sequence::getLongHomoPolymer(){
 int Sequence::getStartPos(){
        if(startPos == -1){
                for(int j = 0; j < alignmentLength; j++) {
-                       if(aligned[j] != '.'){
+                       if((aligned[j] != '.')&&(aligned[j] != '-')){
                                startPos = j + 1;
                                break;
                        }
@@ -668,7 +668,7 @@ int Sequence::filterFromPos(int end){
 int Sequence::getEndPos(){
        if(endPos == -1){
                for(int j=alignmentLength-1;j>=0;j--){
-                       if(aligned[j] != '.'){
+                       if((aligned[j] != '.')&&(aligned[j] != '-')){
                                endPos = j + 1;
                                break;
                        }
index 508e2a90610d1b3705972a976580d90937358427..a09825e4b9a3cd11dbe77b3dd82e703481c650bc 100644 (file)
@@ -18,6 +18,7 @@ vector<string> ShhherCommand::setParameters(){
                CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
                CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
                CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
+        CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
                CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
                CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
                CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
@@ -307,6 +308,17 @@ ShhherCommand::ShhherCommand(string option) {
 
                        temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
                        m->mothurConvert(temp, maxIters); 
+            
+            temp = validParameter.validFile(parameters, "large", false);       if (temp == "not found"){       temp = "0";             }
+                       m->mothurConvert(temp, largeSize); 
+            if (largeSize != 0) { large = true; }
+            else { large = false;  }
+            if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
+            
+#ifdef USE_MPI
+            if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                          
+#endif
+            
 
                        temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
                        m->mothurConvert(temp, sigma); 
@@ -2014,170 +2026,248 @@ int ShhherCommand::createProcesses(vector<string> filenames){
 }
 /**************************************************************************************************/
 
-int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
+vector<string> ShhherCommand::parseFlowFiles(string filename){
     try {
+        vector<string> files;
+        int count = 0;
         
-        for(int i=start;i<end;i++){
-                       
-                       if (m->control_pressed) { break; }
-                       
-                       string flowFileName = filenames[i];
-            
-                       m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
-                       m->mothurOut("Reading flowgrams...\n");
-                       
-            vector<string> seqNameVector;
-            vector<int> lengths;
-            vector<short> flowDataIntI;
-            vector<double> flowDataPrI;
-            map<string, int> nameMap;
-            vector<short> uniqueFlowgrams;
-            vector<int> uniqueCount;
-            vector<int> mapSeqToUnique;
-            vector<int> mapUniqueToSeq;
-            vector<int> uniqueLengths;
-            int numFlowCells;
-            
-            int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
-                       
-                       if (m->control_pressed) { break; }
-                       
-                       m->mothurOut("Identifying unique flowgrams...\n");
-                       int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
-                       
-                       if (m->control_pressed) { break; }
-                       
-                       m->mothurOut("Calculating distances between flowgrams...\n");
-            string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
-            unsigned long long begTime = time(NULL);
-            double begClock = clock();
+        ifstream in;
+        m->openInputFile(filename, in);
         
-            flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);    
-            
-            m->mothurOutEndLine();
-            m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
-
+        int thisNumFLows = 0;
+        in >> thisNumFLows; m->gobble(in);
+        
+        while (!in.eof()) {
+            if (m->control_pressed) { break; }
             
-                       string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
-                       createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
-                       
-                       if (m->control_pressed) { break; }
-                       
-                       m->mothurOut("\nClustering flowgrams...\n");
-            string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
-                       cluster(listFileName, distFileName, namesFileName);
-                       
-                       if (m->control_pressed) { break; }
+            ofstream out;
+            string outputFileName = filename + toString(count) + ".temp";
+            m->openOutputFile(outputFileName, out);
+            out << thisNumFLows << endl;
+            files.push_back(outputFileName);
             
-            vector<int> otuData;
-            vector<int> cumNumSeqs;
-            vector<int> nSeqsPerOTU;
-            vector<vector<int> > aaP;  //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
-            vector<vector<int> > aaI;  //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
-            vector<int> seqNumber;             //tMaster->anP:         the sequence id number sorted by OTU
-            vector<int> seqIndex;              //tMaster->anI;         the index that corresponds to seqNumber
+            for (int i = 0; i < largeSize; i++) {
+                if (in.eof()) { break; }
+                string line = m->getline(in);
+                out << line << endl;
+            }
+            out.close();
+            count++;
+        }
+        in.close();
+        
+        if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
+        
+        return files;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "ShhherCommand", "parseFlowFiles");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
 
-                       
-                       int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
+int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
+    try {
+        
+        for(int i=start;i<end;i++){
                        
                        if (m->control_pressed) { break; }
                        
-                       m->mothurRemove(distFileName);
-                       m->mothurRemove(namesFileName);
-                       m->mothurRemove(listFileName);
-                       
-            vector<double> dist;               //adDist - distance of sequences to centroids
-            vector<short> change;              //did the centroid sequence change? 0 = no; 1 = yes
-            vector<int> centroids;             //the representative flowgram for each cluster m
-            vector<double> weight;
-            vector<double> singleTau;  //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
-            vector<int> nSeqsBreaks;
-            vector<int> nOTUsBreaks;
+            vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
+            if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
             
-                       dist.assign(numSeqs * numOTUs, 0);
-            change.assign(numOTUs, 1);
-            centroids.assign(numOTUs, -1);
-            weight.assign(numOTUs, 0);
-            singleTau.assign(numSeqs, 1.0);
-            
-            nSeqsBreaks.assign(2, 0);
-            nOTUsBreaks.assign(2, 0);
+            if (m->control_pressed) { break; }
             
-            nSeqsBreaks[0] = 0;
-            nSeqsBreaks[1] = numSeqs;
-            nOTUsBreaks[1] = numOTUs;
-                       
-                       if (m->control_pressed) { break; }
-                       
-                       double maxDelta = 0;
-                       int iter = 0;
-                       
-                       begClock = clock();
-                       begTime = time(NULL);
+            double begClock = clock();
+            unsigned long long begTime;
             
-                       m->mothurOut("\nDenoising flowgrams...\n");
-                       m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
-                       
-                       while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
-                               
-                               if (m->control_pressed) { break; }
-                               
-                               double cycClock = clock();
-                               unsigned long long cycTime = time(NULL);
-                               fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
-                               
-                               if (m->control_pressed) { break; }
+            for (int g = 0; g < theseFlowFileNames.size(); g++) {
                 
-                               calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
-                               
-                               if (m->control_pressed) { break; }
+                string flowFileName = theseFlowFileNames[g];
+                m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
+                m->mothurOut("Reading flowgrams...\n");
+                
+                vector<string> seqNameVector;
+                vector<int> lengths;
+                vector<short> flowDataIntI;
+                vector<double> flowDataPrI;
+                map<string, int> nameMap;
+                vector<short> uniqueFlowgrams;
+                vector<int> uniqueCount;
+                vector<int> mapSeqToUnique;
+                vector<int> mapUniqueToSeq;
+                vector<int> uniqueLengths;
+                int numFlowCells;
                 
-                               maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
+                int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
                 
                 if (m->control_pressed) { break; }
                 
-                               double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
+                m->mothurOut("Identifying unique flowgrams...\n");
+                int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
                 
                 if (m->control_pressed) { break; }
                 
-                               checkCentroids(numOTUs, centroids, weight);
-                               
-                               if (m->control_pressed) { break; }
-                               
-                               calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
-                               
-                               if (m->control_pressed) { break; }
-                               
-                               iter++;
-                               
-                               m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
+                m->mothurOut("Calculating distances between flowgrams...\n");
+                string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
+                begTime = time(NULL);
+               
                 
-                       }       
-                       
-                       if (m->control_pressed) { break; }
-                       
-                       m->mothurOut("\nFinalizing...\n");
-                       fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
-                       
-                       if (m->control_pressed) { break; }
-                       
-                       setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
-                       
-                       if (m->control_pressed) { break; }
-                       
-                       vector<int> otuCounts(numOTUs, 0);
-                       for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
-                       
-                       calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
-            
-            if (m->control_pressed) { break; }
+                flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);        
+                
+                m->mothurOutEndLine();
+                m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
+                
+                
+                string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
+                createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
+                
+                if (m->control_pressed) { break; }
+                
+                m->mothurOut("\nClustering flowgrams...\n");
+                string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+                cluster(listFileName, distFileName, namesFileName);
+                
+                if (m->control_pressed) { break; }
+                
+                vector<int> otuData;
+                vector<int> cumNumSeqs;
+                vector<int> nSeqsPerOTU;
+                vector<vector<int> > aaP;      //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
+                vector<vector<int> > aaI;      //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
+                vector<int> seqNumber;         //tMaster->anP:         the sequence id number sorted by OTU
+                vector<int> seqIndex;          //tMaster->anI;         the index that corresponds to seqNumber
+                
+                
+                int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
+                
+                if (m->control_pressed) { break; }
+                
+                m->mothurRemove(distFileName);
+                m->mothurRemove(namesFileName);
+                m->mothurRemove(listFileName);
+                
+                vector<double> dist;           //adDist - distance of sequences to centroids
+                vector<short> change;          //did the centroid sequence change? 0 = no; 1 = yes
+                vector<int> centroids;         //the representative flowgram for each cluster m
+                vector<double> weight;
+                vector<double> singleTau;      //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
+                vector<int> nSeqsBreaks;
+                vector<int> nOTUsBreaks;
+                
+                dist.assign(numSeqs * numOTUs, 0);
+                change.assign(numOTUs, 1);
+                centroids.assign(numOTUs, -1);
+                weight.assign(numOTUs, 0);
+                singleTau.assign(numSeqs, 1.0);
+                
+                nSeqsBreaks.assign(2, 0);
+                nOTUsBreaks.assign(2, 0);
+                
+                nSeqsBreaks[0] = 0;
+                nSeqsBreaks[1] = numSeqs;
+                nOTUsBreaks[1] = numOTUs;
+                
+                if (m->control_pressed) { break; }
+                
+                double maxDelta = 0;
+                int iter = 0;
+                
+                begClock = clock();
+                begTime = time(NULL);
+                
+                m->mothurOut("\nDenoising flowgrams...\n");
+                m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
+                
+                while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
+                    
+                    if (m->control_pressed) { break; }
+                    
+                    double cycClock = clock();
+                    unsigned long long cycTime = time(NULL);
+                    fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+                    
+                    if (m->control_pressed) { break; }
+                    
+                    calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+                    
+                    if (m->control_pressed) { break; }
+                    
+                    maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
+                    
+                    if (m->control_pressed) { break; }
+                    
+                    double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
+                    
+                    if (m->control_pressed) { break; }
+                    
+                    checkCentroids(numOTUs, centroids, weight);
+                    
+                    if (m->control_pressed) { break; }
+                    
+                    calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
+                    
+                    if (m->control_pressed) { break; }
+                    
+                    iter++;
+                    
+                    m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
+                    
+                }      
+                
+                if (m->control_pressed) { break; }
+                
+                m->mothurOut("\nFinalizing...\n");
+                fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+                
+                if (m->control_pressed) { break; }
+                
+                setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
+                
+                if (m->control_pressed) { break; }
+                
+                vector<int> otuCounts(numOTUs, 0);
+                for(int i=0;i<numSeqs;i++)     {       otuCounts[otuData[i]]++;        }
+                
+                calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);        
+                
+                if (m->control_pressed) { break; }
+                
+                if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
+                string thisOutputDir = outputDir;
+                if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
+                string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
+                string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
+                string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
+                string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
+                string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
+                string groupFileName = fileRoot + "shhh.groups";
+
+                
+                writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
+                writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
+                writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                             if (m->control_pressed) { break; }
+                writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                 if (m->control_pressed) { break; }
+                writeGroups(groupFileName, fileRoot, numSeqs, seqNameVector);                                          if (m->control_pressed) { break; }
+                
+                if (large) {
+                    if (g > 0) {
+                        m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.qual"));
+                        m->mothurRemove(qualityFileName);
+                        m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.fasta"));
+                        m->mothurRemove(fastaFileName);
+                        m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.names"));
+                        m->mothurRemove(nameFileName);
+                        m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.counts"));
+                        m->mothurRemove(otuCountsFileName);
+                        m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.groups"));
+                        m->mothurRemove(groupFileName);
+                        m->mothurRemove(theseFlowFileNames[g]);
+                    }
+                }
+                       }
             
-                       writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
-                       writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
-                       writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
-                       writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                       if (m->control_pressed) { break; }
-                       writeGroups(flowFileName, numSeqs, seqNameVector);                                              if (m->control_pressed) { break; }
-                       
                        m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
                }
                
@@ -2977,14 +3067,11 @@ void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, ve
 }
 /**************************************************************************************************/
 
-void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
+void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
        
        try {
-               string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
-               string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
         
                ofstream qualityFile;
                m->openOutputFile(qualityFileName, qualityFile);
@@ -3088,11 +3175,9 @@ void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filenam
 
 /**************************************************************************************************/
 
-void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
+void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
        try {
-               string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
-               string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
+               
                ofstream fastaFile;
                m->openOutputFile(fastaFileName, fastaFile);
                
@@ -3117,7 +3202,8 @@ void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTU
                                        }
                                }
                                
-                               fastaFile << newSeq.substr(4) << endl;
+                               if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
+                else {  fastaFile << "NNNN" << endl;  }
                        }
                }
                fastaFile.close();
@@ -3136,11 +3222,9 @@ void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTU
 
 /**************************************************************************************************/
 
-void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
+void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
        try {
-               string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
-               string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
+               
                ofstream nameFile;
                m->openOutputFile(nameFileName, nameFile);
                
@@ -3174,13 +3258,9 @@ void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, s
 
 /**************************************************************************************************/
 
-void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
+void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
        try {
-               string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
-               string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
-               string groupFileName = fileRoot + "shhh.groups";
-               ofstream groupFile;
+        ofstream groupFile;
                m->openOutputFile(groupFileName, groupFile);
                
                for(int i=0;i<numSeqs;i++){
@@ -3199,11 +3279,8 @@ void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& se
 
 /**************************************************************************************************/
 
-void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
+void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
        try {
-               string thisOutputDir = outputDir;
-               if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
-               string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
                ofstream otuCountsFile;
                m->openOutputFile(otuCountsFileName, otuCountsFile);
                
index c9772af6f0e87e32d24518ee0419885770e11998..5ef43df17099c0b269c73c60dd6cf02e355b6176 100644 (file)
@@ -56,10 +56,10 @@ private:
                linePair(int i, int j) : start(i), end(j) {}
        };
     
-       int abort;
+       bool abort, large;
        string outputDir, flowFileName, flowFilesFileName, lookupFileName, compositeFASTAFileName, compositeNamesFileName;
 
-       int processors, maxIters;
+       int processors, maxIters, largeSize;
        float cutoff, sigma, minDelta;
        string flowOrder;
     
@@ -68,6 +68,7 @@ private:
        vector<double> jointLookUp;
     vector<string> flowFileVector;
        
+    vector<string> parseFlowFiles(string);
     int driver(vector<string>, string, string, int, int);
     int createProcesses(vector<string>);
     int getFlowData(string, vector<string>&, vector<int>&, vector<short>&, map<string, int>&, int&);
@@ -90,7 +91,7 @@ private:
     void writeQualities(int, int, string, vector<int>, vector<int>&, vector<int>&, vector<double>&, vector<short>&, vector<short>&, vector<int>&, vector<int>&, vector<string>&, vector<int>&, vector<vector<int> >&);
     void writeSequences(string, int, int, string, vector<int>, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&);
     void writeNames(string, int, string, vector<int>, vector<string>&, vector<vector<int> >&, vector<int>&);
-    void writeGroups(string, int, vector<string>&);
+    void writeGroups(string, string, int, vector<string>&);
     void writeClusters(string, int, int, vector<int>, vector<int>&, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&);
     
        void getSingleLookUp();