]> git.donarmstrong.com Git - mothur.git/commitdiff
fixed bug with trim.flows that was adding flow files names to the .flow.files file...
authorwestcott <westcott>
Fri, 11 Nov 2011 14:17:31 +0000 (14:17 +0000)
committerwestcott <westcott>
Fri, 11 Nov 2011 14:17:31 +0000 (14:17 +0000)
17 files changed:
Mothur.xcodeproj/project.pbxproj
chimeraperseuscommand.cpp
chimerauchimecommand.cpp
chimerauchimecommand.h
commandfactory.cpp
distancecommand.h
metastatscommand.cpp
mothur.h
myPerseus.cpp
myseqdist.cpp [new file with mode: 0644]
myseqdist.h [new file with mode: 0644]
seqnoise.cpp [new file with mode: 0644]
seqnoise.h [new file with mode: 0644]
sharedrabundvector.cpp
shhhseqscommand.cpp [new file with mode: 0644]
shhhseqscommand.h [new file with mode: 0644]
trimflowscommand.cpp

index a756931435626cc568867fc76399c68b64fde994..ff6df12d115aecca886d2498ce36a8df54908ecb 100644 (file)
@@ -43,6 +43,9 @@
                A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; };
                A75790591301749D00A30DAB /* homovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75790581301749D00A30DAB /* homovacommand.cpp */; };
                A7730EFF13967241007433A3 /* countseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7730EFE13967241007433A3 /* countseqscommand.cpp */; };
+               A774101414695AF60098E6AC /* shhhseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774101314695AF60098E6AC /* shhhseqscommand.cpp */; };
+               A774104814696F320098E6AC /* myseqdist.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774104614696F320098E6AC /* myseqdist.cpp */; };
+               A77410F614697C300098E6AC /* seqnoise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77410F414697C300098E6AC /* seqnoise.cpp */; };
                A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; };
                A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */; };
                A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A79234D613C74BF6002B08E2 /* mothurfisher.cpp */; };
                A75790581301749D00A30DAB /* homovacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = homovacommand.cpp; sourceTree = "<group>"; };
                A7730EFD13967241007433A3 /* countseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = countseqscommand.h; sourceTree = "<group>"; };
                A7730EFE13967241007433A3 /* countseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = countseqscommand.cpp; sourceTree = "<group>"; };
+               A774101214695AF60098E6AC /* shhhseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = shhhseqscommand.h; sourceTree = "<group>"; };
+               A774101314695AF60098E6AC /* shhhseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = shhhseqscommand.cpp; sourceTree = "<group>"; };
+               A774104614696F320098E6AC /* myseqdist.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = myseqdist.cpp; sourceTree = "<group>"; };
+               A774104714696F320098E6AC /* myseqdist.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = myseqdist.h; sourceTree = "<group>"; };
+               A77410F414697C300098E6AC /* seqnoise.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqnoise.cpp; sourceTree = "<group>"; };
+               A77410F514697C300098E6AC /* seqnoise.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqnoise.h; sourceTree = "<group>"; };
                A778FE69134CA6CA00C0BA33 /* getcommandinfocommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getcommandinfocommand.h; sourceTree = "<group>"; };
                A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcommandinfocommand.cpp; sourceTree = "<group>"; };
                A77A221D139001B600B0BE70 /* deuniquetreecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deuniquetreecommand.h; sourceTree = "<group>"; };
                                A7E9B75C12D37EC400DA6239 /* mothur.h */,
                                A7E9B75D12D37EC400DA6239 /* mothurout.cpp */,
                                A7E9B75E12D37EC400DA6239 /* mothurout.h */,
+                               A774104714696F320098E6AC /* myseqdist.h */,
+                               A774104614696F320098E6AC /* myseqdist.cpp */,
                                A7E9B76112D37EC400DA6239 /* nast.cpp */,
                                A7E9B76212D37EC400DA6239 /* nast.hpp */,
                                A7E9B76312D37EC400DA6239 /* nastreport.cpp */,
                                A7E9B7AD12D37EC400DA6239 /* rarefactioncurvedata.h */,
                                7E6BE10812F710D8007ADDBE /* refchimeratest.h */,
                                7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */,
+                               A77410F514697C300098E6AC /* seqnoise.h */,
+                               A77410F414697C300098E6AC /* seqnoise.cpp */,
                                A7E9BA5312D39A5E00DA6239 /* read */,
                                A7E9B82312D37EC400DA6239 /* sharedutilities.cpp */,
                                A7E9B82412D37EC400DA6239 /* sharedutilities.h */,
                                A7E9B7F212D37EC400DA6239 /* sharedcommand.cpp */,
                                A7E9B82812D37EC400DA6239 /* shhhercommand.h */,
                                A7E9B82712D37EC400DA6239 /* shhhercommand.cpp */,
+                               A774101214695AF60098E6AC /* shhhseqscommand.h */,
+                               A774101314695AF60098E6AC /* shhhseqscommand.cpp */,
                                A7E9B84012D37EC400DA6239 /* splitabundcommand.h */,
                                A7E9B83F12D37EC400DA6239 /* splitabundcommand.cpp */,
                                A7E9B84212D37EC400DA6239 /* splitgroupscommand.h */,
                                A7FFB558142CA02C004884F2 /* summarytaxcommand.cpp in Sources */,
                                A7BF221414587886000AD524 /* myPerseus.cpp in Sources */,
                                A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */,
+                               A774101414695AF60098E6AC /* shhhseqscommand.cpp in Sources */,
+                               A774104814696F320098E6AC /* myseqdist.cpp in Sources */,
+                               A77410F614697C300098E6AC /* seqnoise.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
index 240ba65adf628099e8234ed7c8c7a692913d71ee..61970d53ac38a3f9877e5a010e48bc0b336dc669 100644 (file)
@@ -39,7 +39,7 @@ string ChimeraPerseusCommand::getHelpString(){
                helpString += "The chimera.perseus command reads a fastafile and namefile and outputs potentially chimeric sequences.\n";
                helpString += "The chimera.perseus command parameters are fasta, name, group, cutoff, processors, alpha and beta.\n";
                helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
-               helpString += "The name parameter allows you to provide a name file associated with your fasta file. If none is given unique.seqs will be run to generate one. \n";
+               helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n";
                helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
                helpString += "The group parameter allows you to provide a group file.  When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
                helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
@@ -640,7 +640,6 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& seque
                vector<bool> chimeras(numSeqs, 0);
                
                for(int i=0;i<numSeqs;i++){     
-                       cout << sequences[i].seqName << endl << sequences[i].sequence << endl << sequences[i].frequency << endl;
                        if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
                        
                        vector<bool> restricted = chimeras;
@@ -657,7 +656,6 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& seque
                        vector<pwAlign> alignments(numSeqs);
                        
                        int comparisons = myPerseus.getAlignments(i, sequences, alignments, leftDiffs, leftMaps, rightDiffs, rightMaps, bestSingleIndex, bestSingleDiff, restricted);
-                       cout << "comparisons = " << comparisons << endl;
                        if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
 
                        int minMismatchToChimera, leftParentBi, rightParentBi, breakPointBi;
@@ -666,7 +664,6 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& seque
                        
                        if(comparisons >= 2){   
                                minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
-                               cout << "minMismatchToChimera = " << minMismatchToChimera << endl;
                                if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
 
                                int minMismatchToTrimera = numeric_limits<int>::max();
@@ -674,12 +671,11 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& seque
                                
                                if(minMismatchToChimera >= 3 && comparisons >= 3){
                                        minMismatchToTrimera = myPerseus.getTrimera(sequences, leftDiffs, leftParentTri, middleParentTri, rightParentTri, breakPointTriA, breakPointTriB, singleLeft, bestLeft, singleRight, bestRight, restricted);
-                                       cout << "minMismatchToTrimera = " << minMismatchToTrimera << endl;
                                        if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
                                }
                                
                                double singleDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, sequences[bestSingleIndex].sequence, dummyA, dummyB, correctModel);
-                               cout << "singleDist = " << singleDist << endl;
+                               
                                if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
 
                                string type;
@@ -693,16 +689,16 @@ int ChimeraPerseusCommand::driver(string chimeraFileName, vector<seqData>& seque
                                        type = "chimera";
                                        chimeraRefSeq = myPerseus.stitchBimera(alignments, leftParentBi, rightParentBi, breakPointBi, leftMaps, rightMaps);
                                }
-                               cout << "chimeraRefSeq = " << chimeraRefSeq << endl;
+                               ;
                                if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
                                
                                double chimeraDist = myPerseus.modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq, dummyA, dummyB, correctModel);
-                               cout << "chimeraDist = " << chimeraDist << endl;
+                               
                                if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
 
                                double cIndex = chimeraDist;//modeledPairwiseAlignSeqs(sequences[i].sequence, chimeraRefSeq);
                                double loonIndex = myPerseus.calcLoonIndex(sequences[i].sequence, sequences[leftParentBi].sequence, sequences[rightParentBi].sequence, breakPointBi, binMatrix);                
-                               cout << "loonIndex = " << loonIndex << endl;
+                               
                                if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
 
                                chimeraFile << i << '\t' << sequences[i].seqName << '\t' << bestSingleDiff << '\t' << bestSingleIndex << '\t' << sequences[bestSingleIndex].seqName << '\t';
@@ -841,7 +837,7 @@ int ChimeraPerseusCommand::createProcessesGroups(SequenceParser& parser, string
                
                //Wait until all threads have terminated.
                WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
-               cout << "here" << endl; 
+                       
                //Close all thread handles and free memory allocations.
                for(int i=0; i < pDataArray.size(); i++){
                        num += pDataArray[i]->count;
index ee7add9ebd4ff55ea37312d3194bbd4df91270e2..54b1d9b4caace40335934db39c6306687ddafdf5 100644 (file)
@@ -560,12 +560,19 @@ int ChimeraUchimeCommand::execute(){
                        
                                int numSeqs = 0;
                                int numChimeras = 0;
-       //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+
                                if(processors == 1){ numSeqs = driver(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras); }
                                else{   numSeqs = createProcesses(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras); }
-       //#else
-       //                      numSeqs = driver(outputFileName, fastaFileNames[s], accnosFileName, alnsFileName, numChimeras);
-       //#endif
+                               
+                               //add headings
+                               ofstream out;
+                               m->openOutputFile(outputFileName+".temp", out); 
+                               out << "Score\tQuery\tParentA\tParentB\tIdQM\tIdQA\tIdQB\tIdAB\tIdQT\tLY\tLN\tLA\tRY\tRN\tRA\tDiv\tYN\n";
+                               out.close();
+                               
+                               m->appendFiles(outputFileName, outputFileName+".temp");
+                               m->mothurRemove(outputFileName); rename((outputFileName+".temp").c_str(), outputFileName.c_str());
+                               
                                if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        } return 0; }
                        
                                //remove file made for uchime
@@ -653,6 +660,7 @@ int ChimeraUchimeCommand::deconvoluteResults(SequenceParser& parser, string outp
                
                ofstream out;
                m->openOutputFile(outputFileName+".temp", out); out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
+               out << "Score\tQuery\tParentA\tParentB\tIdQM\tIdQA\tIdQB\tIdAB\tIdQT\tLY\tLN\tLA\tRY\tRN\tRA\tDiv\tYN\n";
                
                float temp1;
                string parent1, parent2, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12, temp13, flag;
@@ -985,17 +993,24 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
                alns = "\"" + alns + "\"";
                                
                vector<char*> cPara;
-       
-               char* tempUchime;
+               
+               string path = m->argv;
+               string tempPath = path;
+               for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
+               path = path.substr(0, (tempPath.find_last_of('m')));
+               
+               string uchimeCommand = path;
 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-               tempUchime= new char[10];  
-               *tempUchime = '\0';
-               strncat(tempUchime, "./uchime ", 9); 
+               uchimeCommand += "uchime ";
 #else
-               tempUchime= new char[8]; 
-               *tempUchime = '\0';
-               strncat(tempUchime, "uchime ", 7); 
+               uchimeCommand += "uchime";
+               uchimeCommand = "\"" + uchimeCommand + "\"";
 #endif
+               
+               char* tempUchime;
+               tempUchime= new char[uchimeCommand.length()+1]; 
+               *tempUchime = '\0';
+               strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length());
                cPara.push_back(tempUchime);
                
                char* tempIn = new char[8]; 
@@ -1223,6 +1238,10 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
                
                //uchime_main(numArgs, uchimeParameters); 
                //cout << "commandString = " << commandString << endl;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+               commandString = "\"" + commandString + "\"";
+#endif
                system(commandString.c_str());
                
                //free memory
index 359b68c9c97f144f0914f4ff309c973ca2ec5d60..6e1f809ce55a332020240bdc0b5e30398b5d5b49 100644 (file)
@@ -48,6 +48,7 @@ private:
        string fastafile, groupfile, templatefile, outputDir, namefile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract;
        int processors;
        
+       
        vector<string> outputNames;
        vector<string> fastaFileNames;
        vector<string> nameFileNames;
@@ -178,10 +179,23 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                        
                        vector<char*> cPara;
                        
+                       string path = pDataArray->m->argv;
+                       string tempPath = path;
+                       for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
+                       path = path.substr(0, (tempPath.find_last_of('m')));
+                       
+                       string uchimeCommand = path;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       uchimeCommand += "uchime ";
+#else
+                       uchimeCommand += "uchime";
+                       uchimeCommand = "\"" + uchimeCommand + "\"";
+#endif                 
+                       
                        char* tempUchime;
-                       tempUchime= new char[8]; 
+                       tempUchime= new char[uchimeCommand.length()+1]; 
                        *tempUchime = '\0';
-                       strncat(tempUchime, "uchime ", 7); 
+                       strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length());
                        cPara.push_back(tempUchime);
                        
                        char* tempIn = new char[8]; 
@@ -396,6 +410,10 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                        
                        //uchime_main(numArgs, uchimeParameters); 
                        //cout << "commandString = " << commandString << endl;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+                       commandString = "\"" + commandString + "\"";
+#endif
                        system(commandString.c_str());
                        
                        //free memory
index 69bb568e761f590e2419ee61079783679c914efa..364c5ed2eae3f9cb90dc4db6d3edced157c2168d 100644 (file)
 #include "clearmemorycommand.h"
 #include "summarytaxcommand.h"
 #include "chimeraperseuscommand.h"
+#include "shhhseqscommand.h"
 
 /*******************************************************/
 
@@ -268,7 +269,8 @@ CommandFactory::CommandFactory(){
        commands["shhh.flows"]                  = "MPIEnabled";
        commands["sens.spec"]                   = "sens.spec";
        commands["seq.error"]                   = "seq.error";
-       commands["seq.error"]                   = "summary.tax";
+       commands["summary.tax"]                 = "summary.tax";
+       commands["shhh.seqs"]                   = "shhh.seqs";
        
        commands["quit"]                                = "MPIEnabled"; 
 
@@ -428,6 +430,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
                else if(commandName == "clear.memory")                  {       command = new ClearMemoryCommand(optionString);                         }
                else if(commandName == "summary.tax")                   {       command = new SummaryTaxCommand(optionString);                          }
                else if(commandName == "chimera.perseus")               {       command = new ChimeraPerseusCommand(optionString);                      }
+               else if(commandName == "shhh.seqs")                             {       command = new ShhhSeqsCommand(optionString);                            }
                else                                                                                    {       command = new NoCommand(optionString);                                          }
 
                return command;
@@ -570,6 +573,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str
                else if(commandName == "clear.memory")                  {       pipecommand = new ClearMemoryCommand(optionString);                             }
                else if(commandName == "summary.tax")                   {       pipecommand = new SummaryTaxCommand(optionString);                              }
                else if(commandName == "chimera.perseus")               {       pipecommand = new ChimeraPerseusCommand(optionString);                  }
+               else if(commandName == "shhh.seqs")                             {       pipecommand = new ShhhSeqsCommand(optionString);                                }
                else                                                                                    {       pipecommand = new NoCommand(optionString);                                              }
 
                return pipecommand;
@@ -700,6 +704,7 @@ Command* CommandFactory::getCommand(string commandName){
                else if(commandName == "clear.memory")                  {       shellcommand = new ClearMemoryCommand();                        }
                else if(commandName == "summary.tax")                   {       shellcommand = new SummaryTaxCommand();                         }
                else if(commandName == "chimera.perseus")               {       shellcommand = new ChimeraPerseusCommand();                     }
+               else if(commandName == "shhh.seqs")                             {       shellcommand = new ShhhSeqsCommand();                           }
                else                                                                                    {       shellcommand = new NoCommand();                                         }
 
                return shellcommand;
index 66d8af15d1c3b2ffc67e747bb14c9ee7ca0f2480..f55f7144fa548fe30eae2b075a8abcf8fd4584f0 100644 (file)
@@ -24,7 +24,7 @@
 //custom data structure for threads to use.
 // This is passed by void pointer so it can be any data type
 // that can be passed using a single void pointer (LPVOID).
-typedef struct distanceData {
+struct distanceData {
        int startLine;
        int endLine;
        string dFileName;
index 975924bae580f11271c84ebe79a9897dfe71f58d..0ff8b3f7b92369beff5414b605c3696ca14dd50f 100644 (file)
@@ -394,7 +394,7 @@ int MetaStatsCommand::driver(int start, int num, vector<SharedRAbundVector*>& th
                        //get set names
                        string setA = namesOfGroupCombos[c][0]; 
                        string setB = namesOfGroupCombos[c][1];
-               cout << setA << '\t' << setB << endl;
+               //cout << setA << '\t' << setB << endl;
                        //get filename
                        string outputFileName = outputDir +  m->getRootName(m->getSimpleName(sharedfile)) + thisLookUp[0]->getLabel() + "." + setA + "-" + setB + ".metastats";
                        outputNames.push_back(outputFileName); outputTypes["metastats"].push_back(outputFileName);
@@ -425,8 +425,8 @@ int MetaStatsCommand::driver(int start, int num, vector<SharedRAbundVector*>& th
                                }
                        }
                        
-                       for (int i = 0; i < subset.size(); i++) { cout << designMap->getGroup(subset[i]->getGroup()) << endl; }
-                       cout << setACount << endl;
+                       //for (int i = 0; i < subset.size(); i++) { cout << designMap->getGroup(subset[i]->getGroup()) << endl; }
+                       //cout << setACount << endl;
                        
                        if ((setACount == 0) || (setBCount == 0))  { 
                                m->mothurOut("Missing shared info for " + setA + " or " + setB + ". Skipping comparison."); m->mothurOutEndLine(); 
index f8ec32363516bcc6543569b57f313505796acef7..57ece6785e1e798e310797b747e6b8d049c0113d 100644 (file)
--- a/mothur.h
+++ b/mothur.h
@@ -85,6 +85,7 @@ using namespace std;
 #define isnan(x) ((x) != (x))
 #define isinf(x) (fabs(x) == std::numeric_limits<double>::infinity())
 
+
 typedef unsigned long ull;
 
 struct IntNode {
index b342393aa9a82e3dcb00190f6648edc7110779b3..3cf38cdb9d752652cc70a865ce6b1b87e1da939b 100644 (file)
@@ -548,7 +548,7 @@ int Perseus::getChimera(vector<seqData> sequences,
                rightParent = -1;
                breakPoint = -1;
                
-               for(int l=0;l<seqLength;l++){
+               for(int l=0;l<seqLength-1;l++){
                        
                        if (m->control_pressed) { return 0; }
                        
diff --git a/myseqdist.cpp b/myseqdist.cpp
new file mode 100644 (file)
index 0000000..f5c8b40
--- /dev/null
@@ -0,0 +1,416 @@
+/*
+ *  pds.seqdist.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 8/12/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "myseqdist.h"
+#include "sequence.hpp"
+
+/**************************************************************************************************/
+correctDist::correctDist(int p) : processors(p) {
+       try {
+               m = MothurOut::getInstance();
+       }
+       catch(exception& e) {
+               m->errorOut(e, "correctDist", "correctDist");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+correctDist::correctDist(string sequenceFileName, int p) : processors(p) {
+       try {
+               m = MothurOut::getInstance();
+               getSequences(sequenceFileName);
+       }
+       catch(exception& e) {
+               m->errorOut(e, "correctDist", "correctDist");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int correctDist::addSeq(string seqName, string seqSeq){
+       try {
+               names.push_back(seqName);
+               sequences.push_back(fixSequence(seqSeq));
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "correctDist", "addSeq");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int correctDist::execute(string distanceFileName){
+       try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+               processors = 1;
+#endif
+               correctMatrix.resize(4);
+               for(int i=0;i<4;i++){   correctMatrix[i].resize(4);     }
+               
+               correctMatrix[0][0] = 0.000000;         //AA
+               correctMatrix[1][0] = 11.619259;        //CA
+               correctMatrix[2][0] = 11.694004;        //TA
+               correctMatrix[3][0] = 7.748623;         //GA
+               
+               correctMatrix[1][1] = 0.000000;         //CC
+               correctMatrix[2][1] = 7.619657;         //TC
+               correctMatrix[3][1] = 12.852562;        //GC
+               
+               correctMatrix[2][2] = 0.000000;         //TT
+               correctMatrix[3][2] = 10.964048;        //TG
+               
+               correctMatrix[3][3] = 0.000000;         //GG
+               
+               for(int i=0;i<4;i++){
+                       for(int j=0;j<i;j++){
+                               correctMatrix[j][i] = correctMatrix[i][j];
+                       }
+               }
+               
+               numSeqs = names.size();
+                               
+               if(processors == 1){ driver(0, numSeqs, distanceFileName); }
+               else{
+                       
+                       for(int i=0;i<processors;i++){
+                               start.push_back(int (sqrt(float(i)/float(processors)) * numSeqs));
+                               end.push_back(int (sqrt(float(i+1)/float(processors)) * numSeqs));
+                       }
+                       
+                       createProcess(distanceFileName);
+               }
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "correctDist", "execute");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int correctDist::getSequences(string sequenceFileName){
+       try {
+               ifstream sequenceFile;
+               m->openInputFile(sequenceFileName, sequenceFile);
+               string seqName, seqSeq;
+               
+               while(!sequenceFile.eof()){
+                       if (m->control_pressed) { break; }
+                       
+                       Sequence temp(sequenceFile); m->gobble(sequenceFile);
+                       
+                       if (temp.getName() != "") {
+                               names.push_back(temp.getName());
+                               sequences.push_back(fixSequence(temp.getAligned()));
+                       }
+               }
+               sequenceFile.close();
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "correctDist", "getSequences");
+               exit(1);
+       }       
+}
+
+/**************************************************************************************************/
+vector<int> correctDist::fixSequence(string sequence){
+       try {
+               int alignLength = sequence.length();
+               
+               vector<int> seqVector;
+               
+               for(int i=0;i<alignLength;i++){
+                       if(sequence[i] == 'A')          {       seqVector.push_back(0);         }
+                       else if(sequence[i] == 'C')     {       seqVector.push_back(1);         }
+                       else if(sequence[i] == 'T')     {       seqVector.push_back(2);         }
+                       else if(sequence[i] == 'G')     {       seqVector.push_back(3);         }
+                       else if(sequence[i] == 'N')     {       seqVector.push_back(0);         }//hmmmm....
+               }
+               
+               return seqVector;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "correctDist", "fixSequence");
+               exit(1);
+       }       
+}
+
+/**************************************************************************************************/
+
+int correctDist::createProcess(string distanceFileName){
+       try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               int process = 1;
+               vector<int> processIDs;
+               
+               while(process != processors){
+                       
+                       int pid = fork();
+                       
+                       if(pid > 0){
+                               processIDs.push_back(pid);
+                               process++;
+                       }
+                       else if(pid == 0){
+                               driver(start[process], end[process], distanceFileName + toString(getpid()) + ".temp");
+                               exit(0);
+                       }
+                       else{
+                               cout << "Doh!" << endl;
+                               for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill(temp, SIGINT); }
+                               exit(0);
+                       }
+               }
+               
+               driver(start[0], end[0], distanceFileName);
+               
+               for (int i=0;i<processIDs.size();i++) { 
+                       int temp = processIDs[i];
+                       wait(&temp);
+               }
+               
+               for(int i=0;i<processIDs.size();i++){
+                       m->appendFiles((distanceFileName + toString(processIDs[i]) + ".temp"), distanceFileName);
+                       remove((distanceFileName + toString(processIDs[i]) + ".temp").c_str());
+               }
+#endif
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "correctDist", "createProcess");
+               exit(1);
+       }       
+}
+
+/**************************************************************************************************/
+
+int correctDist::driver(int start, int end, string distFileName){
+       try {
+               ofstream distFile;
+               m->openOutputFile(distFileName, distFile);
+               distFile << setprecision(9);
+               
+               if(start == 0){
+                       distFile << numSeqs << endl;
+               }
+               
+               int startTime = time(NULL);
+               
+               m->mothurOut("\nCalculating distances...\n");
+               
+               for(int i=start;i<end;i++){
+                       
+                       distFile << i;
+                       
+                       for(int j=0;j<i;j++){
+                               
+                               if (m->control_pressed) { distFile.close(); return 0; }
+                               
+                               double dist = getDist(sequences[i], sequences[j]);
+                               
+                               distFile << ' ' << dist;
+                       }
+                       distFile << endl;
+                       
+                       if(i % 100 == 0){ m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); }
+               }
+               distFile.close();
+               
+               if((end-1) % 100 != 0){ m->mothurOut(toString(end-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine(); }
+               m->mothurOut("Done.\n");
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "correctDist", "driver");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+double correctDist::getDist(vector<int>& seqA, vector<int>& seqB){
+       try {
+               
+               int lengthA = seqA.size();
+               int lengthB = seqB.size();
+               
+               vector<vector<double> > alignMatrix(lengthA+1);
+               vector<vector<char> > alignMoves(lengthA+1);
+               
+               for(int i=0;i<=lengthA;i++){
+                       alignMatrix[i].resize(lengthB+1, 0);
+                       alignMoves[i].resize(lengthB+1, 'x');
+               }
+               
+               for(int i=0;i<=lengthA;i++){
+                       alignMatrix[i][0] = 15.0 * i;
+                       alignMoves[i][0] = 'u';
+               }
+               for(int i=0;i<=lengthB;i++){
+                       alignMatrix[0][i] = 15.0 * i;
+                       alignMoves[0][i] = 'l';
+               }
+               
+               for(int i=1;i<=lengthA;i++){
+                       for(int j=1;j<=lengthB;j++){
+                               
+                               if (m->control_pressed) {  return 0;  }
+                               
+                               double nogap;           
+                               nogap = alignMatrix[i-1][j-1] + correctMatrix[seqA[i-1]][seqB[j-1]];
+                               
+                               
+                               double gap;
+                               double left;
+                               if(i == lengthA){ //terminal gap
+                                       left = alignMatrix[i][j-1];
+                               }
+                               else{
+                                       if(seqB[j-1] == getLastMatch('l', alignMoves, i, j, seqA, seqB)){
+                                               gap = 4.0;
+                                       }
+                                       else{
+                                               gap = 15.0;
+                                       }
+                                       
+                                       left = alignMatrix[i][j-1] + gap;
+                               }
+                               
+                               
+                               double up;
+                               if(j == lengthB){ //terminal gap
+                                       up = alignMatrix[i-1][j];
+                               }
+                               else{
+                                       
+                                       if(seqA[i-1] == getLastMatch('u', alignMoves, i, j, seqA, seqB)){
+                                               gap = 4.0;
+                                       }
+                                       else{
+                                               gap = 15.0;
+                                       }
+                                       
+                                       up = alignMatrix[i-1][j] + gap;
+                               }
+                               
+                               
+                               
+                               if(nogap < left){
+                                       if(nogap < up){
+                                               alignMoves[i][j] = 'd';
+                                               alignMatrix[i][j] = nogap;
+                                       }
+                                       else{
+                                               alignMoves[i][j] = 'u';
+                                               alignMatrix[i][j] = up;
+                                       }
+                               }
+                               else{
+                                       if(left < up){
+                                               alignMoves[i][j] = 'l';
+                                               alignMatrix[i][j] = left;
+                                       }
+                                       else{
+                                               alignMoves[i][j] = 'u';
+                                               alignMatrix[i][j] = up;
+                                       }
+                               }
+                       }
+               }
+               
+               int i = lengthA;
+               int j = lengthB;
+               int count = 0;
+               
+               
+               //      string alignA = "";
+               //      string alignB = "";
+               //      string bases = "ACTG";
+               //      
+               //      for(int i=0;i<lengthA;i++){
+               //              cout << bases[seqA[i]];
+               //      }cout << endl;
+               //
+               //      for(int i=0;i<lengthB;i++){
+               //              cout << bases[seqB[i]];
+               //      }cout << endl;
+               
+               while(i > 0 && j > 0){
+                       
+                       if (m->control_pressed) {  return 0;  }
+                       
+                       if(alignMoves[i][j] == 'd'){
+                               //                      alignA = bases[seqA[i-1]] + alignA;
+                               //                      alignB = bases[seqB[j-1]] + alignB;
+                               
+                               count++;
+                               i--;
+                               j--;
+                       }
+                       else if(alignMoves[i][j] == 'u'){
+                               if(j != lengthB){
+                                       //                              alignA = bases[seqA[i-1]] + alignA;
+                                       //                              alignB = '-' + alignB;
+                                       count++;
+                               }
+                               
+                               i--;
+                       }
+                       else if(alignMoves[i][j] == 'l'){
+                               if(i != lengthA){
+                                       //                              alignA = '-' + alignA;
+                                       //                              alignB = bases[seqB[j-1]] + alignB;
+                                       count++;
+                               }
+                               
+                               j--;
+                       }
+               }
+               
+               //      cout << alignA << endl << alignB << endl;
+               
+               return alignMatrix[lengthA][lengthB] / (double)count;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "correctDist", "getDist");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+int correctDist::getLastMatch(char direction, vector<vector<char> >& alignMoves, int i, int j, vector<int>& seqA, vector<int>& seqB){
+       try {
+               
+               char nullReturn = -1;
+               
+               while(i>=1 && j>=1){
+                       
+                       if (m->control_pressed) { return nullReturn; }
+                       
+                       if(direction == 'd'){
+                               if(seqA[i-1] == seqB[j-1])      {       return seqA[i-1];       }
+                               else                                            {       return nullReturn;      }
+                       }
+                       
+                       else if(direction == 'l')               {       j--;                            }
+                       else                                                    {       i--;                            }
+                       
+                       direction = alignMoves[i][j];
+               }
+               
+               return nullReturn;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "correctDist", "getLastMatch");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+
+
+
diff --git a/myseqdist.h b/myseqdist.h
new file mode 100644 (file)
index 0000000..74c9ea7
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef CORRECTDIST_H
+#define CORRECTDIST_H
+
+
+/*
+ *  pds.seqdist.h
+ *  
+ *
+ *  Created by Pat Schloss on 8/12/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "mothurout.h"
+
+/**************************************************************************************************/
+
+class correctDist {
+public:
+       correctDist(string, int);
+       correctDist(int);
+       ~correctDist(){}
+       
+       int addSeq(string, string);
+       int execute(string);
+       
+private:
+       MothurOut* m;
+       int getSequences(string);
+       vector<int> fixSequence(string);
+       
+       int driver(int, int, string);
+       int createProcess(string);
+       
+       double getDist(vector<int>&, vector<int>&);
+       int getLastMatch(char, vector<vector<char> >&, int, int, vector<int>&, vector<int>&);
+       
+       vector<vector<double> > correctMatrix;
+       
+       vector<vector<int> > sequences;
+       
+       vector<string> names;   
+       int numSeqs;
+       int processors;
+       vector<int> start;
+       vector<int> end;
+};
+
+/**************************************************************************************************/
+
+#endif
+
+
diff --git a/seqnoise.cpp b/seqnoise.cpp
new file mode 100644 (file)
index 0000000..578daaf
--- /dev/null
@@ -0,0 +1,959 @@
+/*
+ *  mySeqNoise.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 8/31/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "seqnoise.h"
+#include "sequence.hpp"
+
+#define MIN_DELTA 1.0e-6
+#define MIN_ITER 20
+#define MAX_ITER 1000
+#define MIN_COUNT 0.1
+#define MIN_TAU   1.0e-4
+#define MIN_WEIGHT 0.1
+
+
+/**************************************************************************************************/
+int seqNoise::getSequenceData(string sequenceFileName, vector<string>& sequences){
+       try {
+               
+               ifstream sequenceFile;
+               m->openInputFile(sequenceFileName, sequenceFile);
+               
+               while(!sequenceFile.eof()){
+                       
+                       if (m->control_pressed) { break; }
+                       
+                       Sequence temp(sequenceFile); m->gobble(sequenceFile);
+                       
+                       if (temp.getName() != "") {
+                               sequences.push_back(temp.getAligned());
+                       }
+               }
+               sequenceFile.close();
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "getSequenceData");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int seqNoise::addSeq(string seq, vector<string>& sequences){ 
+       try {
+               sequences.push_back(seq);
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "addSeq");
+               exit(1);
+       }
+}      
+/**************************************************************************************************/
+//no checks for file mismatches
+int seqNoise::getRedundantNames(string namesFileName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){
+       try {
+               string unique, redundant;
+               ifstream namesFile;
+               m->openInputFile(namesFileName, namesFile);
+               
+               for(int i=0;i<redundantNames.size();i++){
+                       
+                       if (m->control_pressed) { break; }
+                       
+                       namesFile >> uniqueNames[i]; m->gobble(namesFile);
+                       namesFile >> redundantNames[i]; m->gobble(namesFile);
+                       
+                       seqFreq[i] = m->getNumNames(redundantNames[i]);
+               }
+               namesFile.close();
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "getRedundantNames");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int seqNoise::addRedundantName(string uniqueName, string redundantName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){ 
+       try {
+               
+               uniqueNames.push_back(uniqueName);
+               redundantNames.push_back(redundantName);
+               seqFreq.push_back(m->getNumNames(redundantName));
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "addRedundantName");
+               exit(1);
+       }
+}      
+/**************************************************************************************************/
+int seqNoise::getDistanceData(string distFileName, vector<double>& distances){
+       try {
+               
+               ifstream distFile;
+               m->openInputFile(distFileName, distFile);
+               
+               int numSeqs = 0;
+               string name = "";
+               
+               distFile >> numSeqs;
+               
+               for(int i=0;i<numSeqs;i++){
+                       
+                       if (m->control_pressed) {  break; }
+                       
+                       distances[i * numSeqs + i] = 0.0000;
+                       
+                       distFile >> name;
+                       
+                       for(int j=0;j<i;j++){
+                               distFile >> distances[i * numSeqs + j];
+                               distances[j * numSeqs + i] = distances[i * numSeqs + j];
+                       }
+               }
+               
+               distFile.close();       
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "getDistanceData");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+int seqNoise::getListData(string listFileName, double cutOff, vector<int>& otuData, vector<int>& otuFreq, vector<vector<int> >& otuBySeqLookUp){
+       try {
+               
+               ifstream listFile;
+               m->openInputFile(listFileName, listFile);
+               double threshold;
+               int numOTUs;
+               
+               if(listFile.peek() == 'u'){     m->getline(listFile);   }
+               
+               while(listFile){
+                       listFile >> threshold;
+                       
+                       if(threshold < cutOff){
+                               m->getline(listFile);   
+                       }
+                       else{
+                               listFile >> numOTUs;
+                               otuFreq.resize(numOTUs, 0);
+                               
+                               for(int i=0;i<numOTUs;i++){
+                                       
+                                       if (m->control_pressed) { return 0; }
+                                       
+                                       string otu;
+                                       listFile >> otu;
+                                       
+                                       int count = 0;
+                                       
+                                       string number = "";
+                                       
+                                       for(int j=0;j<otu.size();j++){
+                                               if(otu[j] != ','){
+                                                       number += otu[j];
+                                               }
+                                               else{
+                                                       int index = atoi(number.c_str());
+                                                       otuData[index] = i;
+                                                       count++;
+                                                       number = "";
+                                               }
+                                       }
+                                       
+                                       int index = atoi(number.c_str());
+                                       otuData[index] = i;
+                                       count++;
+                                       
+                                       otuFreq[i] = count;
+                               }
+                               
+                               otuBySeqLookUp.resize(numOTUs);
+                               
+                               int numSeqs = otuData.size();
+                               
+                               for(int i=0;i<numSeqs;i++){
+                                       if (m->control_pressed) { return 0; }
+                                       otuBySeqLookUp[otuData[i]].push_back(i);
+                               }
+                               for(int i=0;i<numOTUs;i++){
+                                       if (m->control_pressed) { return 0; }
+                                       for(int j=otuBySeqLookUp[i].size();j<numSeqs;j++){
+                                               otuBySeqLookUp[i].push_back(0);
+                                       }
+                               }
+                               
+                               break;
+                       }
+               }
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "getListData");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+int seqNoise::updateOTUCountData(vector<int> otuFreq,
+                                                                vector<vector<int> > otuBySeqLookUp,
+                                                                vector<vector<int> > aanI,
+                                                                vector<int>& anP,
+                                                                vector<int>& anI,
+                                                                vector<int>& cumCount
+                                                                ){
+       try {
+               int numOTUs = otuFreq.size();
+               
+               int count = 0;
+               
+               for(int i=0;i<numOTUs;i++){
+                       cumCount[i] = count;
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       for(int j=0;j<otuFreq[i];j++){
+                               anP[count] = otuBySeqLookUp[i][j];
+                               anI[count] = aanI[i][j];
+                               
+                               count++;
+                       }
+               }
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "updateOTUCountData");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+double seqNoise::calcNewWeights(
+                                         vector<double>& weights,      //
+                                         vector<int> seqFreq,          //
+                                         vector<int> anI,                      //
+                                         vector<int> cumCount,         //
+                                         vector<int> anP,                      //
+                                         vector<int> otuFreq,          //
+                                         vector<double> tau            //
+                                         ){
+       try {
+               
+               int numOTUs = weights.size();
+               double maxChange = -1;
+               
+               cout.flush();
+               
+               for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       double change = weights[i];
+                       
+                       weights[i] = 0.0000;
+                       
+                       for(int j=0;j<otuFreq[i];j++){
+                               
+                               int index1 = cumCount[i] + j;
+                               int index2 = anI[index1];
+                               
+                               double currentTau = tau[anP[index1]];
+                               double freq = double(seqFreq[index2]);
+                               
+                               weights[i] += currentTau * freq;
+                       }
+                       change = fabs(weights[i] - change);
+                       
+                       if(change > maxChange){ maxChange = change;     }
+                       cout.flush();
+               }
+               return maxChange;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "calcNewWeights");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::calcCentroids(
+                                  vector<int> anI,
+                                  vector<int> anP,
+                                  vector<int>& change, 
+                                  vector<int>& centroids, 
+                                  vector<int> cumCount,
+                                  vector<double> distances,///
+                                  vector<int> seqFreq, 
+                                  vector<int> otuFreq, 
+                                  vector<double> tau 
+                                  ){
+       try {
+               int numOTUs = change.size();
+               int numSeqs = seqFreq.size();
+               
+               for(int i=0;i<numOTUs;i++){
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       int minFIndex = -1;
+                       double minFValue = 1e10;
+                       
+                       change[i] = 0;
+                       double count = 0.00000;
+                       
+                       int freqOfOTU = otuFreq[i];
+                       
+                       for(int j=0;j<freqOfOTU;j++){
+                               int index = cumCount[i] + j;
+                               count += seqFreq[anI[index]]*tau[anP[index]];
+                       }
+                       
+                       if(freqOfOTU > 0 && count > MIN_COUNT){
+                               
+                               vector<double> adF(freqOfOTU);
+                               vector<int> anL(freqOfOTU);
+                               
+                               for(int j=0;j<freqOfOTU;j++){
+                                       anL[j] = anI[cumCount[i] + j];
+                                       adF[j] = 0.0000;
+                               }
+                               
+                               for(int j=0;j<freqOfOTU;j++){           
+                                       int index = cumCount[i] + j;
+                                       double curTau = tau[anP[index]];
+                                       
+                                       for(int k=0;k<freqOfOTU;k++){
+                                               double dist = distances[anL[j]*numSeqs + anL[k]];
+                                               
+                                               adF[k] += dist * curTau * seqFreq[anL[j]];
+                                       }
+                               }
+                               
+                               for(int j=0;j<freqOfOTU;j++){
+                                       if(adF[j] < minFValue){
+                                               minFIndex = j;
+                                               minFValue = adF[j];
+                                       }
+                               }
+                               
+                               if(centroids[i] != anL[minFIndex]){
+                                       change[i] = 1;
+                                       centroids[i] = anL[minFIndex];
+                               }
+                       }
+                       else if(centroids[i] != -1){
+                               change[i] = 1;
+                               centroids[i] = -1;                      
+                       }
+               }
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "calcCentroids");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::checkCentroids(vector<double>& weights, vector<int> centroids){
+       try {
+               int numOTUs = centroids.size();
+               vector<int> unique(numOTUs, 1);
+               
+               double minWeight = MIN_WEIGHT;
+               for(int i=0;i<numOTUs;i++){
+                       if (m->control_pressed) { return 0; }
+                       if(weights[i] < minWeight){     unique[i] = -1; }
+               }
+               
+               for(int i=0;i<numOTUs;i++){
+                       if (m->control_pressed) { return 0; }
+                       if(unique[i] == 1){
+                               for(int j=i+1; j<numOTUs;j++){
+                                       if(unique[j] == 1){
+                                               if(centroids[i] == centroids[j]){
+                                                       unique[j] = 0;
+                                                       weights[i] += weights[j];
+                                                       weights[j] = 0;
+                                               }
+                                       }
+                               }
+                       }               
+               }
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "checkCentroids");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::setUpOTUData(vector<int>& otuData, vector<double>& percentage, vector<int> cumCount, vector<double> tau, vector<int> otuFreq, vector<int> anP, vector<int> anI){
+       try {
+
+               int numOTUs = cumCount.size();
+               int numSeqs = otuData.size();
+               
+               vector<double> bestTau(numSeqs, 0);
+               vector<double> bestIndex(numSeqs, -1);
+               
+               for(int i=0;i<numOTUs;i++){
+                       if (m->control_pressed) { return 0; }
+                       for(int j=0;j<otuFreq[i];j++){
+                               
+                               int index1 = cumCount[i] + j;
+                               double thisTau = tau[anP[index1]];
+                               int index2 = anI[index1];
+                               
+                               if(thisTau > bestTau[index2]){
+                                       bestTau[index2] = thisTau;
+                                       bestIndex[index2] = i;
+                               }
+                       }               
+               }
+               
+               for(int i=0;i<numSeqs;i++){
+                       if (m->control_pressed) { return 0; }
+                       otuData[i] = bestIndex[i];
+                       percentage[i] = 1 - bestTau[i];
+               }
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "setUpOTUData");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::finishOTUData(vector<int> otuData, vector<int>& otuFreq, vector<int>& anP, vector<int>& anI, vector<int>& cumCount, vector<vector<int> >& otuBySeqLookUp, vector<vector<int> >& aanI, vector<double>& tau){
+       try {
+               int numSeqs = otuData.size();
+               int numOTUs = otuFreq.size();
+               int total = numSeqs;
+               
+               otuFreq.assign(numOTUs, 0);
+               tau.assign(numSeqs, 1);
+               anP.assign(numSeqs, 0);
+               anI.assign(numSeqs, 0);
+               
+               for(int i=0;i<numSeqs;i++){
+                       if (m->control_pressed) { return 0; }
+                       int otu = otuData[i];
+                       total++;
+                       
+                       otuBySeqLookUp[otu][otuFreq[otu]] = i;
+                       aanI[otu][otuFreq[otu]] = i;
+                       otuFreq[otu]++;
+               }
+               updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "finishOTUData");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::getLastMatch(char direction, vector<vector<char> >& alignMoves, int i, int j, vector<int>& seqA, vector<int>& seqB){
+       try{
+               char nullReturn = -1;
+               
+               while(i>=1 && j>=1){
+                       if (m->control_pressed) { return nullReturn; }
+                       if(direction == 'd'){
+                               if(seqA[i-1] == seqB[j-1])      {       return seqA[i-1];       }
+                               else                                            {       return nullReturn;      }
+                       }
+                       
+                       else if(direction == 'l')               {       j--;                            }
+                       else                                                    {       i--;                            }
+                       
+                       direction = alignMoves[i][j];
+               }
+               
+               return nullReturn;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "getLastMatch");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::countDiffs(vector<int> query, vector<int> ref){
+       try {
+               //double MATCH = 5.0;
+               //double MISMATCH = -2.0;
+               //double GAP = -2.0;
+               
+               vector<vector<double> > correctMatrix(4);
+               for(int i=0;i<4;i++){   correctMatrix[i].resize(4);     }
+               
+               correctMatrix[0][0] = 0.000000;         //AA
+               correctMatrix[1][0] = 11.619259;        //CA
+               correctMatrix[2][0] = 11.694004;        //TA
+               correctMatrix[3][0] = 7.748623;         //GA
+               
+               correctMatrix[1][1] = 0.000000;         //CC
+               correctMatrix[2][1] = 7.619657;         //TC
+               correctMatrix[3][1] = 12.852562;        //GC
+               
+               correctMatrix[2][2] = 0.000000;         //TT
+               correctMatrix[3][2] = 10.964048;        //TG
+               
+               correctMatrix[3][3] = 0.000000;         //GG
+               
+               for(int i=0;i<4;i++){
+                       for(int j=0;j<i;j++){
+                               correctMatrix[j][i] = correctMatrix[i][j];
+                       }
+               }
+               
+               int queryLength = query.size();
+               int refLength = ref.size();
+               
+               vector<vector<double> > alignMatrix(queryLength + 1);
+               vector<vector<char> > alignMoves(queryLength + 1);
+               
+               for(int i=0;i<=queryLength;i++){
+                       if (m->control_pressed) { return 0; }
+                       alignMatrix[i].resize(refLength + 1, 0);
+                       alignMoves[i].resize(refLength + 1, 'x');
+               }
+               
+               for(int i=0;i<=queryLength;i++){
+                       if (m->control_pressed) { return 0; }
+                       alignMatrix[i][0] = 15.0 * i;
+                       alignMoves[i][0] = 'u';
+               }
+               
+               for(int i=0;i<=refLength;i++){
+                       if (m->control_pressed) { return 0; }
+                       alignMatrix[0][i] = 15.0 * i;
+                       alignMoves[0][i] = 'l';
+               }
+               
+               for(int i=1;i<=queryLength;i++){
+                       if (m->control_pressed) { return 0; }
+                       for(int j=1;j<=refLength;j++){
+                               
+                               double nogap;           
+                               nogap = alignMatrix[i-1][j-1] + correctMatrix[query[i-1]][ref[j-1]];
+                               
+                               
+                               double gap;
+                               double left;
+                               if(i == queryLength){ //terminal gap
+                                       left = alignMatrix[i][j-1];
+                               }
+                               else{
+                                       if(ref[j-1] == getLastMatch('l', alignMoves, i, j, query, ref)){
+                                               gap = 4.0;
+                                       }
+                                       else{
+                                               gap = 15.0;
+                                       }
+                                       
+                                       left = alignMatrix[i][j-1] + gap;
+                               }
+                               
+                               
+                               double up;
+                               if(j == refLength){ //terminal gap
+                                       up = alignMatrix[i-1][j];
+                               }
+                               else{
+                                       
+                                       if(query[i-1] == getLastMatch('u', alignMoves, i, j, query, ref)){
+                                               gap = 4.0;
+                                       }
+                                       else{
+                                               gap = 15.0;
+                                       }
+                                       
+                                       up = alignMatrix[i-1][j] + gap;
+                               }
+                               
+                               
+                               
+                               if(nogap < left){
+                                       if(nogap < up){
+                                               alignMoves[i][j] = 'd';
+                                               alignMatrix[i][j] = nogap;
+                                       }
+                                       else{
+                                               alignMoves[i][j] = 'u';
+                                               alignMatrix[i][j] = up;
+                                       }
+                               }
+                               else{
+                                       if(left < up){
+                                               alignMoves[i][j] = 'l';
+                                               alignMatrix[i][j] = left;
+                                       }
+                                       else{
+                                               alignMoves[i][j] = 'u';
+                                               alignMatrix[i][j] = up;
+                                       }
+                               }
+                       }
+               }
+               
+               int i = queryLength;
+               int j = refLength;
+               int diffs = 0;
+               
+               //      string alignA = "";
+               //      string alignB = "";
+               //      string bases = "ACTG";
+               
+               while(i > 0 && j > 0){
+                       if (m->control_pressed) { return 0; }
+                       if(alignMoves[i][j] == 'd'){
+                               //                      alignA = bases[query[i-1]] + alignA;
+                               //                      alignB = bases[ref[j-1]] + alignB;
+                               
+                               if(query[i-1] != ref[j-1])      {       diffs++;        }
+                               
+                               i--;
+                               j--;
+                       }
+                       else if(alignMoves[i][j] == 'u'){
+                               if(j != refLength){
+                                       //                              alignA = bases[query[i-1]] + alignA;
+                                       //                              alignB = '-' + alignB;
+                                       
+                                       diffs++;
+                               }
+                               
+                               i--;
+                       }
+                       else if(alignMoves[i][j] == 'l'){
+                               if(i != queryLength){
+                                       //                              alignA = '-' + alignA;
+                                       //                              alignB = bases[ref[j-1]] + alignB;
+                                       
+                                       diffs++;
+                               }
+                               
+                               j--;
+                       }
+               }
+               
+               //      cout << diffs << endl;
+               //      cout << alignA << endl;
+               //      cout << alignB << endl;
+               //      cout << endl;
+               
+               return diffs;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "countDiffs");
+               exit(1);
+       }
+       
+}
+
+/**************************************************************************************************/
+
+vector<int> seqNoise::convertSeq(string bases){
+       try {
+               vector<int> numbers(bases.length(), -1);
+               
+               for(int i=0;i<bases.length();i++){
+                       if (m->control_pressed) { return numbers; }
+                       
+                       char b = bases[i];
+                       
+                       if(b == 'A')    {       numbers[i] = 0; }
+                       else if(b=='C') {       numbers[i] = 1; }
+                       else if(b=='T') {       numbers[i] = 2; }
+                       else if(b=='G') {       numbers[i] = 3; }
+                       else                    {       numbers[i] = 0; }
+               }
+               
+               return numbers;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "convertSeq");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+string seqNoise::degapSeq(string aligned){
+       try {
+               string unaligned = "";
+               
+               for(int i=0;i<aligned.length();i++){
+                       
+                       if (m->control_pressed) { return ""; }
+                       
+                       if(aligned[i] != '-' && aligned[i] != '.'){
+                               unaligned += aligned[i];
+                       }
+               }
+               
+               return unaligned;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "degapSeq");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int seqNoise::writeOutput(string fastaFileName, string namesFileName, string uMapFileName, vector<int> finalTau, vector<int> centroids, vector<int> otuData, vector<string> sequences, vector<string> uniqueNames, vector<string> redundantNames, vector<int> seqFreq, vector<double>& distances){
+       try {
+               int numOTUs = finalTau.size();
+               int numSeqs = uniqueNames.size();
+               
+               ofstream fastaFile(fastaFileName.c_str());
+               ofstream namesFile(namesFileName.c_str());
+               ofstream uMapFile(uMapFileName.c_str());
+               
+               vector<int> maxSequenceAbund(numOTUs, 0);
+               vector<int> maxSequenceIndex(numOTUs, 0);
+               
+               for(int i=0;i<numSeqs;i++){
+                       if (m->control_pressed) { return 0; }
+                       if(maxSequenceAbund[otuData[i]] < seqFreq[i]){
+                               maxSequenceAbund[otuData[i]] = seqFreq[i];
+                               maxSequenceIndex[otuData[i]] = i;
+                       }
+               }
+               
+               int count = 1;
+               
+               for(int i=0;i<numOTUs;i++){
+                       if (m->control_pressed) { return 0; }
+                       
+                       if(finalTau[i] > 0){
+                               
+                               if(maxSequenceIndex[i] != centroids[i] && distances[maxSequenceIndex[i]*numSeqs + centroids[i]] == 0){
+                                       //                              cout << uniqueNames[centroids[i]] << '\t' << uniqueNames[maxSequenceIndex[i]] << '\t' << count << endl;
+                                       centroids[i] = maxSequenceIndex[i];
+                               }
+                               
+                               int index = centroids[i];
+                               
+                               fastaFile << '>' << uniqueNames[index] << endl << sequences[index] << endl;
+                               namesFile << uniqueNames[index] << '\t';
+                               
+                               string refSeq = sequences[index];
+                               string redundantSeqs = redundantNames[index];;
+                               
+                               
+                               vector<freqData> frequencyData;
+                               
+                               for(int j=0;j<numSeqs;j++){
+                                       if(otuData[j] == i && j != index){
+                                               frequencyData.push_back(freqData(j, seqFreq[j]));
+                                       }
+                               }
+                               sort(frequencyData.rbegin(), frequencyData.rend());
+                               
+                               string refDegap = degapSeq(refSeq);
+                               vector<int> rUnalign = convertSeq(refDegap);
+                               
+                               uMapFile << "ideal_seq_" << count << '\t' << finalTau[i] << endl;
+                               uMapFile << uniqueNames[index] << '\t' << seqFreq[index] << "\t0\t" << refDegap << endl;
+                               
+                               
+                               for(int j=0;j<frequencyData.size();j++){
+                                       if (m->control_pressed) { return 0; }
+                                       redundantSeqs += ',' + redundantNames[frequencyData[j].index];
+                                       
+                                       uMapFile << uniqueNames[frequencyData[j].index] << '\t' << seqFreq[frequencyData[j].index] << '\t';
+                                       
+                                       string querySeq = sequences[frequencyData[j].index];
+                                       
+                                       string queryDegap = degapSeq(querySeq);
+                                       vector<int> qUnalign = convertSeq(queryDegap);
+                                       
+                                       int udiffs = countDiffs(qUnalign, rUnalign);
+                                       uMapFile << udiffs << '\t' << queryDegap << endl;
+                                       
+                               }                                       
+                               
+                               uMapFile << endl;
+                               namesFile << redundantSeqs << endl;
+                               count++;
+                               
+                       }
+               }
+               fastaFile.close();
+               namesFile.close();
+               uMapFile.close();
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "seqNoise", "writeOutput");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************
+
+int main(int argc, char *argv[]){
+       
+       double sigma = 100;
+       sigma = atof(argv[5]);
+       
+       double cutOff = 0.08;
+       int minIter = 10;
+       int maxIter = 1000;
+       double minDelta = 1e-6;
+       
+       string sequenceFileName = argv[1];
+       string fileNameStub = sequenceFileName.substr(0,sequenceFileName.find_last_of('.')) + ".shhh";
+       
+       vector<string> sequences;
+       getSequenceData(sequenceFileName, sequences);
+       
+       int numSeqs = sequences.size();
+       
+       vector<string> uniqueNames(numSeqs);
+       vector<string> redundantNames(numSeqs);
+       vector<int> seqFreq(numSeqs);
+       
+       string namesFileName = argv[4];
+       getRedundantNames(namesFileName, uniqueNames, redundantNames, seqFreq);
+       
+       string distFileName = argv[2];
+       vector<double> distances(numSeqs * numSeqs);
+       getDistanceData(distFileName, distances);
+       
+       string listFileName = argv[3];
+       vector<int> otuData(numSeqs);
+       vector<int> otuFreq;
+       vector<vector<int> > otuBySeqLookUp;
+       
+       getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
+       
+       int numOTUs = otuFreq.size();
+       
+       vector<double> weights(numOTUs, 0);
+       vector<int> change(numOTUs, 1);
+       vector<int> centroids(numOTUs, -1);
+       vector<int> cumCount(numOTUs, 0);
+       
+       vector<double> tau(numSeqs, 1);
+       vector<int> anP(numSeqs, 0);
+       vector<int> anI(numSeqs, 0);
+       vector<int> anN(numSeqs, 0);
+       vector<vector<int> > aanI = otuBySeqLookUp;
+       
+       int numIters = 0;
+       double maxDelta = 1e6;
+       
+       while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
+               
+               updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
+               maxDelta = calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau);
+               
+               calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
+               checkCentroids(weights, centroids);
+               
+               otuFreq.assign(numOTUs, 0);
+               
+               int total = 0;
+               
+               for(int i=0;i<numSeqs;i++){
+                       double offset = 1e6;
+                       double norm = 0.0000;
+                       double minWeight = MIN_WEIGHT;
+                       vector<double> currentTau(numOTUs);
+                       
+                       for(int j=0;j<numOTUs;j++){
+                               if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
+                                       offset = distances[i * numSeqs+centroids[j]];
+                               }
+                       }
+                       
+                       for(int j=0;j<numOTUs;j++){
+                               if(weights[j] > minWeight){
+                                       currentTau[j] = exp(sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
+                                       norm += currentTau[j];
+                               }
+                               else{
+                                       currentTau[j] = 0.0000;
+                               }
+                       }                       
+                       
+                       for(int j=0;j<numOTUs;j++){
+                               currentTau[j] /= norm;
+                       }
+                       
+                       for(int j=0;j<numOTUs;j++){
+                               
+                               if(currentTau[j] > MIN_TAU){
+                                       int oldTotal = total;
+                                       total++;
+                                       
+                                       tau.resize(oldTotal+1);
+                                       tau[oldTotal] = currentTau[j];
+                                       otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
+                                       aanI[j][otuFreq[j]] = i;
+                                       otuFreq[j]++;
+                                       
+                               }
+                       }
+                       
+                       anP.resize(total);
+                       anI.resize(total);
+               }
+               
+               numIters++;
+       }
+       
+       updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
+       
+       vector<double> percentage(numSeqs);
+       setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI);
+       finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau);
+       
+       change.assign(numOTUs, 1);
+       calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
+       
+       
+       vector<int> finalTau(numOTUs, 0);
+       for(int i=0;i<numSeqs;i++){
+               finalTau[otuData[i]] += int(seqFreq[i]);
+       }
+       
+       writeOutput(fileNameStub, finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
+       
+       return 0;
+}
+
+/**************************************************************************************************/
diff --git a/seqnoise.h b/seqnoise.h
new file mode 100644 (file)
index 0000000..d6760a3
--- /dev/null
@@ -0,0 +1,71 @@
+#ifndef SEQNOISE
+#define SEQNOISE
+
+
+
+/*
+ *  mySeqNoise.h
+ *  
+ *
+ *  Created by Pat Schloss on 8/31/11.
+ *  Copyright 2011 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+/*****************************************************************************************************************************/
+/*****************************************************************************************************************************/
+/* NOTE: Order matters in this class.  If you are going to use it, make sure your files have the sequences in the same order. */
+/*****************************************************************************************************************************/
+/*****************************************************************************************************************************/
+
+
+#include "mothurout.h"
+/**************************************************************************************************/
+
+struct freqData {
+       
+       freqData(int i, int freq) : frequency(freq), index(i){  }
+       
+       bool operator<( freqData const& rhs ) const {
+               return frequency < rhs.frequency; 
+       }
+       
+       int frequency;
+       int index;
+       
+};
+/**************************************************************************************************/
+
+class seqNoise {
+public:
+       seqNoise() { m = MothurOut::getInstance();  }
+       ~seqNoise(){}
+       
+       int getSequenceData(string, vector<string>&);
+       int addSeq(string, vector<string>&); 
+       int getRedundantNames(string, vector<string>&, vector<string>&, vector<int>&);
+       int addRedundantName(string, string, vector<string>&, vector<string>&, vector<int>&);
+    int getDistanceData(string, vector<double>&);
+       int getListData(string, double, vector<int>&, vector<int>&, vector<vector<int> >&);
+       int updateOTUCountData(vector<int>, vector<vector<int> >, vector<vector<int> >, vector<int>&, vector<int>&, vector<int>&);
+       double calcNewWeights(vector<double>&,vector<int>,vector<int>,vector<int>,vector<int>,vector<int>,vector<double>);
+       int calcCentroids(vector<int>,vector<int>,vector<int>&,vector<int>&,vector<int>,vector<double>,vector<int>,vector<int>,vector<double>);
+       int checkCentroids(vector<double>&, vector<int>);
+       int setUpOTUData(vector<int>&, vector<double>&, vector<int>, vector<double>, vector<int>, vector<int>, vector<int>);
+       int finishOTUData(vector<int>, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&, vector<double>&);
+       int writeOutput(string, string, string, vector<int>, vector<int>, vector<int>, vector<string>, vector<string>, vector<string>, vector<int>, vector<double>&);
+
+
+private:
+       MothurOut* m;
+       
+       int getLastMatch(char, vector<vector<char> >&, int, int, vector<int>&, vector<int>&);
+       int countDiffs(vector<int>, vector<int>);
+       vector<int> convertSeq(string);
+       string degapSeq(string);
+       
+};
+
+/**************************************************************************************************/
+#endif
+
index f42d24ba4081cd95c1aa2f89367054755f136807..4a0e5e8f09c59c18c419353cbecbdfecae2c1ba9 100644 (file)
@@ -72,7 +72,7 @@ SharedRAbundVector::SharedRAbundVector(ifstream& f) : DataVector(), maxRank(0),
                //are we at the beginning of the file??
                if (m->saveNextLabel == "") {  
                        f >> label; 
-                       
+       
                        //is this a shared file that has headers
                        if (label == "label") { 
                                //gets "group"
@@ -104,7 +104,6 @@ SharedRAbundVector::SharedRAbundVector(ifstream& f) : DataVector(), maxRank(0),
                
                //read in first row since you know there is at least 1 group.
                f >> groupN >> num;
-
                holdLabel = label;
                
                //add new vector to lookup
@@ -155,7 +154,6 @@ SharedRAbundVector::SharedRAbundVector(ifstream& f) : DataVector(), maxRank(0),
                }
                m->saveNextLabel = nextLabel;
                m->setAllGroups(allGroups);
-               
        }
        catch(exception& e) {
                m->errorOut(e, "SharedRAbundVector", "SharedRAbundVector");
diff --git a/shhhseqscommand.cpp b/shhhseqscommand.cpp
new file mode 100644 (file)
index 0000000..625b939
--- /dev/null
@@ -0,0 +1,681 @@
+/*
+ *  shhhseqscommand.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 11/8/11.
+ *  Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "shhhseqscommand.h"
+
+
+
+//**********************************************************************************************************************
+vector<string> ShhhSeqsCommand::setParameters(){       
+       try {
+               CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
+               CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
+               CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
+               CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
+               CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+               CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+               CommandParameter psigma("sigma", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(psigma);
+               
+               vector<string> myArray;
+               for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
+               return myArray;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "setParameters");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+string ShhhSeqsCommand::getHelpString(){       
+       try {
+               string helpString = "";
+               helpString += "The shhh.seqs command reads a fasta and name file and ....\n";
+               helpString += "The shhh.seqs command parameters are fasta, name, group, sigma and processors.\n";
+               helpString += "The fasta parameter allows you to enter the fasta file containing your potentially sequences, and is required, unless you have a valid current fasta file. \n";
+               helpString += "The name parameter allows you to provide a name file associated with your fasta file. It is required. \n";
+               helpString += "The group parameter allows you to provide a group file.  When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
+               helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
+               helpString += "The sigma parameter ....  The default is 0.01. \n";
+               helpString += "The shhh.seqs command should be in the following format: \n";
+               helpString += "shhh.seqs(fasta=yourFastaFile, name=yourNameFile) \n";
+               helpString += "Example: shhh.seqs(fasta=AD.align, name=AD.names) \n";
+               helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";       
+               return helpString;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "getHelpString");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+ShhhSeqsCommand::ShhhSeqsCommand(){    
+       try {
+               abort = true; calledHelp = true;
+               setParameters();
+               vector<string> tempOutNames;
+               outputTypes["fasta"] = tempOutNames;
+               outputTypes["name"] = tempOutNames;
+               outputTypes["map"] = tempOutNames;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "ShhhSeqsCommand");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+ShhhSeqsCommand::ShhhSeqsCommand(string option) {
+       try {
+               abort = false; calledHelp = false;   
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; calledHelp = true; }
+               else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+               
+               else {
+                       vector<string> myArray = setParameters();
+                       
+                       OptionParser parser(option);
+                       map<string, string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+                       map<string, string>::iterator it;
+                       
+                       //check to make sure all parameters are valid for command
+                       for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
+                               if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
+                       }
+                       
+                       //initialize outputTypes
+                       vector<string> tempOutNames;
+                       outputTypes["fasta"] = tempOutNames;
+                       outputTypes["name"] = tempOutNames;
+                       outputTypes["map"] = tempOutNames;
+                       
+                       //if the user changes the input directory command factory will send this info to us in the output parameter 
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);              
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                               string path;
+                               it = parameters.find("fasta");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
+                               }
+                               
+                               it = parameters.find("name");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["name"] = inputDir + it->second;             }
+                               }
+                               
+                               it = parameters.find("group");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["group"] = inputDir + it->second;            }
+                               }
+                       }
+                       
+                       //check for required parameters
+                       fastafile = validParameter.validFile(parameters, "fasta", true);
+                       if (fastafile == "not found") {                                 
+                               fastafile = m->getFastaFile(); 
+                               if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
+                               else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
+                       }
+                       else if (fastafile == "not open") { abort = true; }     
+                       else { m->setFastaFile(fastafile); }
+                       
+                       //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
+                       
+                       //check for optional parameter and set defaults
+                       // ...at some point should added some additional type checking...
+                       namefile = validParameter.validFile(parameters, "name", true);
+                       if (namefile == "not found") {                  
+                               namefile = m->getNameFile(); 
+                               if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
+                               else {  m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
+                       }
+                       else if (namefile == "not open") { namefile =  ""; abort = true; }      
+                       else {  m->setNameFile(namefile); }
+                       
+                       groupfile = validParameter.validFile(parameters, "group", true);
+                       if (groupfile == "not found") { groupfile =  "";   }
+                       else if (groupfile == "not open") { abort = true; groupfile =  ""; }    
+                       else {   m->setGroupFile(groupfile);  }
+                       
+                       string temp     = validParameter.validFile(parameters, "sigma", false);         if(temp == "not found"){        temp = "0.01"; }
+                       convert(temp, sigma); 
+                       
+                       temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
+                       m->setProcessors(temp);
+                       convert(temp, processors);
+               }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "ShhhSeqsCommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int ShhhSeqsCommand::execute() {
+       try {
+               
+               if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
+               
+               if (outputDir == "") { outputDir = m->hasPath(fastafile);  }//if user entered a file with a path then preserve it                               
+               string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.fasta";
+               string nameFileName = outputDir + m->getRootName(m->getSimpleName(fastafile))  + "shhh.names";
+               string mapFileName = outputDir + m->getRootName(m->getSimpleName(fastafile))  + "shhh.map";
+               
+               if (groupfile != "") {
+                       //Parse sequences by group
+                       SequenceParser parser(groupfile, fastafile, namefile);
+                       vector<string> groups = parser.getNamesOfGroups();
+                       
+                       if (m->control_pressed) {  return 0; }
+                       
+                       //clears files
+                       ofstream out, out1, out2;
+                       m->openOutputFile(outputFileName, out); out.close(); 
+                       m->openOutputFile(nameFileName, out1); out1.close();
+                       mapFileName = outputDir + m->getRootName(m->getSimpleName(fastafile))  + "shhh.";
+                       
+                       if(processors == 1)     {       driverGroups(parser, outputFileName, nameFileName, mapFileName, 0, groups.size(), groups);      }
+                       else                            {       createProcessesGroups(parser, outputFileName, nameFileName, mapFileName, groups);                       }
+                       
+                       if (m->control_pressed) {    return 0;  }                               
+                       
+                       //deconvolute results by running unique.seqs
+                       
+                       
+                       if (m->control_pressed) {   return 0;   }                               
+                       
+               }else{  
+                       vector<string> sequences;
+                       vector<string> uniqueNames;
+                       vector<string> redundantNames;
+                       vector<int> seqFreq;
+                       
+                       seqNoise noise;
+                       correctDist* correct = new correctDist(processors);
+                       
+                       //reads fasta and name file and loads them in order
+                       readData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq);
+                       if (m->control_pressed) { return 0; }
+                       
+                       //calc distances for cluster
+                       string distFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "shhh.dist";
+                       correct->execute(distFileName);
+                       delete correct;
+                       
+                       if (m->control_pressed) { m->mothurRemove(distFileName); return 0; }
+                       
+                       driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, outputFileName, nameFileName, mapFileName); 
+               }
+               
+               if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) {        m->mothurRemove(outputNames[j]);        } return 0; }
+               
+               outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
+               outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
+               outputNames.push_back(mapFileName); outputTypes["map"].push_back(mapFileName);
+               
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }       
+               m->mothurOutEndLine();
+               
+               //set accnos file as new current accnosfile
+               string current = "";
+               itTypes = outputTypes.find("fasta");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
+               }
+               
+               itTypes = outputTypes.find("name");
+               if (itTypes != outputTypes.end()) {
+                       if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
+               }
+               
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "execute");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int ShhhSeqsCommand::readData(correctDist* correct, seqNoise& noise, vector<string>& seqs, vector<string>& uNames, vector<string>& rNames, vector<int>& freq) {
+       try {
+               map<string, string> nameMap; 
+               map<string, string>::iterator it;
+               m->readNames(namefile, nameMap);
+               bool error = false;
+               
+               ifstream in;
+               m->openInputFile(fastafile, in);
+               
+               while (!in.eof()) {
+                       
+                       if (m->control_pressed) { in.close(); return 0; }
+                       
+                       Sequence seq(in); m->gobble(in);
+                       
+                       if (seq.getName() != "") {
+                               correct->addSeq(seq.getName(), seq.getAligned());
+                               
+                               it = nameMap.find(seq.getName());
+                               if (it != nameMap.end()) {
+                                       noise.addSeq(seq.getAligned(), seqs);
+                                       noise.addRedundantName(it->first, it->second, uNames, rNames, freq);
+                               }else {
+                                       m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your namefile, please correct.");
+                                       error = true;
+                               }
+                       }
+               }
+               in.close();
+               
+               if (error) { m->control_pressed = true; }
+               
+               return seqs.size();
+               
+       }catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "readData");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int ShhhSeqsCommand::loadData(correctDist* correct, seqNoise& noise, vector<string>& seqs, vector<string>& uNames, vector<string>& rNames, vector<int>& freq, map<string, string>& nameMap, vector<Sequence>& sequences) {
+       try {
+               bool error = false;
+               map<string, string>::iterator it;
+               
+               for (int i = 0; i < sequences.size(); i++) {
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       if (sequences[i].getName() != "") {
+                               correct->addSeq(sequences[i].getName(), sequences[i].getAligned());
+                               
+                               it = nameMap.find(sequences[i].getName());
+                               if (it != nameMap.end()) {
+                                       noise.addSeq(sequences[i].getAligned(), seqs);
+                                       noise.addRedundantName(it->first, it->second, uNames, rNames, freq);
+                               }else {
+                                       m->mothurOut("[ERROR]: " + sequences[i].getName() + " is in your fasta file and not in your namefile, please correct.");
+                                       error = true;
+                               }
+                       }
+               }
+                               
+               if (error) { m->control_pressed = true; }
+               
+               return seqs.size();
+               
+       }catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "loadData");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int ShhhSeqsCommand::createProcessesGroups(SequenceParser& parser, string newFName, string newNName, string newMName, vector<string> groups) {
+       try {
+               
+               vector<int> processIDS;
+               int process = 1;
+               
+               //sanity check
+               if (groups.size() < processors) { processors = groups.size(); }
+               
+               //divide the groups between the processors
+               vector<linePair> lines;
+               int numGroupsPerProcessor = groups.size() / processors;
+               for (int i = 0; i < processors; i++) {
+                       int startIndex =  i * numGroupsPerProcessor;
+                       int endIndex = (i+1) * numGroupsPerProcessor;
+                       if(i == (processors - 1)){      endIndex = groups.size();       }
+                       lines.push_back(linePair(startIndex, endIndex));
+               }
+               
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)          
+               
+               //loop through and create all the processes you want
+               while (process != processors) {
+                       int pid = fork();
+                       
+                       if (pid > 0) {
+                               processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
+                               process++;
+                       }else if (pid == 0){
+                               driverGroups(parser, newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMName, lines[process].start, lines[process].end, groups);
+                               exit(0);
+                       }else { 
+                               m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
+                               for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+                               exit(0);
+                       }
+               }
+               
+               //do my part
+               driverGroups(parser, newFName, newNName, newMName, lines[0].start, lines[0].end, groups);
+               
+               //force parent to wait until all the processes are done
+               for (int i=0;i<processIDS.size();i++) { 
+                       int temp = processIDS[i];
+                       wait(&temp);
+               }
+               
+#else
+               
+               //////////////////////////////////////////////////////////////////////////////////////////////////////
+               //Windows version shared memory, so be careful when passing variables through the shhhseqsData struct. 
+               //Above fork() will clone, so memory is separate, but that's not the case with windows, 
+               //////////////////////////////////////////////////////////////////////////////////////////////////////
+               
+               vector<shhhseqsData*> pDataArray; 
+               DWORD   dwThreadIdArray[processors-1];
+               HANDLE  hThreadArray[processors-1]; 
+               
+               //Create processor worker threads.
+               for( int i=1; i<processors; i++ ){
+                       // Allocate memory for thread data.
+                       string extension = toString(i) + ".temp";
+                       
+                       shhhseqsData* tempShhhseqs = new shhhseqsData(fastafile, namefile, groupfile, (newFName+extension), (newNName+extension), newMName, groups, m, lines[i].start, lines[i].end, sigma, i);
+                       pDataArray.push_back(tempShhhseqs);
+                       processIDS.push_back(i);
+                       
+                       //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+                       //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+                       hThreadArray[i-1] = CreateThread(NULL, 0, MyShhhSeqsThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);   
+               }
+               
+               
+               //using the main process as a worker saves time and memory
+               driverGroups(parser, newFName, newNName, newMName, lines[0].start, lines[0].end, groups);
+               
+               //Wait until all threads have terminated.
+               WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+               
+               //Close all thread handles and free memory allocations.
+               for(int i=0; i < pDataArray.size(); i++){
+                       CloseHandle(hThreadArray[i]);
+                       delete pDataArray[i];
+               }
+               
+#endif         
+               
+               //append output files
+               for(int i=0;i<processIDS.size();i++){
+                       m->appendFiles((newFName + toString(processIDS[i]) + ".temp"), newFName);
+                       m->mothurRemove((newFName + toString(processIDS[i]) + ".temp"));
+                       
+                       m->appendFiles((newNName + toString(processIDS[i]) + ".temp"), newNName);
+                       m->mothurRemove((newNName + toString(processIDS[i]) + ".temp"));
+               }
+               
+               return 0;       
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "createProcessesGroups");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int ShhhSeqsCommand::driverGroups(SequenceParser& parser, string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
+       try {
+               
+               for (int i = start; i < end; i++) {
+                       
+                       start = time(NULL);
+                       
+                       if (m->control_pressed) {  return 0; }
+                       
+                       m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
+                       
+                       map<string, string> thisNameMap;
+                       thisNameMap = parser.getNameMap(groups[i]); 
+                       vector<Sequence> thisSeqs = parser.getSeqs(groups[i]);
+                       
+                       vector<string> sequences;
+                       vector<string> uniqueNames;
+                       vector<string> redundantNames;
+                       vector<int> seqFreq;
+                       
+                       seqNoise noise;
+                       correctDist* correct = new correctDist(1); //we use one processor since we already split up the work load.
+                       
+                       //load this groups info in order
+                       loadData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq, thisNameMap, thisSeqs);
+                       if (m->control_pressed) { return 0; }
+                       
+                       //calc distances for cluster
+                       string distFileName = outputDir + m->getRootName(m->getSimpleName(fastafile)) + groups[i] + ".shhh.dist";
+                       correct->execute(distFileName);
+                       delete correct;
+                       
+                       if (m->control_pressed) { m->mothurRemove(distFileName); return 0; }
+                       
+                       driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, newFFile+groups[i], newNFile+groups[i], newMFile+groups[i]+".map"); 
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       m->appendFiles(newFFile+groups[i], newFFile); m->mothurRemove(newFFile+groups[i]);
+                       m->appendFiles(newNFile+groups[i], newNFile); m->mothurRemove(newNFile+groups[i]);
+                       
+                       m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process group " + groups[i] + "."); m->mothurOutEndLine(); 
+               }
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "driverGroups");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int ShhhSeqsCommand::driver(seqNoise& noise, 
+                                                       vector<string>& sequences, 
+                                                       vector<string>& uniqueNames, 
+                                                       vector<string>& redundantNames, 
+                                                       vector<int>& seqFreq, 
+                                                       string distFileName, string outputFileName, string nameFileName, string mapFileName) {
+       try {
+               double cutOff = 0.08;
+               int minIter = 10;
+               int maxIter = 1000;
+               double minDelta = 1e-6;
+               int numIters = 0;
+               double maxDelta = 1e6;
+               int numSeqs = sequences.size();
+                               
+               //run cluster command
+               string inputString = "phylip=" + distFileName + ", method=furthest, cutoff=0.08";
+               m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+               m->mothurOut("Running command: cluster(" + inputString + ")"); m->mothurOutEndLine(); 
+               
+               Command* clusterCommand = new ClusterCommand(inputString);
+               clusterCommand->execute();
+               
+               map<string, vector<string> > filenames = clusterCommand->getOutputFiles();
+               string listFileName = filenames["list"][0];
+               string rabundFileName = filenames["rabund"][0]; m->mothurRemove(rabundFileName);
+               string sabundFileName = filenames["sabund"][0]; m->mothurRemove(sabundFileName);
+       
+               delete clusterCommand;
+               m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+               
+               if (m->control_pressed) { m->mothurRemove(distFileName); m->mothurRemove(listFileName); return 0; }
+               
+               vector<double> distances(numSeqs * numSeqs);
+               noise.getDistanceData(distFileName, distances);
+               m->mothurRemove(distFileName); 
+               if (m->control_pressed) { m->mothurRemove(listFileName); return 0; }
+               
+               vector<int> otuData(numSeqs);
+               vector<int> otuFreq;
+               vector<vector<int> > otuBySeqLookUp;
+               noise.getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
+               m->mothurRemove(listFileName);
+               if (m->control_pressed) { return 0; }
+               
+               int numOTUs = otuFreq.size();
+               
+               vector<double> weights(numOTUs, 0);
+               vector<int> change(numOTUs, 1);
+               vector<int> centroids(numOTUs, -1);
+               vector<int> cumCount(numOTUs, 0);
+               
+               vector<double> tau(numSeqs, 1);
+               vector<int> anP(numSeqs, 0);
+               vector<int> anI(numSeqs, 0);
+               vector<int> anN(numSeqs, 0);
+               vector<vector<int> > aanI = otuBySeqLookUp;
+               
+               while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (m->control_pressed) { return 0; }
+                       maxDelta = noise.calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau);  if (m->control_pressed) { return 0; }
+                       
+                       noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (m->control_pressed) { return 0; }
+                       noise.checkCentroids(weights, centroids); if (m->control_pressed) { return 0; }
+                        
+                       otuFreq.assign(numOTUs, 0);
+                       
+                       int total = 0;
+                       
+                       for(int i=0;i<numSeqs;i++){
+                               if (m->control_pressed) { return 0; }
+                               
+                               double offset = 1e6;
+                               double norm = 0.0000;
+                               double minWeight = 0.1;
+                               vector<double> currentTau(numOTUs);
+                               
+                               for(int j=0;j<numOTUs;j++){
+                                       if (m->control_pressed) { return 0; }
+                                       if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
+                                               offset = distances[i * numSeqs+centroids[j]];
+                                       }
+                               }
+                               
+                               for(int j=0;j<numOTUs;j++){
+                                       if (m->control_pressed) { return 0; }
+                                       if(weights[j] > minWeight){
+                                               currentTau[j] = exp(sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
+                                               norm += currentTau[j];
+                                       }
+                                       else{
+                                               currentTau[j] = 0.0000;
+                                       }
+                               }                       
+                               
+                               for(int j=0;j<numOTUs;j++){
+                                       if (m->control_pressed) { return 0; }
+                                       currentTau[j] /= norm;
+                               }
+                               
+                               for(int j=0;j<numOTUs;j++){
+                                       if (m->control_pressed) { return 0; }
+                                       
+                                       if(currentTau[j] > 1.0e-4){
+                                               int oldTotal = total;
+                                               total++;
+                                               
+                                               tau.resize(oldTotal+1);
+                                               tau[oldTotal] = currentTau[j];
+                                               otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
+                                               aanI[j][otuFreq[j]] = i;
+                                               otuFreq[j]++;
+                                               
+                                       }
+                               }
+                               
+                               anP.resize(total);
+                               anI.resize(total);
+                       }
+                       
+                       numIters++;
+               }
+               
+               noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);  if (m->control_pressed) { return 0; }
+               
+               vector<double> percentage(numSeqs);
+               noise.setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI);  if (m->control_pressed) { return 0; }
+               noise.finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau);  if (m->control_pressed) { return 0; }
+               
+               change.assign(numOTUs, 1);
+               noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (m->control_pressed) { return 0; }
+               
+               
+               vector<int> finalTau(numOTUs, 0);
+               for(int i=0;i<numSeqs;i++){
+                       if (m->control_pressed) { return 0; }
+                       finalTau[otuData[i]] += int(seqFreq[i]);
+               }
+               
+               noise.writeOutput(outputFileName, nameFileName, mapFileName, finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
+               
+               return 0;
+               
+       }catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "driver");
+               exit(1);
+       }
+}      
+//**********************************************************************************************************************
+int ShhhSeqsCommand::deconvoluteResults(string fastaFile, string nameFile){
+       try {
+               m->mothurOutEndLine(); m->mothurOut("Deconvoluting results:"); m->mothurOutEndLine(); m->mothurOutEndLine();
+               
+               //use unique.seqs to create new name and fastafile
+               string inputString = "fasta=" + fastaFile + ", name=" + nameFile;
+               m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+               m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
+               
+               Command* uniqueCommand = new DeconvoluteCommand(inputString);
+               uniqueCommand->execute();
+               
+               map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
+               
+               delete uniqueCommand;
+               
+               m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+               
+               string newnameFile = filenames["name"][0];
+               string newfastaFile = filenames["fasta"][0];
+               
+               m->mothurRemove(fastaFile); rename(newfastaFile.c_str(), fastaFile.c_str()); 
+               m->mothurRemove(nameFile); rename(newnameFile.c_str(), nameFile.c_str()); 
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ShhhSeqsCommand", "deconvoluteResults");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+
+
diff --git a/shhhseqscommand.h b/shhhseqscommand.h
new file mode 100644 (file)
index 0000000..5aceca8
--- /dev/null
@@ -0,0 +1,332 @@
+#ifndef SHHHSEQSCOMMAND_H
+#define SHHHSEQSCOMMAND_H
+
+/*
+ *  shhhseqscommand.h
+ *  Mothur
+ *
+ *  Created by westcott on 11/8/11.
+ *  Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "command.hpp"
+#include "myseqdist.h"
+#include "seqnoise.h"
+#include "sequenceparser.h"
+#include "deconvolutecommand.h"
+#include "clustercommand.h"
+
+//**********************************************************************************************************************
+
+class ShhhSeqsCommand : public Command {
+       
+public:
+       ShhhSeqsCommand(string);
+       ShhhSeqsCommand();
+       ~ShhhSeqsCommand() {}
+       
+       vector<string> setParameters();
+       string getCommandName()                 { return "shhh.seqs";   }
+       string getCommandCategory()             { return "Sequence Processing";         }
+       string getHelpString(); 
+       string getCitation() { return "http://www.mothur.org/wiki/Shhh.seqs"; }
+       string getDescription()         { return "shhh.seqs"; }
+       
+       
+       int execute(); 
+       void help() { m->mothurOut(getHelpString()); }          
+       
+private:
+       
+       struct linePair {
+               int start;
+               int end;
+               linePair(int i, int j) : start(i), end(j) {}
+       };
+       
+       bool abort;
+       string outputDir, fastafile, namefile, groupfile;
+       int processors;
+       double sigma;
+       vector<string> outputNames;
+       
+       int readData(correctDist*, seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&);
+       int loadData(correctDist*, seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&, map<string, string>&, vector<Sequence>&);
+
+       int driver(seqNoise&, vector<string>&, vector<string>&, vector<string>&, vector<int>&, string, string, string, string);
+       int driverGroups(SequenceParser&, string, string, string, int, int, vector<string>);
+       int createProcessesGroups(SequenceParser&, string, string, string, vector<string>);
+       int deconvoluteResults(string, string);
+
+
+
+};
+
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct shhhseqsData {
+       string fastafile; 
+       string namefile; 
+       string groupfile;
+       string newFName, newNName, newMName;
+       MothurOut* m;
+       int start;
+       int end;
+       int sigma, threadID;
+       vector<string> groups;
+       
+       shhhseqsData(){}
+       shhhseqsData(string f, string n, string g, string nff,  string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int s, int tid) {
+               fastafile = f;
+               namefile = n;
+               groupfile = g;
+               newFName = nff;
+               newNName = nnf;
+               newNName = nmf;
+               m = mout;
+               start = st;
+               end = en;
+               sigma = s;
+               threadID = tid;
+               groups = gr;
+       }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyShhhSeqsThreadFunction(LPVOID lpParam){ 
+       shhhseqsData* pDataArray;
+       pDataArray = (shhhseqsData*)lpParam;
+       
+       try {
+               
+               //parse fasta and name file by group
+               SequenceParser parser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);
+                                               
+               //precluster each group
+               for (int k = pDataArray->start; k < pDataArray->end; k++) {
+                       
+                       int start = time(NULL);
+                       
+                       if (pDataArray->m->control_pressed) {  return 0; }
+                       
+                       pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Processing group " + pDataArray->groups[k] + ":"); pDataArray->m->mothurOutEndLine();
+                       
+                       map<string, string> thisNameMap;
+                       thisNameMap = parser.getNameMap(pDataArray->groups[k]); 
+                       vector<Sequence> thisSeqs = parser.getSeqs(pDataArray->groups[k]);
+                       
+                       if (pDataArray->m->control_pressed) {  return 0; }
+                       
+                       vector<string> sequences;
+                       vector<string> uniqueNames;
+                       vector<string> redundantNames;
+                       vector<int> seqFreq;
+                       
+                       seqNoise noise;
+                       correctDist* correct = new correctDist(1); //we use one processor since we already split up the work load.
+                       
+                       //load this groups info in order
+                       //loadData(correct, noise, sequences, uniqueNames, redundantNames, seqFreq, thisNameMap, thisSeqs);
+                       //////////////////////////////////////////////////////////////////////////////////////////////////
+                       bool error = false;
+                       map<string, string>::iterator it;
+                       
+                       for (int i = 0; i < thisSeqs.size(); i++) {
+                               
+                               if (pDataArray->m->control_pressed) { return 0; }
+                               
+                               if (thisSeqs[i].getName() != "") {
+                                       correct->addSeq(thisSeqs[i].getName(), thisSeqs[i].getAligned());
+                                       
+                                       it = thisNameMap.find(thisSeqs[i].getName());
+                                       if (it != thisNameMap.end()) {
+                                               noise.addSeq(thisSeqs[i].getAligned(), sequences);
+                                               noise.addRedundantName(it->first, it->second, uniqueNames, redundantNames, seqFreq);
+                                       }else {
+                                               pDataArray->m->mothurOut("[ERROR]: " + thisSeqs[i].getName() + " is in your fasta file and not in your namefile, please correct.");
+                                               error = true;
+                                       }
+                               }
+                       }
+                       
+                       if (error) { return 0; }
+                       //////////////////////////////////////////////////////////////////////////////////////////////////
+                       
+                       if (pDataArray->m->control_pressed) { return 0; }
+                       
+                       //calc distances for cluster
+                       string distFileName = pDataArray->m->getRootName(pDataArray->m->getSimpleName(pDataArray->fastafile)) + pDataArray->groups[k] + ".shhh.dist";
+                       correct->execute(distFileName);
+                       delete correct;
+                       
+                       if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); return 0; }
+                       
+                       //driver(noise, sequences, uniqueNames, redundantNames, seqFreq, distFileName, newFFile+groups[i], newNFile+groups[i], newMFile+groups[i]+".map"); 
+                       ///////////////////////////////////////////////////////////////////////////////////////////////////
+                       double cutOff = 0.08;
+                       int minIter = 10;
+                       int maxIter = 1000;
+                       double minDelta = 1e-6;
+                       int numIters = 0;
+                       double maxDelta = 1e6;
+                       int numSeqs = sequences.size();
+                       
+                       //run cluster command
+                       string inputString = "phylip=" + distFileName + ", method=furthest, cutoff=0.08";
+                       pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine(); 
+                       pDataArray->m->mothurOut("Running command: cluster(" + inputString + ")"); pDataArray->m->mothurOutEndLine(); 
+                       
+                       Command* clusterCommand = new ClusterCommand(inputString);
+                       clusterCommand->execute();
+                       
+                       map<string, vector<string> > filenames = clusterCommand->getOutputFiles();
+                       string listFileName = filenames["list"][0];
+                       string rabundFileName = filenames["rabund"][0]; pDataArray->m->mothurRemove(rabundFileName);
+                       string sabundFileName = filenames["sabund"][0]; pDataArray->m->mothurRemove(sabundFileName);
+                       
+                       delete clusterCommand;
+                       pDataArray->m->mothurOut("/******************************************/"); pDataArray->m->mothurOutEndLine(); 
+                       
+                       if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(distFileName); pDataArray->m->mothurRemove(listFileName); return 0; }
+                       
+                       vector<double> distances(numSeqs * numSeqs);
+                       noise.getDistanceData(distFileName, distances);
+                       pDataArray->m->mothurRemove(distFileName); 
+                       if (pDataArray->m->control_pressed) { pDataArray->m->mothurRemove(listFileName); return 0; }
+                       
+                       vector<int> otuData(numSeqs);
+                       vector<int> otuFreq;
+                       vector<vector<int> > otuBySeqLookUp;
+                       noise.getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
+                       pDataArray->m->mothurRemove(listFileName);
+                       if (pDataArray->m->control_pressed) { return 0; }
+                       
+                       int numOTUs = otuFreq.size();
+                       
+                       vector<double> weights(numOTUs, 0);
+                       vector<int> change(numOTUs, 1);
+                       vector<int> centroids(numOTUs, -1);
+                       vector<int> cumCount(numOTUs, 0);
+                       
+                       vector<double> tau(numSeqs, 1);
+                       vector<int> anP(numSeqs, 0);
+                       vector<int> anI(numSeqs, 0);
+                       vector<int> anN(numSeqs, 0);
+                       vector<vector<int> > aanI = otuBySeqLookUp;
+                       
+                       while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
+                               
+                               if (pDataArray->m->control_pressed) { return 0; }
+                               
+                               noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount); if (pDataArray->m->control_pressed) { return 0; }
+                               maxDelta = noise.calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau);  if (pDataArray->m->control_pressed) { return 0; }
+                               
+                               noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
+                               noise.checkCentroids(weights, centroids); if (pDataArray->m->control_pressed) { return 0; }
+                               
+                               otuFreq.assign(numOTUs, 0);
+                               
+                               int total = 0;
+                               
+                               for(int i=0;i<numSeqs;i++){
+                                       if (pDataArray->m->control_pressed) { return 0; }
+                                       
+                                       double offset = 1e6;
+                                       double norm = 0.0000;
+                                       double minWeight = 0.1;
+                                       vector<double> currentTau(numOTUs);
+                                       
+                                       for(int j=0;j<numOTUs;j++){
+                                               if (pDataArray->m->control_pressed) { return 0; }
+                                               if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
+                                                       offset = distances[i * numSeqs+centroids[j]];
+                                               }
+                                       }
+                                       
+                                       for(int j=0;j<numOTUs;j++){
+                                               if (pDataArray->m->control_pressed) { return 0; }
+                                               if(weights[j] > minWeight){
+                                                       currentTau[j] = exp(pDataArray->sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
+                                                       norm += currentTau[j];
+                                               }
+                                               else{
+                                                       currentTau[j] = 0.0000;
+                                               }
+                                       }                       
+                                       
+                                       for(int j=0;j<numOTUs;j++){
+                                               if (pDataArray->m->control_pressed) { return 0; }
+                                               currentTau[j] /= norm;
+                                       }
+                                       
+                                       for(int j=0;j<numOTUs;j++){
+                                               if (pDataArray->m->control_pressed) { return 0; }
+                                               
+                                               if(currentTau[j] > 1.0e-4){
+                                                       int oldTotal = total;
+                                                       total++;
+                                                       
+                                                       tau.resize(oldTotal+1);
+                                                       tau[oldTotal] = currentTau[j];
+                                                       otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
+                                                       aanI[j][otuFreq[j]] = i;
+                                                       otuFreq[j]++;
+                                                       
+                                               }
+                                       }
+                                       
+                                       anP.resize(total);
+                                       anI.resize(total);
+                               }
+                               
+                               numIters++;
+                       }
+                       
+                       noise.updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);  if (pDataArray->m->control_pressed) { return 0; }
+                       
+                       vector<double> percentage(numSeqs);
+                       noise.setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI);  if (pDataArray->m->control_pressed) { return 0; }
+                       noise.finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau);  if (pDataArray->m->control_pressed) { return 0; }
+                       
+                       change.assign(numOTUs, 1);
+                       noise.calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau); if (pDataArray->m->control_pressed) { return 0; }
+                       
+                       
+                       vector<int> finalTau(numOTUs, 0);
+                       for(int i=0;i<numSeqs;i++){
+                               if (pDataArray->m->control_pressed) { return 0; }
+                               finalTau[otuData[i]] += int(seqFreq[i]);
+                       }
+                       
+                       noise.writeOutput(pDataArray->newFName+pDataArray->groups[k], pDataArray->newNName+pDataArray->groups[k], pDataArray->newMName+pDataArray->groups[k]+".map", finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
+                       
+                       ///////////////////////////////////////////////////////////////////////////////////////////////////
+                       
+                       if (pDataArray->m->control_pressed) { return 0; }
+                       
+                       pDataArray->m->appendFiles(pDataArray->newFName+pDataArray->groups[k], pDataArray->newFName); pDataArray->m->mothurRemove(pDataArray->newFName+pDataArray->groups[k]);
+                       pDataArray->m->appendFiles(pDataArray->newNName+pDataArray->groups[k], pDataArray->newNName); pDataArray->m->mothurRemove(pDataArray->newNName+pDataArray->groups[k]);
+                       
+                       pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to process group " + pDataArray->groups[k] + "."); pDataArray->m->mothurOutEndLine(); 
+                       
+                                                       
+               }
+               
+               return 0;
+               
+               
+       }
+       catch(exception& e) {
+               pDataArray->m->errorOut(e, "PreClusterCommand", "MyPreclusterThreadFunction");
+               exit(1);
+       }
+} 
+#endif
+/**************************************************************************************************/
+
+#endif
index 6d697359bb2c26c4957faa9d981fb7399ac54313..7a1af627a5e290c6d348e4a3783c645ba575982c 100644 (file)
@@ -272,32 +272,34 @@ int TrimFlowsCommand::execute(){
                ofstream output;
                
                if(allFiles){
-                       
+                       set<string> namesAlreadyProcessed;
                        flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
                        m->openOutputFile(flowFilesFileName, output);
 
                        for(int i=0;i<barcodePrimerComboFileNames.size();i++){
                                for(int j=0;j<barcodePrimerComboFileNames[0].size();j++){
-                                       
-                                       FILE * pFile;
-                                       unsigned long long size;
-                                       
-                                       //get num bytes in file
-                                       pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
-                                       if (pFile==NULL) perror ("Error opening file");
-                                       else{
-                                               fseek (pFile, 0, SEEK_END);
-                                               size=ftell(pFile);
-                                               fclose (pFile);
-                                       }
-
-                                       if(size < 10){
-                                               m->mothurRemove(barcodePrimerComboFileNames[i][j]);
-                                       }
-                                       else{
-                                               output << barcodePrimerComboFileNames[i][j] << endl;
-                                               outputNames.push_back(barcodePrimerComboFileNames[i][j]);
-                                               outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]);
+                                       if (namesAlreadyProcessed.count(barcodePrimerComboFileNames[i][j]) == 0) {
+                                               FILE * pFile;
+                                               unsigned long long size;
+                                               
+                                               //get num bytes in file
+                                               pFile = fopen (barcodePrimerComboFileNames[i][j].c_str(),"rb");
+                                               if (pFile==NULL) perror ("Error opening file");
+                                               else{
+                                                       fseek (pFile, 0, SEEK_END);
+                                                       size=ftell(pFile);
+                                                       fclose (pFile);
+                                               }
+                                               
+                                               if(size < 10){
+                                                       m->mothurRemove(barcodePrimerComboFileNames[i][j]);
+                                               }
+                                               else{
+                                                       output << barcodePrimerComboFileNames[i][j] << endl;
+                                                       outputNames.push_back(barcodePrimerComboFileNames[i][j]);
+                                                       outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]);
+                                               }
+                                               namesAlreadyProcessed.insert(barcodePrimerComboFileNames[i][j]);
                                        }
                                }
                        }